Skip to content

Commit

Permalink
added mlflow
Browse files Browse the repository at this point in the history
  • Loading branch information
5uperpalo committed May 17, 2024
1 parent d982423 commit 1d2f93b
Show file tree
Hide file tree
Showing 23 changed files with 3,923 additions and 1,023 deletions.
54 changes: 28 additions & 26 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,29 +58,31 @@ jobs:
name: coverage${{ matrix.python-version }}
path: .coverage

# Job that combines the coverage artifacts uploaded by the `test` job
# (one per matrix Python version) and uploads the merged report to Codecov.
finish:
# Depends on the `test` job.
needs: test
runs-on: ubuntu-latest
# Skipped for draft pull requests; runs on pushes and non-draft events.
if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install coverage
- name: Download all artifacts
# Downloads coverage1, coverage2, etc.
uses: actions/download-artifact@v4
- name: Convert coverage
run: |
coverage combine coverage*/.coverage*
# coverage report --fail-under=95
coverage xml
- name: upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
# Make the workflow fail when the Codecov upload reports an error.
fail_ci_if_error: true
# The `finish` job below is commented out because of a Codecov token issue:
# "Error: Codecov token not found. Please provide Codecov token with -t flag."
# Possibly related to https://github.com/codecov/codecov-action/issues/1292
# finish:
# needs: test
# runs-on: ubuntu-latest
# if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
# steps:
# - uses: actions/checkout@v4
# - name: Set up Python 3.10
# uses: actions/setup-python@v5
# with:
# python-version: "3.10"
# - name: Install dependencies
# run: |
# python -m pip install --upgrade pip
# python -m pip install coverage
# - name: Download all artifacts
# # Downloads coverage1, coverage2, etc.
# uses: actions/download-artifact@v4
# - name: Convert coverage
# run: |
# coverage combine coverage*/.coverage*
# # coverage report --fail-under=95
# coverage xml
# - name: upload coverage to Codecov
# uses: codecov/codecov-action@v4
# with:
# fail_ci_if_error: true
4 changes: 4 additions & 0 deletions churn_pred/preprocessing/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:

dfc = df.drop(columns=self.id_cols).copy()

# added as mlflow inference is receiving all the data as objects
dfc[self.cat_cols] = dfc[self.cat_cols].astype(str)
dfc[self.final_cont_cols] = dfc[self.final_cont_cols].astype(float)

try:
dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]
except KeyError:
Expand Down
12 changes: 10 additions & 2 deletions churn_pred/training/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Any, Dict, List, Tuple, Union, Literal, Callable, Optional

import numpy as np
import mlflow
import pandas as pd
import lightgbm as lgb
from lightgbm import Dataset as lgbDataset
Expand Down Expand Up @@ -54,7 +55,7 @@ def __init__(
)


class BaseTrainer(Base):
class BaseTrainer(Base, mlflow.pyfunc.PythonModel):
def __init__(
self,
cat_cols: List[str],
Expand Down Expand Up @@ -124,15 +125,22 @@ def fit(
"""
raise NotImplementedError("Trainer must implement a 'fit' method")

def predict(self, df: pd.DataFrame, raw_score: bool = True) -> pd.DataFrame:
def predict(
self, context: Optional[dict], df: pd.DataFrame, raw_score: bool = True
) -> pd.DataFrame:
"""Predict.
Args:
df (pd.DataFrame): dataset
raw_score (bool): whether to return raw output
context (dict): for compatibility with mlflow model methods
Returns:
preds_raw (np.ndarray):
"""
# for mlflow inference service testing
if type(df) is dict:
df = pd.DataFrame.from_dict(df, orient="index").transpose()

if self.preprocessors:
for prep in self.preprocessors:
df = prep.transform(df)
Expand Down
39 changes: 39 additions & 0 deletions churn_pred/training/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
from typing import List, Tuple, Literal, Optional
from collections.abc import MutableMapping

import numpy as np
import mlflow
import pandas as pd
import lightgbm as lgb
from lightgbm import Dataset as lgbDataset


def flatten_dict(
    d: MutableMapping, parent_key: str = "", sep: str = "_"
) -> MutableMapping:
    """Flatten a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with ``sep``,
    e.g. ``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``. Nested mappings that
    are empty contribute no entries to the result.

    Recursive approach chosen as the fastest according to
    https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/

    Args:
        d (MutableMapping): possibly nested mapping to flatten
        parent_key (str): prefix prepended to every key; empty means no prefix
        sep (str): separator placed between parent and child keys
    Returns:
        flat (dict): new single-level dictionary; ``d`` is not modified
    """
    items = []
    for k, v in d.items():
        # Use string formatting rather than ``+`` so that non-string nested
        # keys (e.g. ints) are joined instead of raising TypeError.
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse with the accumulated key as the new prefix.
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


def get_or_create_experiment(experiment_name):
    """Return the ID of the MLflow experiment with the given name.

    The experiment is looked up by name first; when the lookup yields
    nothing, a new experiment is created under that name instead.

    Args:
        experiment_name (str): name of the MLflow experiment
    Returns:
        experiment_id (str): ID of the existing or newly created experiment
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if not existing:
        # Nothing registered under this name yet, so register it now.
        return mlflow.create_experiment(experiment_name)
    return existing.experiment_id


def to_lgbdataset(
train: pd.DataFrame,
cat_cols: List[str],
Expand Down
Loading

0 comments on commit 1d2f93b

Please sign in to comment.