added mlflow

5uperpalo · May 17, 2024 · 1d2f93b · 1d2f93b
1 parent d982423
commit 1d2f93b
Show file tree

Hide file tree

Showing 23 changed files with 3,923 additions and 1,023 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -58,29 +58,31 @@ jobs:
         name: coverage${{ matrix.python-version }}
         path: .coverage
 
-  finish: 
-    needs: test
-    runs-on: ubuntu-latest
-    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python 3.10
-      uses: actions/setup-python@v5
-      with:
-        python-version: "3.10"
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        python -m pip install coverage
-    - name: Download all artifacts
-      # Downloads coverage1, coverage2, etc.
-      uses: actions/download-artifact@v4
-    - name: Convert coverage
-      run: |
-        coverage combine coverage*/.coverage*
-        # coverage report --fail-under=95
-        coverage xml
-    - name: upload coverage to Codecov
-      uses: codecov/codecov-action@v4
-      with:
-        fail_ci_if_error: true
+  # Disabled for now: the Codecov upload fails with a token issue,
+  # "Error: Codecov token not found. Please provide Codecov token with -t flag."
+  # Possibly connected to https://github.com/codecov/codecov-action/issues/1292
+  # finish: 
+  #   needs: test
+  #   runs-on: ubuntu-latest
+  #   if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
+  #   steps:
+  #   - uses: actions/checkout@v4
+  #   - name: Set up Python 3.10
+  #     uses: actions/setup-python@v5
+  #     with:
+  #       python-version: "3.10"
+  #   - name: Install dependencies
+  #     run: |
+  #       python -m pip install --upgrade pip
+  #       python -m pip install coverage
+  #   - name: Download all artifacts
+  #     # Downloads coverage1, coverage2, etc.
+  #     uses: actions/download-artifact@v4
+  #   - name: Convert coverage
+  #     run: |
+  #       coverage combine coverage*/.coverage*
+  #       # coverage report --fail-under=95
+  #       coverage xml
+  #   - name: upload coverage to Codecov
+  #     uses: codecov/codecov-action@v4
+  #     with:
+  #       fail_ci_if_error: true
diff --git a/churn_pred/preprocessing/preprocess.py b/churn_pred/preprocessing/preprocess.py
@@ -104,6 +104,10 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
 
         dfc = df.drop(columns=self.id_cols).copy()
 
+        # Cast explicitly, because MLflow inference receives all the data as objects
+        dfc[self.cat_cols] = dfc[self.cat_cols].astype(str)
+        dfc[self.final_cont_cols] = dfc[self.final_cont_cols].astype(float)
+
         try:
             dfc = dfc[self.cat_cols + self.final_cont_cols + [self.target_col]]
         except KeyError:

diff --git a/churn_pred/training/_base.py b/churn_pred/training/_base.py
@@ -2,6 +2,7 @@
 from typing import Any, Dict, List, Tuple, Union, Literal, Callable, Optional
 
 import numpy as np
+import mlflow
 import pandas as pd
 import lightgbm as lgb
 from lightgbm import Dataset as lgbDataset
@@ -54,7 +55,7 @@ def __init__(
         )
 
 
-class BaseTrainer(Base):
+class BaseTrainer(Base, mlflow.pyfunc.PythonModel):
     def __init__(
         self,
         cat_cols: List[str],
@@ -124,15 +125,22 @@ def fit(
         """
         raise NotImplementedError("Trainer must implement a 'fit' method")
 
-    def predict(self, df: pd.DataFrame, raw_score: bool = True) -> pd.DataFrame:
+    def predict(
+        self, context: Optional[dict], df: pd.DataFrame, raw_score: bool = True
+    ) -> pd.DataFrame:
         """Predict.
 
         Args:
             df (pd.DataFrame): dataset
             raw_score (bool): whether to return raw output
+            context (dict): for compatibility with mlflow model methods
         Returns:
             preds_raw (np.ndarray):
         """
+        # for mlflow inference service testing
+        if type(df) is dict:
+            df = pd.DataFrame.from_dict(df, orient="index").transpose()
+
         if self.preprocessors:
             for prep in self.preprocessors:
                 df = prep.transform(df)

diff --git a/churn_pred/training/utils.py b/churn_pred/training/utils.py
@@ -1,11 +1,50 @@
 from typing import List, Tuple, Literal, Optional
+from collections.abc import MutableMapping
 
 import numpy as np
+import mlflow
 import pandas as pd
 import lightgbm as lgb
 from lightgbm import Dataset as lgbDataset
 
 
+def flatten_dict(
+    d: MutableMapping, parent_key: str = "", sep: str = "_"
+) -> MutableMapping:
+    """Flatten a nested mapping into a single-level dict.
+
+    Keys of nested mappings are joined to their parent key with ``sep``, e.g.
+    ``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``. Approach taken from
+    https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/
+
+    Args:
+        d (MutableMapping): possibly nested mapping to flatten
+        parent_key (str): prefix prepended to every key of ``d``; empty for
+            the top-level call
+        sep (str): separator placed between parent and child keys
+    Returns:
+        flat (dict): flattened copy of ``d``; an empty nested mapping
+            contributes no entries
+    """
+    flat = {}
+    for k, v in d.items():
+        # Build the key with an f-string rather than ``+`` so that non-string
+        # keys (e.g. ints) nested under a parent do not raise TypeError.
+        # Top-level keys are kept exactly as they are.
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, MutableMapping):
+            flat.update(flatten_dict(v, new_key, sep=sep))
+        else:
+            flat[new_key] = v
+    return flat
+
+
+def get_or_create_experiment(experiment_name):
+    """Return the ID of the MLflow experiment called ``experiment_name``.
+
+    The experiment is looked up by name; when the lookup yields nothing, a
+    new experiment with that name is created instead.
+
+    Args:
+        experiment_name (str): name of the MLflow experiment
+    Returns:
+        experiment_id (str): ID of the existing or newly created experiment
+    """
+    existing = mlflow.get_experiment_by_name(experiment_name)
+    if not existing:
+        return mlflow.create_experiment(experiment_name)
+    return existing.experiment_id
+
+
 def to_lgbdataset(
     train: pd.DataFrame,
     cat_cols: List[str],