Added mlflow
fmind committed Feb 22, 2024
1 parent 3a4473f commit aa2263b
Showing 14 changed files with 91 additions and 75 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -20,7 +20,6 @@ poetry.lock

# Project
/docs/*
-/mlruns/*
/outputs/*
!**/.gitkeep

1 change: 0 additions & 1 deletion docker-compose.yml
@@ -1,5 +1,4 @@
# https://docs.docker.com/compose/compose-file/
-# tracking/registry uri: http://localhost:5000
services:
mlflow:
image: ghcr.io/mlflow/mlflow:v2.10.2
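
With the composed server running (docker compose up), MLflow clients talk to it over HTTP instead of writing to a local mlruns folder. A minimal smoke test, assuming the server is reachable on the default port 5000:

    import mlflow

    # Port 5000 is an assumption based on the URI comment removed above.
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("bikes")

    with mlflow.start_run(run_name="smoke-test"):
        mlflow.log_param("demo", True)
        mlflow.log_metric("score", 0.42)
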
16 changes: 7 additions & 9 deletions pyproject.toml
@@ -4,7 +4,7 @@

[tool.poetry]
name = "bikes"
-version = "0.1.0"
+version = "0.3.0"
description = "Predict the number of bikes available."
repository = "https://github.com/fmind/mlops-python-package"
authors = ["Médéric HURIER <[email protected]>"]
@@ -32,8 +32,12 @@ pyarrow = "^15.0.0"
pydantic = "^2.5.3"
pydantic-settings = "^2.1.0"
scikit-learn = "^1.4.0"
-# mlflow-skinny = "^2.10.2"
-mlflow = "^2.10.2"
+mlflow-skinny = "^2.10.2"
+
+[tool.poetry.group.dev.dependencies]
+invoke = "^2.2.0"
+ipykernel = "^6.29.0"
+pre-commit = "^3.6.0"

[tool.poetry.group.checkers.dependencies]
coverage = "^7.4.1"
@@ -44,11 +48,6 @@ pytest-cov = "^4.1.0"
pytest-xdist = "^3.5.0"
pandera = { extras = ["mypy"], version = "^0.18.0" }

-[tool.poetry.group.dev.dependencies]
-invoke = "^2.2.0"
-ipykernel = "^6.29.0"
-pre-commit = "^3.6.0"
-
[tool.poetry.group.documenters.dependencies]
pdoc = "^13.1.1"

@@ -61,7 +60,6 @@ nbformat = "^5.9.2"

# CONFIGURATIONS

-
[tool.black]
line-length = 120

6 changes: 3 additions & 3 deletions src/bikes/datasets.py
@@ -18,7 +18,7 @@ class Reader(abc.ABC, pdt.BaseModel, strict=True):
e.g., to read file, database, cloud storage, ...
Attributes:
-limit: maximum number of rows to read from dataset.
+limit (int, optional): maximum number of rows to read from dataset.
"""

KIND: str
@@ -38,7 +38,7 @@ class ParquetReader(Reader):
"""Read a dataframe from a parquet file.
Attributes:
-path: local or remote path to a dataset.
+path (str): local or remote path to a dataset.
"""

KIND: T.Literal["ParquetReader"] = "ParquetReader"
@@ -80,7 +80,7 @@ class ParquetWriter(Writer):
"""Write a dataframe to a parquet file.
Attributes:
-path: local or remote file to a dataset.
+path (str): local or remote file to a dataset.
"""

KIND: T.Literal["ParquetWriter"] = "ParquetWriter"
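
The reader/writer classes above hide IO behind small pydantic models. A hypothetical round trip, assuming read() and write() methods and invented paths (the diff only shows the docstrings):

    import pandas as pd

    from bikes import datasets

    reader = datasets.ParquetReader(path="data/inputs.parquet", limit=100)
    inputs = reader.read()  # a pandas DataFrame, capped at 100 rows by limit
    assert isinstance(inputs, pd.DataFrame)

    writer = datasets.ParquetWriter(path="outputs/predictions.parquet")
    writer.write(inputs)
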
46 changes: 23 additions & 23 deletions src/bikes/jobs.py
@@ -26,12 +26,12 @@ class Job(abc.ABC, pdt.BaseModel, strict=True):
e.g., to define common services like logger
Attributes:
-logger_service: manage the logging system.
+logger_service (services.LoggerService): manage the logging system.
+mlflow_service (services.MLflowService): manage the mlflow system.
"""

KIND: str

# services
logger_service: services.LoggerService = services.LoggerService()
+mlflow_service: services.MLflowService = services.MLflowService()

@@ -75,14 +75,14 @@ class TuningJob(Job):
"""Find the best hyperparameters for a model.
Attributes:
-run_name: name of the MLflow experiment run.
-inputs: dataset reader with inputs variables.
-targets: dataset reader with targets variables.
-results: dataset writer for searcher results.
-model: machine learning model to tune.
-metric: main metric for evaluation.
-splitter: splitter for datasets.
-searcher: searcher algorithm.
+run_name (str): name of the MLflow experiment run.
+inputs (datasets.ReaderKind): dataset reader with inputs variables.
+targets (datasets.ReaderKind): dataset reader with targets variables.
+results (datasets.WriterKind): dataset writer for searcher results.
+model (models.ModelKind): machine learning model to tune.
+metric (metrics.MetricKind): main metric for evaluation.
+splitter (splitters.SplitterKind): splitter for datasets.
+searcher (searchers.SearcherKind): searcher algorithm.
"""

KIND: T.Literal["TuningJob"] = "TuningJob"
@@ -146,15 +146,15 @@ class TrainingJob(Job):
"""Train and register a single AI/ML model
Attributes:
-run_name: name of the MLflow experiment run.
-inputs: dataset reader with inputs variables.
-targets: dataset reader with targets variables.
-saver: save the trained model in registry.
-model: machine learning model to tune.
-signer: signer for the trained model.
-scorers: metrics for the evaluation.
-splitter: splitter for datasets.
-registry_alias: alias of model.
+run_name (str): name of the MLflow experiment run.
+inputs (datasets.ReaderKind): dataset reader with inputs variables.
+targets (datasets.ReaderKind): dataset reader with targets variables.
+saver (registers.SaverKind): save the trained model in registry.
+model (models.ModelKind): machine learning model to tune.
+signer (registers.SignerKind): signer for the trained model.
+scorers (list[metrics.MetricKind]): metrics for the evaluation.
+splitter (splitters.SplitterKind): splitter for datasets.
+registry_alias (str): alias of model.
"""

KIND: T.Literal["TrainingJob"] = "TrainingJob"
@@ -244,10 +244,10 @@ class InferenceJob(Job):
"""Load a model and generate predictions.
Attributes:
-inputs: dataset reader with inputs variables.
-outputs: dataset writer for the model outputs.
-registry_alias: alias of the model to load.
-loader: load the model from registry.
+inputs (datasets.ReaderKind): dataset reader with inputs variables.
+outputs (datasets.WriterKind): dataset writer for the model outputs.
+registry_alias (str): alias of the model to load.
+loader (registers.LoaderKind): load the model from registry.
"""

KIND: T.Literal["InferenceJob"] = "InferenceJob"
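
Each job is a pydantic model discriminated by its KIND field, and every job now carries an mlflow_service alongside the logger_service. A hypothetical TrainingJob invocation, assuming a run() entrypoint and defaults for the fields not shown (model, splitter, saver, and so on):

    from bikes import datasets, jobs

    job = jobs.TrainingJob(
        run_name="Training",
        inputs=datasets.ParquetReader(path="data/inputs.parquet"),
        targets=datasets.ParquetReader(path="data/targets.parquet"),
    )
    job.run()  # assumed to start the logger/mlflow services, train, and register the model
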
6 changes: 3 additions & 3 deletions src/bikes/metrics.py
@@ -20,7 +20,7 @@ class Metric(abc.ABC, pdt.BaseModel, strict=True):
e.g., accuracy, precision, recall, mae, f1, ...
Attributes:
-name: name of the metric.
+name (str): name of the metric.
"""

KIND: str
@@ -59,8 +59,8 @@ class SklearnMetric(Metric):
"""Compute metrics with sklearn.
Attributes:
-name: name of the sklearn metric.
-greater_is_better: maximize or minimize.
+name (str): name of the sklearn metric.
+greater_is_better (bool): maximize or minimize.
"""

KIND: T.Literal["SklearnMetric"] = "SklearnMetric"
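
The name/greater_is_better pair mirrors sklearn's scoring conventions: resolve the metric by name, then flip the sign when lower is better so that searchers can always maximize. A standalone sketch of that idea (the actual SklearnMetric body is not shown in this diff):

    from sklearn import metrics as sk_metrics

    def score(name: str, greater_is_better: bool, y_true, y_pred) -> float:
        metric = getattr(sk_metrics, name)  # e.g., "mean_squared_error"
        value = metric(y_true, y_pred)
        # flip the sign so that higher is always better for comparisons
        return value if greater_is_better else -value

    print(score("mean_squared_error", False, [1, 2, 3], [1, 2, 4]))  # -0.333...
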
6 changes: 3 additions & 3 deletions src/bikes/models.py
@@ -82,9 +82,9 @@ class BaselineSklearnModel(Model):
"""Simple baseline model built on top of sklearn.
Attributes:
-max_depth: maximum depth of the random forest.
-n_estimators: number of estimators in the random forest.
-random_state: random state of the machine learning pipeline.
+max_depth (int): maximum depth of the random forest.
+n_estimators (int): number of estimators in the random forest.
+random_state (int, optional): random state of the machine learning pipeline.
"""

KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel"
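
The three documented attributes map one-to-one onto sklearn's RandomForestRegressor arguments. A plausible core for the model, with the scaling step being an assumption since the pipeline itself is outside this diff:

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    pipeline = make_pipeline(
        StandardScaler(),
        RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42),
    )
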
10 changes: 5 additions & 5 deletions src/bikes/schemas.py
@@ -20,12 +20,12 @@ class Config:
"""Default configuration.
Attributes:
-coerce: convert data type if possible.
-strict: ensure the data type is correct.
+coerce (bool): convert data type if possible.
+strict (bool): ensure the data type is correct.
"""

-coerce = True
-strict = True
+coerce: bool = True
+strict: bool = True

@classmethod
def check(cls, data: pd.DataFrame, **kwargs):
@@ -35,7 +35,7 @@ def check(cls, data: pd.DataFrame, **kwargs):
data (pd.DataFrame): dataframe to check.
Returns:
-_type_: validated dataframe with schema.
+pd.DataFrame: validated dataframe with schema.
"""
return cls.validate(data, **kwargs)

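
Annotating the Config attributes (coerce: bool, strict: bool) keeps them valid class attributes under pandera's class-based API. A self-contained sketch of the pattern with an invented column:

    import pandas as pd
    import pandera as pa

    class InputsSchema(pa.DataFrameModel):
        hour: pa.typing.Series[int] = pa.Field(ge=0, le=23)

        class Config:
            coerce: bool = True  # convert data type if possible
            strict: bool = True  # ensure the data type is correct

    data = pd.DataFrame({"hour": ["5", "17"]})  # strings are coerced to int
    validated = InputsSchema.validate(data)
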
2 changes: 1 addition & 1 deletion src/bikes/scripts.py
@@ -17,7 +17,7 @@ class Settings(pdts.BaseSettings, strict=True):
"""Settings for the program.
Attributes:
-job: job associated with the settings.
+job (jobs.JobKind): job associated with the settings.
"""

job: jobs.JobKind = pdt.Field(..., discriminator="KIND")
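
The discriminator="KIND" field is what lets one settings object deserialize into any concrete job class. A reduced sketch of the mechanism with stand-in job models (the real ones live in bikes.jobs):

    import typing as T

    import pydantic as pdt
    import pydantic_settings as pdts

    class TuningJob(pdt.BaseModel):
        KIND: T.Literal["TuningJob"] = "TuningJob"

    class TrainingJob(pdt.BaseModel):
        KIND: T.Literal["TrainingJob"] = "TrainingJob"

    class Settings(pdts.BaseSettings):
        job: TuningJob | TrainingJob = pdt.Field(..., discriminator="KIND")

    settings = Settings.model_validate({"job": {"KIND": "TrainingJob"}})
    assert isinstance(settings.job, TrainingJob)
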
12 changes: 6 additions & 6 deletions src/bikes/searchers.py
@@ -63,12 +63,12 @@ class GridCVSearcher(Searcher):
"""Grid searcher with cross-folds.
Attributes:
-param_grid: mapping of param key -> values.
-n_jobs: number of jobs to run in parallel.
-refit: refit the model after the tuning.
-verbose: set the search verbosity level.
-error_score: strategy or value on error.
-return_train_score: include train scores.
+param_grid (Grid): mapping of param key -> values.
+n_jobs (int, optional): number of jobs to run in parallel.
+refit (bool): refit the model after the tuning.
+verbose (int): set the search verbosity level.
+error_score (str | float): strategy or value on error.
+return_train_score (bool): include train scores.
"""

KIND: T.Literal["GridCVSearcher"] = "GridCVSearcher"
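
Every attribute in the docstring lines up with a GridSearchCV parameter, which is presumably what the searcher wraps internally (the search implementation is outside this diff):

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    searcher = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid={"max_depth": [10, 20], "n_estimators": [100, 200]},
        n_jobs=-1,  # use all cores
        refit=True,  # refit the best model on the full data
        verbose=1,
        error_score="raise",
        return_train_score=False,
    )
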
38 changes: 26 additions & 12 deletions src/bikes/services.py
@@ -36,14 +36,14 @@ class LoggerService(Service):
https://loguru.readthedocs.io/en/stable/api/logger.html
Attributes:
-sink: logging output.
-level: logging level.
-format: logging format.
-colorize: colorize output.
-serialize: convert to JSON.
-backtrace: enable exception trace.
-diagnose: enable variable display.
-catch: catch errors during log handling.
+sink (str): logging output.
+level (str): logging level.
+format (str): logging format.
+colorize (bool): colorize output.
+serialize (bool): convert to JSON.
+backtrace (bool): enable exception trace.
+diagnose (bool): enable variable display.
+catch (bool): catch errors during log handling.
"""

sink: str = "stderr"
@@ -79,7 +79,22 @@ def start(self) -> None:


class MLflowService(Service):
"""Service for MLflow tracking and registry."""
"""Service for MLflow tracking and registry.
Attributes:
autolog_disable (bool): disable autologging.
autolog_disable_for_unsupported_versions (bool): disable autologging for unsupported versions.
autolog_exclusive (bool): If True, enables exclusive autologging.
autolog_log_input_examples (bool): If True, logs input examples during autologging.
autolog_log_model_signatures (bool): If True, logs model signatures during autologging.
autolog_log_models (bool): If True, enables logging of models during autologging.
autolog_log_datasets (bool): If True, logs datasets used during autologging.
autolog_silent (bool): If True, suppresses all MLflow warnings during autologging.
tracking_uri (str): The URI for the MLflow tracking server.
experiment_name (str): The name of the experiment to log runs under.
registry_uri (str): The URI for the MLflow model registry.
registry_name (str): The name of the registry.
"""

# autolog
autolog_disable: bool = False
Expand All @@ -91,11 +106,10 @@ class MLflowService(Service):
autolog_log_datasets: bool = True
autolog_silent: bool = False
# tracking
tracking_uri: str = "./mlruns"
-tracking_uri: str = "./mlruns"
+tracking_uri: str = "http://localhost:5000"
experiment_tags: dict[str, T.Any] | None = None
# registry
-registry_uri: str = "./mlruns"
+registry_uri: str = "http://localhost:5000"
registry_name: str = "bikes"

def start(self):
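
The tracking and registry URIs now default to the composed server instead of a local mlruns folder, and the autolog_* attributes map directly onto mlflow.autolog arguments. A sketch of what start() plausibly does with them (the method body is not shown in this diff):

    import mlflow

    def start(service) -> None:
        mlflow.set_tracking_uri(service.tracking_uri)  # http://localhost:5000
        mlflow.set_registry_uri(service.registry_uri)
        mlflow.set_experiment(service.experiment_name)  # "bikes"
        mlflow.autolog(
            disable=service.autolog_disable,
            disable_for_unsupported_versions=service.autolog_disable_for_unsupported_versions,
            exclusive=service.autolog_exclusive,
            log_input_examples=service.autolog_log_input_examples,
            log_model_signatures=service.autolog_log_model_signatures,
            log_models=service.autolog_log_models,
            log_datasets=service.autolog_log_datasets,
            silent=service.autolog_silent,
        )
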
12 changes: 6 additions & 6 deletions src/bikes/splitters.py
@@ -62,9 +62,9 @@ class TrainTestSplitter(Splitter):
"""Split a dataframe into a train and test subsets.
Attributes:
-shuffle: shuffle dataset before splitting.
-test_size: number or ratio for the test dataset.
-random_state: random state for the splitter object.
+shuffle (bool): shuffle dataset before splitting.
+test_size (int | float): number or ratio for the test dataset.
+random_state (int): random state for the splitter object.
"""

KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter"
@@ -90,9 +90,9 @@ class TimeSeriesSplitter(Splitter):
"""Split a dataframe into fixed time series subsets.
Attributes:
-gap: gap between splits.
-n_splits: number of split to generate.
-test_size: number or ratio for the test dataset.
+gap (int): gap between splits.
+n_splits (int): number of splits to generate.
+test_size (int | float): number or ratio for the test dataset.
"""

KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter"
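
Both splitters describe sklearn primitives: train_test_split for the random hold-out and TimeSeriesSplit for ordered folds. A standalone sketch of the two (the wrapper code itself is not part of this diff):

    import numpy as np
    from sklearn.model_selection import TimeSeriesSplit, train_test_split

    X = np.arange(100).reshape(-1, 1)

    # random hold-out: 80/20 split
    train, test = train_test_split(X, shuffle=True, test_size=0.2, random_state=42)

    # ordered folds: 4 splits with a fixed test window of 24 samples
    tscv = TimeSeriesSplit(gap=0, n_splits=4, test_size=24)
    for train_idx, test_idx in tscv.split(X):
        print(len(train_idx), len(test_idx))  # growing train set, fixed test set
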
6 changes: 6 additions & 0 deletions tasks/containers.py
@@ -12,6 +12,12 @@
# %% TASKS


+@task
+def compose(ctx: Context) -> None:
+    """Start docker compose."""
+    ctx.run("docker compose up")
+

@task(pre=[packages.build])
def build(ctx: Context) -> None:
"""Build the container image."""
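
Once this module is mounted in the invoke namespace, starting the MLflow server should be one command away, e.g. invoke containers.compose (the collection name containers is an assumption based on the file path).
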
4 changes: 2 additions & 2 deletions tasks/docs.py
@@ -15,13 +15,13 @@
@task
def api(ctx: Context) -> None:
"""Document the API with pdoc."""
ctx.run(f"poetry run pdoc -o docs/api src/{ctx.project.name}")
ctx.run(f"poetry run pdoc -d google -o docs/api src/{ctx.project.name}")


@task
def serve(ctx: Context) -> None:
"""Document the API with pdoc."""
ctx.run(f"poetry run pdoc src/{ctx.project.name}")
ctx.run(f"poetry run pdoc -d google src/{ctx.project.name}")


@task(pre=[cleans.docs, api], default=True)
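
The new -d google flag tells pdoc which docstring format to parse, so the Attributes: sections annotated throughout this commit should render as structured documentation instead of plain text.
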
