Added mlflow
fmind committed Feb 22, 2024
1 parent 3a4473f commit aa2263b
Showing 14 changed files with 91 additions and 75 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -20,7 +20,6 @@ poetry.lock

# Project
/docs/*
-/mlruns/*
/outputs/*
!**/.gitkeep

1 change: 0 additions & 1 deletion docker-compose.yml
@@ -1,5 +1,4 @@
# https://docs.docker.com/compose/compose-file/
-# tracking/registry uri: http://localhost:5000
services:
mlflow:
image: ghcr.io/mlflow/mlflow:v2.10.2
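
With the composed server running (docker compose up), MLflow clients talk to it over HTTP instead of writing to a local mlruns folder. A minimal smoke test, assuming the server is reachable on the default port 5000:

    import mlflow

    # Port 5000 is an assumption based on the URI comment removed above.
    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("bikes")

    with mlflow.start_run(run_name="smoke-test"):
        mlflow.log_param("demo", True)
        mlflow.log_metric("score", 0.42)
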
16 changes: 7 additions & 9 deletions pyproject.toml
@@ -4,7 +4,7 @@

[tool.poetry]
name = "bikes"
-version = "0.1.0"
+version = "0.3.0"
description = "Predict the number of bikes available."
repository = "https://github.com/fmind/mlops-python-package"
authors = ["Médéric HURIER <[email protected]>"]
@@ -32,8 +32,12 @@ pyarrow = "^15.0.0"
pydantic = "^2.5.3"
pydantic-settings = "^2.1.0"
scikit-learn = "^1.4.0"
-# mlflow-skinny = "^2.10.2"
-mlflow = "^2.10.2"
+mlflow-skinny = "^2.10.2"
+
+[tool.poetry.group.dev.dependencies]
+invoke = "^2.2.0"
+ipykernel = "^6.29.0"
+pre-commit = "^3.6.0"

[tool.poetry.group.checkers.dependencies]
coverage = "^7.4.1"
@@ -44,11 +48,6 @@ pytest-cov = "^4.1.0"
pytest-xdist = "^3.5.0"
pandera = { extras = ["mypy"], version = "^0.18.0" }

-[tool.poetry.group.dev.dependencies]
-invoke = "^2.2.0"
-ipykernel = "^6.29.0"
-pre-commit = "^3.6.0"
-
[tool.poetry.group.documenters.dependencies]
pdoc = "^13.1.1"

@@ -61,7 +60,6 @@ nbformat = "^5.9.2"

# CONFIGURATIONS

-
[tool.black]
line-length = 120

6 changes: 3 additions & 3 deletions src/bikes/datasets.py
@@ -18,7 +18,7 @@ class Reader(abc.ABC, pdt.BaseModel, strict=True):
e.g., to read file, database, cloud storage, ...
Attributes:
-limit: maximum number of rows to read from dataset.
+limit (int, optional): maximum number of rows to read from dataset.
"""

KIND: str
@@ -38,7 +38,7 @@ class ParquetReader(Reader):
"""Read a dataframe from a parquet file.
Attributes:
-path: local or remote path to a dataset.
+path (str): local or remote path to a dataset.
"""

KIND: T.Literal["ParquetReader"] = "ParquetReader"
@@ -80,7 +80,7 @@ class ParquetWriter(Writer):
"""Write a dataframe to a parquet file.
Attributes:
-path: local or remote file to a dataset.
+path (str): local or remote file to a dataset.
"""

KIND: T.Literal["ParquetWriter"] = "ParquetWriter"
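
The reader/writer classes above hide IO behind small pydantic models. A hypothetical round trip, assuming read() and write() methods and invented paths (the diff only shows the docstrings):

    import pandas as pd

    from bikes import datasets

    reader = datasets.ParquetReader(path="data/inputs.parquet", limit=100)
    inputs = reader.read()  # a pandas DataFrame, capped at 100 rows by limit
    assert isinstance(inputs, pd.DataFrame)

    writer = datasets.ParquetWriter(path="outputs/predictions.parquet")
    writer.write(inputs)
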
46 changes: 23 additions & 23 deletions src/bikes/jobs.py
@@ -26,12 +26,12 @@ class Job(abc.ABC, pdt.BaseModel, strict=True):
e.g., to define common services like logger
Attributes:
-logger_service: manage the logging system.
+logger_service (services.LoggerService): manage the logging system.
+mlflow_service (services.MLflowService): manage the mlflow system.
"""

KIND: str

# services
logger_service: services.LoggerService = services.LoggerService()
+mlflow_service: services.MLflowService = services.MLflowService()

@@ -75,14 +75,14 @@ class TuningJob(Job):
"""Find the best hyperparameters for a model.
Attributes:
-run_name: name of the MLflow experiment run.
-inputs: dataset reader with inputs variables.
-targets: dataset reader with targets variables.
-results: dataset writer for searcher results.
-model: machine learning model to tune.
-metric: main metric for evaluation.
-splitter: splitter for datasets.
-searcher: searcher algorithm.
+run_name (str): name of the MLflow experiment run.
+inputs (datasets.ReaderKind): dataset reader with inputs variables.
+targets (datasets.ReaderKind): dataset reader with targets variables.
+results (datasets.WriterKind): dataset writer for searcher results.
+model (models.ModelKind): machine learning model to tune.
+metric (metrics.MetricKind): main metric for evaluation.
+splitter (splitters.SplitterKind): splitter for datasets.
+searcher (searchers.SearcherKind): searcher algorithm.
"""

KIND: T.Literal["TuningJob"] = "TuningJob"
@@ -146,15 +146,15 @@ class TrainingJob(Job):
"""Train and register a single AI/ML model
Attributes:
-run_name: name of the MLflow experiment run.
-inputs: dataset reader with inputs variables.
-targets: dataset reader with targets variables.
-saver: save the trained model in registry.
-model: machine learning model to tune.
-signer: signer for the trained model.
-scorers: metrics for the evaluation.
-splitter: splitter for datasets.
-registry_alias: alias of model.
+run_name (str): name of the MLflow experiment run.
+inputs (datasets.ReaderKind): dataset reader with inputs variables.
+targets (datasets.ReaderKind): dataset reader with targets variables.
+saver (registers.SaverKind): save the trained model in registry.
+model (models.ModelKind): machine learning model to tune.
+signer (registers.SignerKind): signer for the trained model.
+scorers (list[metrics.MetricKind]): metrics for the evaluation.
+splitter (splitters.SplitterKind): splitter for datasets.
+registry_alias (str): alias of model.
"""

KIND: T.Literal["TrainingJob"] = "TrainingJob"
@@ -244,10 +244,10 @@ class InferenceJob(Job):
"""Load a model and generate predictions.
Attributes:
-inputs: dataset reader with inputs variables.
-outputs: dataset writer for the model outputs.
-registry_alias: alias of the model to load.
-loader: load the model from registry.
+inputs (datasets.ReaderKind): dataset reader with inputs variables.
+outputs (datasets.WriterKind): dataset writer for the model outputs.
+registry_alias (str): alias of the model to load.
+loader (registers.LoaderKind): load the model from registry.
"""

KIND: T.Literal["InferenceJob"] = "InferenceJob"
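
Each job is a pydantic model discriminated by its KIND field, and every job now carries an mlflow_service alongside the logger_service. A hypothetical TrainingJob invocation, assuming a run() entrypoint and defaults for the fields not shown (model, splitter, saver, and so on):

    from bikes import datasets, jobs

    job = jobs.TrainingJob(
        run_name="Training",
        inputs=datasets.ParquetReader(path="data/inputs.parquet"),
        targets=datasets.ParquetReader(path="data/targets.parquet"),
    )
    job.run()  # assumed to start the logger/mlflow services, train, and register the model
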
6 changes: 3 additions & 3 deletions src/bikes/metrics.py
@@ -20,7 +20,7 @@ class Metric(abc.ABC, pdt.BaseModel, strict=True):
e.g., accuracy, precision, recall, mae, f1, ...
Attributes:
-name: name of the metric.
+name (str): name of the metric.
"""

KIND: str
@@ -59,8 +59,8 @@ class SklearnMetric(Metric):
"""Compute metrics with sklearn.
Attributes:
-name: name of the sklearn metric.
-greater_is_better: maximize or minimize.
+name (str): name of the sklearn metric.
+greater_is_better (bool): maximize or minimize.
"""

KIND: T.Literal["SklearnMetric"] = "SklearnMetric"
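
The name/greater_is_better pair mirrors sklearn's scoring conventions: resolve the metric by name, then flip the sign when lower is better so that searchers can always maximize. A standalone sketch of that idea (the actual SklearnMetric body is not shown in this diff):

    from sklearn import metrics as sk_metrics

    def score(name: str, greater_is_better: bool, y_true, y_pred) -> float:
        metric = getattr(sk_metrics, name)  # e.g., "mean_squared_error"
        value = metric(y_true, y_pred)
        # flip the sign so that higher is always better for comparisons
        return value if greater_is_better else -value

    print(score("mean_squared_error", False, [1, 2, 3], [1, 2, 4]))  # -0.333...
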
6 changes: 3 additions & 3 deletions src/bikes/models.py
@@ -82,9 +82,9 @@ class BaselineSklearnModel(Model):
"""Simple baseline model built on top of sklearn.
Attributes:
-max_depth: maximum depth of the random forest.
-n_estimators: number of estimators in the random forest.
-random_state: random state of the machine learning pipeline.
+max_depth (int): maximum depth of the random forest.
+n_estimators (int): number of estimators in the random forest.
+random_state (int, optional): random state of the machine learning pipeline.
"""

KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel"
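
The three documented attributes map one-to-one onto sklearn's RandomForestRegressor arguments. A plausible core for the model, with the scaling step being an assumption since the pipeline itself is outside this diff:

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    pipeline = make_pipeline(
        StandardScaler(),
        RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42),
    )
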
10 changes: 5 additions & 5 deletions src/bikes/schemas.py
@@ -20,12 +20,12 @@ class Config:
"""Default configuration.
Attributes:
-coerce: convert data type if possible.
-strict: ensure the data type is correct.
+coerce (bool): convert data type if possible.
+strict (bool): ensure the data type is correct.
"""

-coerce = True
-strict = True
+coerce: bool = True
+strict: bool = True

@classmethod
def check(cls, data: pd.DataFrame, **kwargs):
@@ -35,7 +35,7 @@ def check(cls, data: pd.DataFrame, **kwargs):
data (pd.DataFrame): dataframe to check.
Returns:
-_type_: validated dataframe with schema.
+pd.DataFrame: validated dataframe with schema.
"""
return cls.validate(data, **kwargs)

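
Annotating the Config attributes (coerce: bool, strict: bool) keeps them valid class attributes under pandera's class-based API. A self-contained sketch of the pattern with an invented column:

    import pandas as pd
    import pandera as pa

    class InputsSchema(pa.DataFrameModel):
        hour: pa.typing.Series[int] = pa.Field(ge=0, le=23)

        class Config:
            coerce: bool = True  # convert data type if possible
            strict: bool = True  # ensure the data type is correct

    data = pd.DataFrame({"hour": ["5", "17"]})  # strings are coerced to int
    validated = InputsSchema.validate(data)
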
2 changes: 1 addition & 1 deletion src/bikes/scripts.py
@@ -17,7 +17,7 @@ class Settings(pdts.BaseSettings, strict=True):
"""Settings for the program.
Attributes:
-job: job associated with the settings.
+job (jobs.JobKind): job associated with the settings.
"""

job: jobs.JobKind = pdt.Field(..., discriminator="KIND")
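
The discriminator="KIND" field is what lets one settings object deserialize into any concrete job class. A reduced sketch of the mechanism with stand-in job models (the real ones live in bikes.jobs):

    import typing as T

    import pydantic as pdt
    import pydantic_settings as pdts

    class TuningJob(pdt.BaseModel):
        KIND: T.Literal["TuningJob"] = "TuningJob"

    class TrainingJob(pdt.BaseModel):
        KIND: T.Literal["TrainingJob"] = "TrainingJob"

    class Settings(pdts.BaseSettings):
        job: TuningJob | TrainingJob = pdt.Field(..., discriminator="KIND")

    settings = Settings.model_validate({"job": {"KIND": "TrainingJob"}})
    assert isinstance(settings.job, TrainingJob)
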
12 changes: 6 additions & 6 deletions src/bikes/searchers.py
@@ -63,12 +63,12 @@ class GridCVSearcher(Searcher):
"""Grid searcher with cross-folds.
Attributes:
-param_grid: mapping of param key -> values.
-n_jobs: number of jobs to run in parallel.
-refit: refit the model after the tuning.
-verbose: set the search verbosity level.
-error_score: strategy or value on error.
-return_train_score: include train scores.
+param_grid (Grid): mapping of param key -> values.
+n_jobs (int, optional): number of jobs to run in parallel.
+refit (bool): refit the model after the tuning.
+verbose (int): set the search verbosity level.
+error_score (str | float): strategy or value on error.
+return_train_score (bool): include train scores.
"""

KIND: T.Literal["GridCVSearcher"] = "GridCVSearcher"
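
Every attribute in the docstring lines up with a GridSearchCV parameter, which is presumably what the searcher wraps internally (the search implementation is outside this diff):

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    searcher = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid={"max_depth": [10, 20], "n_estimators": [100, 200]},
        n_jobs=-1,  # use all cores
        refit=True,  # refit the best model on the full data
        verbose=1,
        error_score="raise",
        return_train_score=False,
    )
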
38 changes: 26 additions & 12 deletions src/bikes/services.py
@@ -36,14 +36,14 @@ class LoggerService(Service):
https://loguru.readthedocs.io/en/stable/api/logger.html
Attributes:
-sink: logging output.
-level: logging level.
-format: logging format.
-colorize: colorize output.
-serialize: convert to JSON.
-backtrace: enable exception trace.
-diagnose: enable variable display.
-catch: catch errors during log handling.
+sink (str): logging output.
+level (str): logging level.
+format (str): logging format.
+colorize (bool): colorize output.
+serialize (bool): convert to JSON.
+backtrace (bool): enable exception trace.
+diagnose (bool): enable variable display.
+catch (bool): catch errors during log handling.
"""

sink: str = "stderr"
@@ -79,7 +79,22 @@ def start(self) -> None:


class MLflowService(Service):
"""Service for MLflow tracking and registry."""
"""Service for MLflow tracking and registry.
Attributes:
autolog_disable (bool): disable autologging.
autolog_disable_for_unsupported_versions (bool): disable autologging for unsupported versions.
autolog_exclusive (bool): If True, enables exclusive autologging.
autolog_log_input_examples (bool): If True, logs input examples during autologging.
autolog_log_model_signatures (bool): If True, logs model signatures during autologging.
autolog_log_models (bool): If True, enables logging of models during autologging.
autolog_log_datasets (bool): If True, logs datasets used during autologging.
autolog_silent (bool): If True, suppresses all MLflow warnings during autologging.
tracking_uri (str): The URI for the MLflow tracking server.
experiment_name (str): The name of the experiment to log runs under.
registry_uri (str): The URI for the MLflow model registry.
registry_name (str): The name of the registry.
"""

# autolog
autolog_disable: bool = False
Expand All @@ -91,11 +106,10 @@ class MLflowService(Service):
autolog_log_datasets: bool = True
autolog_silent: bool = False
# tracking
tracking_uri: str = "./mlruns"
-tracking_uri: str = "./mlruns"
+tracking_uri: str = "http://localhost:5000"
experiment_tags: dict[str, T.Any] | None = None
# registry
-registry_uri: str = "./mlruns"
+registry_uri: str = "http://localhost:5000"
registry_name: str = "bikes"

def start(self):
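
The tracking and registry URIs now default to the composed server instead of a local mlruns folder, and the autolog_* attributes map directly onto mlflow.autolog arguments. A sketch of what start() plausibly does with them (the method body is not shown in this diff):

    import mlflow

    def start(service) -> None:
        mlflow.set_tracking_uri(service.tracking_uri)  # http://localhost:5000
        mlflow.set_registry_uri(service.registry_uri)
        mlflow.set_experiment(service.experiment_name)  # "bikes"
        mlflow.autolog(
            disable=service.autolog_disable,
            disable_for_unsupported_versions=service.autolog_disable_for_unsupported_versions,
            exclusive=service.autolog_exclusive,
            log_input_examples=service.autolog_log_input_examples,
            log_model_signatures=service.autolog_log_model_signatures,
            log_models=service.autolog_log_models,
            log_datasets=service.autolog_log_datasets,
            silent=service.autolog_silent,
        )
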
12 changes: 6 additions & 6 deletions src/bikes/splitters.py
@@ -62,9 +62,9 @@ class TrainTestSplitter(Splitter):
"""Split a dataframe into a train and test subsets.
Attributes:
-shuffle: shuffle dataset before splitting.
-test_size: number or ratio for the test dataset.
-random_state: random state for the splitter object.
+shuffle (bool): shuffle dataset before splitting.
+test_size (int | float): number or ratio for the test dataset.
+random_state (int): random state for the splitter object.
"""

KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter"
@@ -90,9 +90,9 @@ class TimeSeriesSplitter(Splitter):
"""Split a dataframe into fixed time series subsets.
Attributes:
-gap: gap between splits.
-n_splits: number of split to generate.
-test_size: number or ratio for the test dataset.
+gap (int): gap between splits.
+n_splits (int): number of splits to generate.
+test_size (int | float): number or ratio for the test dataset.
"""

KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter"
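
Both splitters describe sklearn primitives: train_test_split for the random hold-out and TimeSeriesSplit for ordered folds. A standalone sketch of the two (the wrapper code itself is not part of this diff):

    import numpy as np
    from sklearn.model_selection import TimeSeriesSplit, train_test_split

    X = np.arange(100).reshape(-1, 1)

    # random hold-out: 80/20 split
    train, test = train_test_split(X, shuffle=True, test_size=0.2, random_state=42)

    # ordered folds: 4 splits with a fixed test window of 24 samples
    tscv = TimeSeriesSplit(gap=0, n_splits=4, test_size=24)
    for train_idx, test_idx in tscv.split(X):
        print(len(train_idx), len(test_idx))  # growing train set, fixed test set
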
6 changes: 6 additions & 0 deletions tasks/containers.py
@@ -12,6 +12,12 @@
# %% TASKS


+@task
+def compose(ctx: Context) -> None:
+    """Start docker compose."""
+    ctx.run("docker compose up")
+

@task(pre=[packages.build])
def build(ctx: Context) -> None:
"""Build the container image."""
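
Once this module is mounted in the invoke namespace, starting the MLflow server should be one command away, e.g. invoke containers.compose (the collection name containers is an assumption based on the file path).
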
4 changes: 2 additions & 2 deletions tasks/docs.py
@@ -15,13 +15,13 @@
@task
def api(ctx: Context) -> None:
"""Document the API with pdoc."""
ctx.run(f"poetry run pdoc -o docs/api src/{ctx.project.name}")
ctx.run(f"poetry run pdoc -d google -o docs/api src/{ctx.project.name}")


@task
def serve(ctx: Context) -> None:
"""Document the API with pdoc."""
ctx.run(f"poetry run pdoc src/{ctx.project.name}")
ctx.run(f"poetry run pdoc -d google src/{ctx.project.name}")


@task(pre=[cleans.docs, api], default=True)
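
The new -d google flag tells pdoc which docstring format to parse, so the Attributes: sections annotated throughout this commit should render as structured documentation instead of plain text.
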
