Commit
1 parent 08bfc7a · commit 5e6560c
Showing 27 changed files with 798 additions and 0 deletions.
@@ -0,0 +1,18 @@
include *.txt
include *.md
include *.pkl
recursive-include classification_model *

include classification_model/datasets/train.csv
include classification_model/datasets/test.csv
include classification_model/trained_models/*.pkl
include classification_model/VERSION
include classification_model/config.yml

include ./requirements/requirements.txt
include ./requirements/test_requirements.txt
exclude *.log
exclude *.cfg

recursive-exclude * __pycache__
recursive-exclude * *.py[co]
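As a quick sanity check that these MANIFEST rules actually capture the data files, you can list the contents of a built source distribution. A minimal sketch, assuming an sdist has already been produced into `dist/` (e.g. with `python -m build --sdist`):

```python
# List the sdist contents and print the packaged data files.
# Assumes a single .tar.gz has been built into dist/ (illustrative location).
import tarfile
from pathlib import Path

sdist = next(Path("dist").glob("*.tar.gz"))
with tarfile.open(sdist) as archive:
    for name in archive.getnames():
        if name.endswith((".csv", ".pkl", ".yml", "VERSION")):
            print(name)
```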
@@ -0,0 +1,16 @@
# Productionized Titanic Classification Model Package

## Run With Tox (Recommended)
- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
- Save the file as `raw.csv` in the `classification_model/datasets` directory
- `pip install tox`
- Make sure you are in the assignment-section-05 directory (where the `tox.ini` file is), then run: `tox`. This runs the tests and type checks and trains the model under the hood. The first time you run it, tox creates a virtual environment and installs the dependencies, so it takes a few minutes.

## Run Without Tox
- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
- Save the file as `raw.csv` in the `classification_model/datasets` directory
- Add the assignment-section-05 *and* classification_model paths to your system PYTHONPATH (see the sketch below)
- `pip install -r requirements/test_requirements.txt`
- Train the model: `python classification_model/train_pipeline.py`
- Run the tests: `pytest tests`
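The PYTHONPATH step above can also be done from inside a Python session; a minimal sketch, with illustrative paths that assume you are working from a checkout of the repository:

```python
# Make the project and package importable for the current interpreter only.
# The directory names below are illustrative; adjust to your checkout location.
import sys
from pathlib import Path

repo_root = Path("assignment-section-05").resolve()
sys.path.extend([str(repo_root), str(repo_root / "classification_model")])
```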
@@ -0,0 +1 @@
0.0.1
@@ -0,0 +1,17 @@
import logging

from classification_model.config.core import PACKAGE_ROOT, config

# It is strongly advised that you do not add any handlers other than
# NullHandler to your library's loggers. This is because the configuration
# of handlers is the prerogative of the application developer who uses your
# library. The application developer knows their target audience and what
# handlers are most appropriate for their application: if you add handlers
# 'under the hood', you might well interfere with their ability to carry out
# unit tests and deliver logs which suit their requirements.
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())


with open(PACKAGE_ROOT / "VERSION") as version_file:
    __version__ = version_file.read().strip()
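Since the package only attaches a `NullHandler`, the consuming application decides where these log records end up. A minimal sketch of application-side configuration (not part of the package), assuming the package name set in `config.yml`:

```python
# Application code: route the package's log records through the app's own handlers.
import logging

logging.basicConfig(level=logging.INFO)  # the application chooses handlers/format
logging.getLogger("classification_model").setLevel(logging.DEBUG)
```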
@@ -0,0 +1,51 @@
# Package Overview
package_name: classification_model

# Data Files
raw_data_file: raw.csv
training_data_file: train.csv
test_data_file: test.csv

# Variables
# The variable we are attempting to predict (survival)
target: survived

pipeline_name: titanic_classification_model
pipeline_save_file: titanic_classification_model_output_v

features:
  - pclass
  - sex
  - age
  - sibsp
  - parch
  - fare
  - cabin
  - embarked
  - title  # generated from name

# set train/test split
test_size: 0.1

# to set the random seed
random_state: 0

unused_fields:
  - name
  - ticket
  - boat
  - body
  - home.dest

numerical_vars:
  - age
  - fare

categorical_vars:
  - sex
  - cabin
  - embarked
  - title

cabin_vars:
  - cabin
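The `pipeline_save_file` value is combined with the package version to name the persisted pipeline, as the prediction module below does:

```python
# e.g. "titanic_classification_model_output_v0.0.1.pkl"
from classification_model import __version__ as _version
from classification_model.config.core import config

pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
print(pipeline_file_name)
```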
Empty file.
@@ -0,0 +1,84 @@
from pathlib import Path
from typing import Sequence

from pydantic import BaseModel
from strictyaml import YAML, load

import classification_model

# Project Directories
PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
ROOT = PACKAGE_ROOT.parent
CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
DATASET_DIR = PACKAGE_ROOT / "datasets"
TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"


class AppConfig(BaseModel):
    """
    Application-level config.
    """

    package_name: str
    raw_data_file: str
    pipeline_save_file: str


class ModelConfig(BaseModel):
    """
    All configuration relevant to model
    training and feature engineering.
    """

    target: str
    unused_fields: Sequence[str]
    features: Sequence[str]
    test_size: float
    random_state: int
    numerical_vars: Sequence[str]
    categorical_vars: Sequence[str]
    cabin_vars: Sequence[str]


class Config(BaseModel):
    """Master config object."""

    app_config: AppConfig
    model_config: ModelConfig


def find_config_file() -> Path:
    """Locate the configuration file."""
    if CONFIG_FILE_PATH.is_file():
        return CONFIG_FILE_PATH
    raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}")


def fetch_config_from_yaml(cfg_path: Path = None) -> YAML:
    """Parse YAML containing the package configuration."""

    if not cfg_path:
        cfg_path = find_config_file()

    if cfg_path:
        with open(cfg_path, "r") as conf_file:
            parsed_config = load(conf_file.read())
            return parsed_config
    raise OSError(f"Did not find config file at path: {cfg_path}")


def create_and_validate_config(parsed_config: YAML = None) -> Config:
    """Run validation on config values."""
    if parsed_config is None:
        parsed_config = fetch_config_from_yaml()

    # specify the data attribute from the strictyaml YAML type.
    _config = Config(
        app_config=AppConfig(**parsed_config.data),
        model_config=ModelConfig(**parsed_config.data),
    )

    return _config


config = create_and_validate_config()
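Because `config` is created at import time, the values validated above are available as typed attributes anywhere in the package; a brief usage sketch:

```python
from classification_model.config.core import config

print(config.model_config.target)          # "survived"
print(config.model_config.test_size)       # 0.1
print(list(config.model_config.features))  # ["pclass", "sex", "age", ...]
```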
Empty file.
@@ -0,0 +1,64 @@
# for encoding categorical variables
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder

# for imputation
from feature_engine.imputation import (
    AddMissingIndicator,
    CategoricalImputer,
    MeanMedianImputer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from classification_model.config.core import config
from classification_model.processing.features import ExtractLetterTransformer

titanic_pipe = Pipeline(
    [
        # impute categorical variables with string missing
        (
            "categorical_imputation",
            CategoricalImputer(
                imputation_method="missing",
                variables=config.model_config.categorical_vars,
            ),
        ),
        # add missing indicator to numerical variables
        (
            "missing_indicator",
            AddMissingIndicator(variables=config.model_config.numerical_vars),
        ),
        # impute numerical variables with the median
        (
            "median_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=config.model_config.numerical_vars
            ),
        ),
        # Extract letter from cabin
        (
            "extract_letter",
            ExtractLetterTransformer(variables=config.model_config.cabin_vars),
        ),
        # == CATEGORICAL ENCODING ======
        # remove categories present in less than 5% of the observations (0.05)
        # group them in one category called 'Rare'
        (
            "rare_label_encoder",
            RareLabelEncoder(
                tol=0.05, n_categories=1, variables=config.model_config.categorical_vars
            ),
        ),
        # encode categorical variables using one hot encoding into k-1 variables
        (
            "categorical_encoder",
            OneHotEncoder(
                drop_last=True, variables=config.model_config.categorical_vars
            ),
        ),
        # scale
        ("scaler", StandardScaler()),
        ("Logit", LogisticRegression(C=0.0005, random_state=0)),
    ]
)
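A minimal training sketch for this pipeline. It assumes a prepared `train.csv` in the datasets directory that already contains the engineered `title` column; the actual training script added in this commit may differ:

```python
import pandas as pd

from classification_model.config.core import DATASET_DIR, config
from classification_model.pipeline import titanic_pipe

# Load the prepared training data (assumed to exist; see the README steps above).
data = pd.read_csv(DATASET_DIR / "train.csv")

# Fit the full preprocessing + logistic regression pipeline.
X = data[config.model_config.features]
y = data[config.model_config.target]
titanic_pipe.fit(X, y)
```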
@@ -0,0 +1,34 @@
import typing as t

import pandas as pd

from classification_model import __version__ as _version
from classification_model.config.core import config
from classification_model.processing.data_manager import load_pipeline
from classification_model.processing.validation import validate_inputs

pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
_titanic_pipe = load_pipeline(file_name=pipeline_file_name)


def make_prediction(
    *,
    input_data: t.Union[pd.DataFrame, dict],
) -> dict:
    """Make a prediction using a saved model pipeline."""

    data = pd.DataFrame(input_data)
    validated_data, errors = validate_inputs(input_data=data)
    results = {"predictions": None, "version": _version, "errors": errors}

    if not errors:
        predictions = _titanic_pipe.predict(
            X=validated_data[config.model_config.features]
        )
        results = {
            "predictions": predictions,
            "version": _version,
            "errors": errors,
        }

    return results
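A usage sketch for `make_prediction`, assuming the pipeline has already been trained and persisted; the field values are illustrative and the validation module decides what actually passes through:

```python
from classification_model.predict import make_prediction

# One illustrative passenger; column names follow the features list in config.yml.
sample = {
    "pclass": [3],
    "sex": ["male"],
    "age": [29.0],
    "sibsp": [0],
    "parch": [0],
    "fare": [7.25],
    "cabin": [None],
    "embarked": ["S"],
    "title": ["Mr"],
}

result = make_prediction(input_data=sample)
print(result["predictions"], result["errors"])
```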
Empty file.