Skip to content

Commit

Permalink
Section5 assignment (#789)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristopherGS authored Jan 15, 2022
1 parent 08bfc7a commit 5e6560c
Show file tree
Hide file tree
Showing 27 changed files with 798 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ packages/regression_model/regression_model/datasets/*.zip
packages/regression_model/regression_model/datasets/*.txt
train.csv
test.csv
raw.csv
data_description.txt
house-prices-advanced-regression-techniques.zip
sample_submission.csv
Expand Down
18 changes: 18 additions & 0 deletions assignment-section-05/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
include *.txt
include *.md
include *.pkl
recursive-include ./classification_model/*

include classification_model/datasets/train.csv
include classification_model/datasets/test.csv
include classification_model/trained_models/*.pkl
include classification_model/VERSION
include classification_model/config.yml

include ./requirements/requirements.txt
include ./requirements/test_requirements.txt
exclude *.log
exclude *.cfg

recursive-exclude * __pycache__
recursive-exclude * *.py[co]
16 changes: 16 additions & 0 deletions assignment-section-05/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Productionized Titanic Classification Model Package

## Run With Tox (Recommended)
- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
- Save the file as `raw.csv` in the classification_model/datasets directory
- `pip install tox`
- Make sure you are in the assignment-section-05 directory (where the tox.ini file is), then run the command: `tox`. This runs the tests and type checks, and trains the model under the hood. The first run creates a virtual environment and installs the dependencies, so it takes a few minutes.

## Run Without Tox
- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
- Save the file as `raw.csv` in the classification_model/datasets directory
- Add assignment-section-05 *and* classification_model paths to your system PYTHONPATH
- `pip install -r requirements/test_requirements.txt`
- Train the model: `python classification_model/train_pipeline.py`
- Run the tests: `pytest tests`
1 change: 1 addition & 0 deletions assignment-section-05/classification_model/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.0.1
17 changes: 17 additions & 0 deletions assignment-section-05/classification_model/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import logging

from classification_model.config.core import PACKAGE_ROOT, config

# A library should attach nothing but a NullHandler to its own loggers.
# Choosing real handlers is the job of the application that consumes the
# library: its developer knows the audience and which handlers suit it.
# Handlers added 'under the hood' could interfere with that application's
# unit tests and with the logs it wants to deliver.
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
_package_logger = logging.getLogger(config.app_config.package_name)
_package_logger.addHandler(logging.NullHandler())


# The package version is kept in the VERSION file at the package root.
__version__ = (PACKAGE_ROOT / "VERSION").read_text().strip()
51 changes: 51 additions & 0 deletions assignment-section-05/classification_model/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Package Overview
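# NOTE(review): this value looks copied from the regression project; the
# package directory here is classification_model. Confirm before changing,
# since the value is used as the package logger name.
package_name: regression_model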

# Data Files
raw_data_file: raw.csv
training_data_file: train.csv
test_data_file: test.csv

# Variables
# The variable we are attempting to predict (passenger survival)
target: survived

pipeline_name: titanic_classification_model
pipeline_save_file: titanic_classification_model_output_v

features:
- pclass
- sex
- age
- sibsp
- parch
- fare
- cabin
- embarked
- title # generated from name

# set train/test split
test_size: 0.1

# to set the random seed
random_state: 0

unused_fields:
- name
- ticket
- boat
- body
- home.dest

numerical_vars:
- age
- fare

categorical_vars:
- sex
- cabin
- embarked
- title

cabin_vars:
- cabin
Empty file.
84 changes: 84 additions & 0 deletions assignment-section-05/classification_model/config/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from pathlib import Path
from typing import Optional, Sequence

from pydantic import BaseModel
from strictyaml import YAML, load

import classification_model

# Project directories, all derived from where the package itself is located.
PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
ROOT = PACKAGE_ROOT.parent
CONFIG_FILE_PATH = PACKAGE_ROOT.joinpath("config.yml")
DATASET_DIR = PACKAGE_ROOT.joinpath("datasets")
TRAINED_MODEL_DIR = PACKAGE_ROOT.joinpath("trained_models")


class AppConfig(BaseModel):
    """
    Application-level config.

    Built from the same flat YAML mapping as ``ModelConfig`` (see
    ``create_and_validate_config``), so the mapping also carries keys that
    are not declared here.
    NOTE(review): this presumably relies on pydantic ignoring undeclared
    keys -- confirm against the pinned pydantic version.
    """

    # Name handed to ``logging.getLogger`` for the package-level logger.
    package_name: str
    # File name of the raw dataset (``raw.csv`` in the shipped config.yml).
    raw_data_file: str
    # Prefix of the persisted pipeline file name; predict.py appends the
    # package version and ".pkl" to it.
    pipeline_save_file: str


class ModelConfig(BaseModel):
    """
    All configuration relevant to model
    training and feature engineering.

    Values come from config.yml; the comments below describe how the visible
    pipeline and prediction code use each field.
    """

    # Name of the column the model predicts (``survived`` in config.yml).
    target: str
    # Columns listed under ``unused_fields`` in config.yml.
    # NOTE(review): presumably dropped while preparing the data -- the code
    # that consumes this field is not visible here.
    unused_fields: Sequence[str]
    # Columns selected from the validated input before calling ``predict``.
    features: Sequence[str]
    # Train/test split setting from config.yml (0.1 in the shipped file).
    test_size: float
    # Random seed from config.yml.
    random_state: int
    # Variables that get a missing indicator and median imputation.
    numerical_vars: Sequence[str]
    # Variables that are imputed, rare-label encoded and one-hot encoded.
    categorical_vars: Sequence[str]
    # Variables passed to ``ExtractLetterTransformer``.
    cabin_vars: Sequence[str]


class Config(BaseModel):
    """Master config object.

    Groups the application-level and model-level settings so the rest of the
    package can import a single ``config`` instance.
    """

    # Package-wide settings (logger name, data file, pipeline file prefix).
    app_config: AppConfig
    # Training and feature-engineering settings.
    # NOTE(review): pydantic v2 reserves the attribute name ``model_config``
    # for model settings -- confirm the pinned pydantic version before
    # upgrading, because callers read ``config.model_config``.
    model_config: ModelConfig


def find_config_file() -> Path:
    """Locate the package configuration file.

    Returns:
        ``CONFIG_FILE_PATH`` when it points at an existing regular file.

    Raises:
        FileNotFoundError: If there is no file at ``CONFIG_FILE_PATH``.
    """
    if not CONFIG_FILE_PATH.is_file():
        # FileNotFoundError is a subclass of Exception, so callers that catch
        # ``Exception`` keep working, while others can catch it precisely.
        raise FileNotFoundError(f"Config not found at {CONFIG_FILE_PATH!r}")
    return CONFIG_FILE_PATH


def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML:
    """Parse the YAML file containing the package configuration.

    Args:
        cfg_path: Location of the YAML file. When omitted (or falsy), the
            path returned by ``find_config_file`` is used instead.

    Returns:
        The document produced by ``strictyaml.load``.

    Raises:
        FileNotFoundError: If no file exists at the resolved path.
    """
    if not cfg_path:
        cfg_path = find_config_file()

    # A ``Path`` is always truthy, so a truthiness test cannot detect a
    # missing file; the absence is detected when the file is opened.
    try:
        conf_file = open(cfg_path, "r")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Did not find config file at path: {cfg_path}"
        ) from err

    with conf_file:
        return load(conf_file.read())


def create_and_validate_config(parsed_config: Optional[YAML] = None) -> Config:
    """Build the validated ``Config`` object from parsed YAML.

    Args:
        parsed_config: A parsed strictyaml document. When ``None``, the
            package's own config file is read via ``fetch_config_from_yaml``.

    Returns:
        A ``Config`` whose ``app_config`` and ``model_config`` are both
        populated from the same flat mapping.

    Raises:
        Whatever pydantic raises when a required key is missing or a value
        has the wrong type (its ``ValidationError``).
    """
    if parsed_config is None:
        parsed_config = fetch_config_from_yaml()

    # ``.data`` is the plain-Python view of the strictyaml document. Both
    # sub-models receive the full mapping and pick out the keys they declare.
    settings = parsed_config.data
    return Config(
        app_config=AppConfig(**settings),
        model_config=ModelConfig(**settings),
    )


# Validated once at import time and shared by the rest of the package.
config = create_and_validate_config()
Empty file.
64 changes: 64 additions & 0 deletions assignment-section-05/classification_model/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# for encoding categorical variables
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder

# for imputation
from feature_engine.imputation import (
AddMissingIndicator,
CategoricalImputer,
MeanMedianImputer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from classification_model.config.core import config
from classification_model.processing.features import ExtractLetterTransformer

# Feature groups declared in config.yml, pulled out once for readability.
_categorical_vars = config.model_config.categorical_vars
_numerical_vars = config.model_config.numerical_vars
_cabin_vars = config.model_config.cabin_vars

# Each entry is a (step name, estimator) pair; the steps run in list order.
_pipeline_steps = [
    # Impute the categorical variables using the "missing" method.
    (
        "categorical_imputation",
        CategoricalImputer(
            imputation_method="missing",
            variables=_categorical_vars,
        ),
    ),
    # Flag missing values in the numerical variables with indicator columns.
    (
        "missing_indicator",
        AddMissingIndicator(variables=_numerical_vars),
    ),
    # Then fill the numerical variables with their median.
    (
        "median_imputation",
        MeanMedianImputer(imputation_method="median", variables=_numerical_vars),
    ),
    # Reduce each cabin value to its letter.
    (
        "extract_letter",
        ExtractLetterTransformer(variables=_cabin_vars),
    ),
    # == CATEGORICAL ENCODING ======
    # Categories present in less than 5% of the observations (tol=0.05)
    # are grouped into a single category called 'Rare'.
    (
        "rare_label_encoder",
        RareLabelEncoder(tol=0.05, n_categories=1, variables=_categorical_vars),
    ),
    # One-hot encode the categorical variables into k-1 columns.
    (
        "categorical_encoder",
        OneHotEncoder(drop_last=True, variables=_categorical_vars),
    ),
    # Scale the features before fitting the classifier.
    ("scaler", StandardScaler()),
    ("Logit", LogisticRegression(C=0.0005, random_state=0)),
]

titanic_pipe = Pipeline(_pipeline_steps)
34 changes: 34 additions & 0 deletions assignment-section-05/classification_model/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import typing as t

import pandas as pd

from classification_model import __version__ as _version
from classification_model.config.core import config
from classification_model.processing.data_manager import load_pipeline
from classification_model.processing.validation import validate_inputs

# Pickle file name: the configured prefix followed by the package version.
pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
# Loaded once at import time; make_prediction reuses this object on every call.
_titanic_pipe = load_pipeline(file_name=pipeline_file_name)


def make_prediction(
    *,
    input_data: t.Union[pd.DataFrame, dict],
) -> dict:
    """Run the saved model pipeline on ``input_data``.

    Args:
        input_data: Records to score, as a DataFrame or anything
            ``pd.DataFrame`` can be constructed from.

    Returns:
        A dict with the keys ``"predictions"``, ``"version"`` and
        ``"errors"``. ``"predictions"`` is ``None`` when input validation
        reported errors; otherwise it is the output of the pipeline's
        ``predict``.
    """
    frame = pd.DataFrame(input_data)
    validated_data, errors = validate_inputs(input_data=frame)

    # Only score the data when validation came back clean.
    predictions = None
    if not errors:
        model_inputs = validated_data[config.model_config.features]
        predictions = _titanic_pipe.predict(X=model_inputs)

    return {"predictions": predictions, "version": _version, "errors": errors}
Empty file.
Loading

0 comments on commit 5e6560c

Please sign in to comment.