From 5e6560c3b62fb251aff78edc521a779405464979 Mon Sep 17 00:00:00 2001
From: Christopher Samiullah <ChristopherGS@users.noreply.github.com>
Date: Sat, 15 Jan 2022 15:59:09 +0000
Subject: [PATCH] Section5 assignment (#789)

---
 .gitignore                                    |   1 +
 assignment-section-05/MANIFEST.in             |  18 +++
 assignment-section-05/README.md               |  16 +++
 .../classification_model/VERSION              |   1 +
 .../classification_model/__init__.py          |  17 +++
 .../classification_model/config.yml           |  51 +++++++++
 .../classification_model/config/__init__.py   |   0
 .../classification_model/config/core.py       |  84 ++++++++++++++
 .../classification_model/datasets/__init__.py |   0
 .../classification_model/pipeline.py          |  64 +++++++++++
 .../classification_model/predict.py           |  34 ++++++
 .../processing/__init__.py                    |   0
 .../processing/data_manager.py                | 105 ++++++++++++++++++
 .../processing/features.py                    |  26 +++++
 .../processing/validation.py                  |  46 ++++++++
 .../classification_model/train_pipeline.py    |  37 ++++++
 .../trained_models/__init__.py                |   0
 assignment-section-05/mypy.ini                |  14 +++
 assignment-section-05/pyproject.toml          |  49 ++++++++
 .../requirements/requirements.txt             |  11 ++
 .../requirements/test_requirements.txt        |  10 ++
 assignment-section-05/setup.py                |  69 ++++++++++++
 assignment-section-05/tests/__init__.py       |   0
 assignment-section-05/tests/conftest.py       |  26 +++++
 assignment-section-05/tests/test_features.py  |  16 +++
 .../tests/test_prediction.py                  |  27 +++++
 assignment-section-05/tox.ini                 |  76 +++++++++++++
 27 files changed, 798 insertions(+)
 create mode 100644 assignment-section-05/MANIFEST.in
 create mode 100644 assignment-section-05/README.md
 create mode 100644 assignment-section-05/classification_model/VERSION
 create mode 100644 assignment-section-05/classification_model/__init__.py
 create mode 100644 assignment-section-05/classification_model/config.yml
 create mode 100644 assignment-section-05/classification_model/config/__init__.py
 create mode 100644 assignment-section-05/classification_model/config/core.py
 create mode 100644 assignment-section-05/classification_model/datasets/__init__.py
 create mode 100644 assignment-section-05/classification_model/pipeline.py
 create mode 100644 assignment-section-05/classification_model/predict.py
 create mode 100644 assignment-section-05/classification_model/processing/__init__.py
 create mode 100644 assignment-section-05/classification_model/processing/data_manager.py
 create mode 100644 assignment-section-05/classification_model/processing/features.py
 create mode 100644 assignment-section-05/classification_model/processing/validation.py
 create mode 100644 assignment-section-05/classification_model/train_pipeline.py
 create mode 100644 assignment-section-05/classification_model/trained_models/__init__.py
 create mode 100644 assignment-section-05/mypy.ini
 create mode 100644 assignment-section-05/pyproject.toml
 create mode 100644 assignment-section-05/requirements/requirements.txt
 create mode 100644 assignment-section-05/requirements/test_requirements.txt
 create mode 100644 assignment-section-05/setup.py
 create mode 100644 assignment-section-05/tests/__init__.py
 create mode 100644 assignment-section-05/tests/conftest.py
 create mode 100644 assignment-section-05/tests/test_features.py
 create mode 100644 assignment-section-05/tests/test_prediction.py
 create mode 100644 assignment-section-05/tox.ini

diff --git a/.gitignore b/.gitignore
index ccf8ebdc0..31bb7d442 100644
--- a/.gitignore
+++ b/.gitignore
@@ -112,6 +112,7 @@ packages/regression_model/regression_model/datasets/*.zip
 packages/regression_model/regression_model/datasets/*.txt
 train.csv
 test.csv
+raw.csv
 data_description.txt
 house-prices-advanced-regression-techniques.zip
 sample_submission.csv
diff --git a/assignment-section-05/MANIFEST.in b/assignment-section-05/MANIFEST.in
new file mode 100644
index 000000000..f17c22c78
--- /dev/null
+++ b/assignment-section-05/MANIFEST.in
@@ -0,0 +1,18 @@
+include *.txt
+include *.md
+include *.pkl
+recursive-include ./classification_model/*
+
+include classification_model/datasets/train.csv
+include classification_model/datasets/test.csv
+include classification_model/trained_models/*.pkl
+include classification_model/VERSION
+include classification_model/config.yml
+
+include ./requirements/requirements.txt
+include ./requirements/test_requirements.txt
+exclude *.log
+exclude *.cfg
+
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
\ No newline at end of file
diff --git a/assignment-section-05/README.md b/assignment-section-05/README.md
new file mode 100644
index 000000000..2409cc1e1
--- /dev/null
+++ b/assignment-section-05/README.md
@@ -0,0 +1,16 @@
+# Productionized Titanic Classification Model Package
+
+## Run With Tox (Recommended)
+- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
+- Save the file as `raw.csv` in the classification_model/datasets directory
+- `pip install tox`
+- Make sure you are in the assignment-section-05 directory (where the tox.ini file is), then run the command: `tox` (this trains the model, then runs the tests, type checks, style checks and linting). The first time you run this it creates a virtual env and installs
+dependencies, so it takes a few minutes.
+
+## Run Without Tox
+- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
+- Save the file as `raw.csv` in the classification_model/datasets directory
+- Add assignment-section-05 *and* classification_model paths to your system PYTHONPATH
+- `pip install -r requirements/test_requirements.txt`
+- Train the model: `python classification_model/train_pipeline.py`
+- Run the tests: `pytest tests`
\ No newline at end of file
diff --git a/assignment-section-05/classification_model/VERSION b/assignment-section-05/classification_model/VERSION
new file mode 100644
index 000000000..8acdd82b7
--- /dev/null
+++ b/assignment-section-05/classification_model/VERSION
@@ -0,0 +1 @@
+0.0.1
diff --git a/assignment-section-05/classification_model/__init__.py b/assignment-section-05/classification_model/__init__.py
new file mode 100644
index 000000000..8cea86752
--- /dev/null
+++ b/assignment-section-05/classification_model/__init__.py
@@ -0,0 +1,17 @@
+import logging
+
+from classification_model.config.core import PACKAGE_ROOT, config
+
+# It is strongly advised that you do not add any handlers other than
+# NullHandler to your library’s loggers. This is because the configuration
+# of handlers is the prerogative of the application developer who uses your
+# library. The application developer knows their target audience and what
+# handlers are most appropriate for their application: if you add handlers
+# ‘under the hood’, you might well interfere with their ability to carry out
+# unit tests and deliver logs which suit their requirements.
+# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
+logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())
+
+
+with open(PACKAGE_ROOT / "VERSION") as version_file:
+    __version__ = version_file.read().strip()
diff --git a/assignment-section-05/classification_model/config.yml b/assignment-section-05/classification_model/config.yml
new file mode 100644
index 000000000..696a05035
--- /dev/null
+++ b/assignment-section-05/classification_model/config.yml
@@ -0,0 +1,51 @@
+# Package Overview (package_name is also the logger name used in __init__.py)
+package_name: classification_model
+
+# Data Files
+raw_data_file: raw.csv
+training_data_file: train.csv
+test_data_file: test.csv
+
+# Variables
+# The variable we are attempting to predict (passenger survival)
+target: survived
+
+pipeline_name: titanic_classification_model
+pipeline_save_file: titanic_classification_model_output_v
+
+features:
+  - pclass
+  - sex
+  - age
+  - sibsp
+  - parch
+  - fare
+  - cabin
+  - embarked
+  - title  # generated from name
+
+# set train/test split
+test_size: 0.1
+
+# to set the random seed
+random_state: 0
+
+unused_fields:
+  - name
+  - ticket
+  - boat
+  - body
+  - home.dest
+
+numerical_vars:
+  - age
+  - fare
+
+categorical_vars:
+  - sex
+  - cabin
+  - embarked
+  - title
+
+cabin_vars:
+  - cabin
\ No newline at end of file
diff --git a/assignment-section-05/classification_model/config/__init__.py b/assignment-section-05/classification_model/config/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/assignment-section-05/classification_model/config/core.py b/assignment-section-05/classification_model/config/core.py
new file mode 100644
index 000000000..3f39d64f0
--- /dev/null
+++ b/assignment-section-05/classification_model/config/core.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+from typing import Sequence
+
+from pydantic import BaseModel
+from strictyaml import YAML, load
+
+import classification_model
+
+# Project Directories
+PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
+ROOT = PACKAGE_ROOT.parent
+CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
+DATASET_DIR = PACKAGE_ROOT / "datasets"
+TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
+
+
+class AppConfig(BaseModel):
+    """
+    Application-level config.
+    """
+
+    package_name: str
+    raw_data_file: str
+    pipeline_save_file: str
+
+
+class ModelConfig(BaseModel):
+    """
+    All configuration relevant to model
+    training and feature engineering.
+    """
+
+    target: str
+    unused_fields: Sequence[str]
+    features: Sequence[str]
+    test_size: float
+    random_state: int
+    numerical_vars: Sequence[str]
+    categorical_vars: Sequence[str]
+    cabin_vars: Sequence[str]
+
+
+class Config(BaseModel):
+    """Master config object."""
+
+    app_config: AppConfig
+    model_config: ModelConfig
+
+
+def find_config_file() -> Path:
+    """Return CONFIG_FILE_PATH, raising FileNotFoundError if it is not a file."""
+    if CONFIG_FILE_PATH.is_file():
+        return CONFIG_FILE_PATH
+    raise FileNotFoundError(f"Config not found at {CONFIG_FILE_PATH!r}")
+
+
+def fetch_config_from_yaml(cfg_path: Path = None) -> YAML:
+    """Parse YAML containing the package configuration.
+
+    Falls back to the packaged config file when no path is given
+    (find_config_file raises if that file is missing); open() raises
+    OSError when an explicitly supplied path cannot be read.
+    """
+    if not cfg_path:
+        cfg_path = find_config_file()
+    with open(cfg_path, "r") as conf_file:
+        return load(conf_file.read())
+
+
+def create_and_validate_config(parsed_config: YAML = None) -> Config:
+    """Run validation on config values."""
+    if parsed_config is None:
+        parsed_config = fetch_config_from_yaml()
+
+    # specify the data attribute from the strictyaml YAML type.
+    _config = Config(
+        app_config=AppConfig(**parsed_config.data),
+        model_config=ModelConfig(**parsed_config.data),
+    )
+
+    return _config
+
+
+config = create_and_validate_config()
diff --git a/assignment-section-05/classification_model/datasets/__init__.py b/assignment-section-05/classification_model/datasets/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/assignment-section-05/classification_model/pipeline.py b/assignment-section-05/classification_model/pipeline.py
new file mode 100644
index 000000000..c20abd660
--- /dev/null
+++ b/assignment-section-05/classification_model/pipeline.py
@@ -0,0 +1,64 @@
+# for encoding categorical variables
+from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
+
+# for imputation
+from feature_engine.imputation import (
+    AddMissingIndicator,
+    CategoricalImputer,
+    MeanMedianImputer,
+)
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from classification_model.config.core import config
+from classification_model.processing.features import ExtractLetterTransformer
+
+titanic_pipe = Pipeline(
+    [
+        # impute categorical variables with string missing
+        (
+            "categorical_imputation",
+            CategoricalImputer(
+                imputation_method="missing",
+                variables=config.model_config.categorical_vars,
+            ),
+        ),
+        # add missing indicator to numerical variables
+        (
+            "missing_indicator",
+            AddMissingIndicator(variables=config.model_config.numerical_vars),
+        ),
+        # impute numerical variables with the median
+        (
+            "median_imputation",
+            MeanMedianImputer(
+                imputation_method="median", variables=config.model_config.numerical_vars
+            ),
+        ),
+        # Extract letter from cabin
+        (
+            "extract_letter",
+            ExtractLetterTransformer(variables=config.model_config.cabin_vars),
+        ),
+        # == CATEGORICAL ENCODING ======
+        # remove categories present in less than 5% of the observations (0.05)
+        # group them in one category called 'Rare'
+        (
+            "rare_label_encoder",
+            RareLabelEncoder(
+                tol=0.05, n_categories=1, variables=config.model_config.categorical_vars
+            ),
+        ),
+        # encode categorical variables using one hot encoding into k-1 variables
+        (
+            "categorical_encoder",
+            OneHotEncoder(
+                drop_last=True, variables=config.model_config.categorical_vars
+            ),
+        ),
+        # scale
+        ("scaler", StandardScaler()),
+        ("Logit", LogisticRegression(C=0.0005, random_state=0)),
+    ]
+)
diff --git a/assignment-section-05/classification_model/predict.py b/assignment-section-05/classification_model/predict.py
new file mode 100644
index 000000000..eb2990bb3
--- /dev/null
+++ b/assignment-section-05/classification_model/predict.py
@@ -0,0 +1,34 @@
+import typing as t
+
+import pandas as pd
+
+from classification_model import __version__ as _version
+from classification_model.config.core import config
+from classification_model.processing.data_manager import load_pipeline
+from classification_model.processing.validation import validate_inputs
+
+pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
+_titanic_pipe = load_pipeline(file_name=pipeline_file_name)
+
+
+def make_prediction(
+    *,
+    input_data: t.Union[pd.DataFrame, dict],
+) -> dict:
+    """Make a prediction using a saved model pipeline."""
+
+    data = pd.DataFrame(input_data)
+    validated_data, errors = validate_inputs(input_data=data)
+    results = {"predictions": None, "version": _version, "errors": errors}
+
+    if not errors:
+        predictions = _titanic_pipe.predict(
+            X=validated_data[config.model_config.features]
+        )
+        results = {
+            "predictions": predictions,
+            "version": _version,
+            "errors": errors,
+        }
+
+    return results
diff --git a/assignment-section-05/classification_model/processing/__init__.py b/assignment-section-05/classification_model/processing/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/assignment-section-05/classification_model/processing/data_manager.py b/assignment-section-05/classification_model/processing/data_manager.py
new file mode 100644
index 000000000..550eebdfc
--- /dev/null
+++ b/assignment-section-05/classification_model/processing/data_manager.py
@@ -0,0 +1,105 @@
+import logging
+import re
+from pathlib import Path
+from typing import Any, List, Union
+
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.pipeline import Pipeline
+
+from classification_model import __version__ as _version
+from classification_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config
+
+logger = logging.getLogger(__name__)
+
+
+# float type for np.nan
+def get_first_cabin(row: Any) -> Union[str, float]:
+    try:
+        return row.split()[0]
+    except AttributeError:
+        return np.nan
+
+
+def get_title(passenger: str) -> str:
+    """Extracts the title (Mr, Ms, etc) from the name variable."""
+    line = passenger
+    if re.search("Mrs", line):
+        return "Mrs"
+    elif re.search("Mr", line):
+        return "Mr"
+    elif re.search("Miss", line):
+        return "Miss"
+    elif re.search("Master", line):
+        return "Master"
+    else:
+        return "Other"
+
+
+def pre_pipeline_preparation(*, dataframe: pd.DataFrame) -> pd.DataFrame:
+    # replace question marks with NaN values
+    data = dataframe.replace("?", np.nan)
+
+    # retain only the first cabin if more than
+    # 1 are available per passenger
+    data["cabin"] = data["cabin"].apply(get_first_cabin)
+
+    data["title"] = data["name"].apply(get_title)
+
+    # cast numerical variables as floats
+    data["fare"] = data["fare"].astype("float")
+    data["age"] = data["age"].astype("float")
+
+    # drop unnecessary variables
+    data.drop(labels=config.model_config.unused_fields, axis=1, inplace=True)
+
+    return data
+
+
+def _load_raw_dataset(*, file_name: str) -> pd.DataFrame:
+    dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}"))
+    return dataframe
+
+
+def load_dataset(*, file_name: str) -> pd.DataFrame:
+    """Load ``file_name`` from DATASET_DIR and run pre-pipeline preparation."""
+    dataframe = _load_raw_dataset(file_name=file_name)
+
+    return pre_pipeline_preparation(dataframe=dataframe)
+
+
+def save_pipeline(*, pipeline_to_persist: Pipeline) -> None:
+    """Persist the pipeline.
+    Saves the versioned model, and overwrites any previous
+    saved models. This ensures that when the package is
+    published, there is only one trained model that can be
+    called, and we know exactly how it was built.
+    """
+
+    # Prepare versioned save file name
+    save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
+    save_path = TRAINED_MODEL_DIR / save_file_name
+
+    remove_old_pipelines(files_to_keep=[save_file_name])
+    joblib.dump(pipeline_to_persist, save_path)
+
+
+def load_pipeline(*, file_name: str) -> Pipeline:
+    """Load a persisted pipeline."""
+
+    file_path = TRAINED_MODEL_DIR / file_name
+    return joblib.load(filename=file_path)
+
+
+def remove_old_pipelines(*, files_to_keep: List[str]) -> None:
+    """
+    Remove old model pipelines, keeping ``files_to_keep`` and __init__.py.
+    This keeps a one-to-one mapping between the package version and the
+    model version used by other applications. Sub-directories such as
+    __pycache__ are skipped because Path.unlink() cannot remove them.
+    """
+    do_not_delete = files_to_keep + ["__init__.py"]
+    for model_file in TRAINED_MODEL_DIR.iterdir():
+        if model_file.is_file() and model_file.name not in do_not_delete:
+            model_file.unlink()
diff --git a/assignment-section-05/classification_model/processing/features.py b/assignment-section-05/classification_model/processing/features.py
new file mode 100644
index 000000000..fb7c629c8
--- /dev/null
+++ b/assignment-section-05/classification_model/processing/features.py
@@ -0,0 +1,26 @@
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class ExtractLetterTransformer(BaseEstimator, TransformerMixin):
+    """Replace each configured column with the first character of its values."""
+
+    def __init__(self, variables):
+
+        if not isinstance(variables, list):
+            raise ValueError("variables should be a list")
+
+        self.variables = variables
+
+    def fit(self, X, y=None):
+        # nothing is learned here; fit only exists so the sklearn pipeline can call it
+        return self
+
+    def transform(self, X):
+
+        # work on a copy so that we do not over-write the caller's dataframe
+        X = X.copy()
+
+        for feature in self.variables:
+            X[feature] = X[feature].str[0]
+
+        return X
diff --git a/assignment-section-05/classification_model/processing/validation.py b/assignment-section-05/classification_model/processing/validation.py
new file mode 100644
index 000000000..7ac1870b0
--- /dev/null
+++ b/assignment-section-05/classification_model/processing/validation.py
@@ -0,0 +1,46 @@
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, ValidationError
+
+from classification_model.config.core import config
+from classification_model.processing.data_manager import pre_pipeline_preparation
+
+
+def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]:
+    """Check model inputs for unprocessable values."""
+
+    pre_processed = pre_pipeline_preparation(dataframe=input_data)
+    validated_data = pre_processed[config.model_config.features].copy()
+    errors = None
+
+    try:
+        # replace numpy nans so that pydantic can validate
+        MultipleTitanicDataInputs(
+            inputs=validated_data.replace({np.nan: None}).to_dict(orient="records")
+        )
+    except ValidationError as error:
+        errors = error.json()
+
+    return validated_data, errors
+
+
+class TitanicDataInputSchema(BaseModel):
+    pclass: Optional[int]
+    name: Optional[str]
+    sex: Optional[str]
+    age: Optional[float]  # pre_pipeline_preparation casts age to float
+    sibsp: Optional[int]
+    parch: Optional[int]
+    ticket: Optional[int]
+    fare: Optional[float]
+    cabin: Optional[str]
+    embarked: Optional[str]
+    boat: Optional[Union[str, int]]
+    body: Optional[int]
+    # TODO: rename home.dest, can get away with it now as it is not used
+
+
+class MultipleTitanicDataInputs(BaseModel):
+    inputs: List[TitanicDataInputSchema]
diff --git a/assignment-section-05/classification_model/train_pipeline.py b/assignment-section-05/classification_model/train_pipeline.py
new file mode 100644
index 000000000..5c83a97f3
--- /dev/null
+++ b/assignment-section-05/classification_model/train_pipeline.py
@@ -0,0 +1,37 @@
+from sklearn.model_selection import train_test_split
+
+from classification_model.config.core import config
+from classification_model.pipeline import titanic_pipe
+from classification_model.processing.data_manager import load_dataset, save_pipeline
+
+
+def run_training() -> None:
+    """
+    Train the model.
+
+    Training data can be found here:
+    https://www.openml.org/data/get_csv/16826755/phpMYEkMl
+    """
+
+    # read training data
+    data = load_dataset(file_name=config.app_config.raw_data_file)
+
+    # divide train and test
+    X_train, X_test, y_train, y_test = train_test_split(
+        data[config.model_config.features],  # predictors
+        data[config.model_config.target],
+        test_size=config.model_config.test_size,
+        # we are setting the random seed here
+        # for reproducibility
+        random_state=config.model_config.random_state,
+    )
+
+    # fit model
+    titanic_pipe.fit(X_train, y_train)
+
+    # persist trained model
+    save_pipeline(pipeline_to_persist=titanic_pipe)
+
+
+if __name__ == "__main__":
+    run_training()
diff --git a/assignment-section-05/classification_model/trained_models/__init__.py b/assignment-section-05/classification_model/trained_models/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/assignment-section-05/mypy.ini b/assignment-section-05/mypy.ini
new file mode 100644
index 000000000..d6984fd7a
--- /dev/null
+++ b/assignment-section-05/mypy.ini
@@ -0,0 +1,14 @@
+[mypy]
+warn_unreachable = False
+warn_unused_ignores = True
+follow_imports = skip
+show_error_context = True
+warn_incomplete_stub = True
+ignore_missing_imports = True
+check_untyped_defs = True
+cache_dir = /dev/null
+# Allow defining functions without any types.
+disallow_untyped_defs = False
+warn_redundant_casts = True
+warn_unused_configs = True
+strict_optional = True
\ No newline at end of file
diff --git a/assignment-section-05/pyproject.toml b/assignment-section-05/pyproject.toml
new file mode 100644
index 000000000..945100376
--- /dev/null
+++ b/assignment-section-05/pyproject.toml
@@ -0,0 +1,49 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
+
+[tool.pytest.ini_options]
+minversion = "2.0"
+addopts = "-rfEX -p pytester --strict-markers"
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test", "Acceptance"]
+python_functions = ["test"]
+# NOTE: "doc" is not included here, but gets tested explicitly via "doctesting".
+testpaths = ["tests"]
+xfail_strict = true
+filterwarnings = [
+    "error",
+    "default:Using or importing the ABCs:DeprecationWarning:unittest2.*",
+    # produced by older pyparsing<=2.2.0.
+    "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*",
+    "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*",
+    # distutils is deprecated in 3.10, scheduled for removal in 3.12
+    "ignore:The distutils package is deprecated:DeprecationWarning",
+    # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)."
+    "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))",
+    # produced by pytest-xdist
+    "ignore:.*type argument to addoption.*:DeprecationWarning",
+    # produced on execnet (pytest-xdist)
+    "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning",
+    # pytest's own futurewarnings
+    "ignore::pytest.PytestExperimentalApiWarning",
+    # Do not cause SyntaxError for invalid escape sequences in py37.
+    # Those are caught/handled by pyupgrade, and not easy to filter with the
+    # module being the filename (with .py removed).
+    "default:invalid escape sequence:DeprecationWarning",
+    # ignore use of unregistered marks, because we use many to test the implementation
+    "ignore::_pytest.warning_types.PytestUnknownMarkWarning",
+]
+
+[tool.black]
+target-version = ['py36']
+
+[tool.isort]
+profile = "black"
+line_length = 100
+lines_between_sections = 1
+known_first_party = "classification_model"
+skip = "migrations"
diff --git a/assignment-section-05/requirements/requirements.txt b/assignment-section-05/requirements/requirements.txt
new file mode 100644
index 000000000..f24a342da
--- /dev/null
+++ b/assignment-section-05/requirements/requirements.txt
@@ -0,0 +1,11 @@
+# We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
+# to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
+# updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
+numpy>=1.20.0,<1.21.0
+pandas>=1.3.5,<1.4.0
+pydantic>=1.8.1,<1.9.0
+scikit-learn>=0.24.2,<0.25.0
+strictyaml>=1.3.2,<1.4.0
+ruamel.yaml==0.16.12
+feature-engine>=1.0.2,<1.3.0
+joblib>=1.0.1,<1.1.0
\ No newline at end of file
diff --git a/assignment-section-05/requirements/test_requirements.txt b/assignment-section-05/requirements/test_requirements.txt
new file mode 100644
index 000000000..538bc0f28
--- /dev/null
+++ b/assignment-section-05/requirements/test_requirements.txt
@@ -0,0 +1,10 @@
+-r requirements.txt
+
+# testing requirements
+pytest>=6.2.3,<6.3.0
+
+# repo maintenance tooling
+black==20.8b1
+flake8>=3.9.0,<3.10.0
+mypy==0.812
+isort==5.8.0
diff --git a/assignment-section-05/setup.py b/assignment-section-05/setup.py
new file mode 100644
index 000000000..68a0c2796
--- /dev/null
+++ b/assignment-section-05/setup.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+# Package meta-data.
+NAME = 'tid-titanic-classification-model'
+DESCRIPTION = "Example Titanic dataset classification model package from Train In Data."
+URL = "https://github.com/trainindata/deploying-machine-learning-models"
+EMAIL = "christopher.samiullah@protonmail.com"
+AUTHOR = "ChristopherGS"
+REQUIRES_PYTHON = ">=3.7.0"
+
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# Trove Classifiers: https://pypi.org/classifiers/
+# If you do change the License, remember to change the
+# Trove Classifier for that!
+long_description = DESCRIPTION
+
+# Load the package's VERSION file as a dictionary.
+about = {}
+ROOT_DIR = Path(__file__).resolve().parent
+REQUIREMENTS_DIR = ROOT_DIR / 'requirements'
+PACKAGE_DIR = ROOT_DIR / 'classification_model'
+with open(PACKAGE_DIR / "VERSION") as f:
+    _version = f.read().strip()
+    about["__version__"] = _version
+
+
+# What packages are required for this module to be executed?
+def list_reqs(fname="requirements.txt"):
+    with open(REQUIREMENTS_DIR / fname) as fd:
+        return fd.read().splitlines()
+
+# Where the magic happens:
+setup(
+    name=NAME,
+    version=about["__version__"],
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author=AUTHOR,
+    author_email=EMAIL,
+    python_requires=REQUIRES_PYTHON,
+    url=URL,
+    packages=find_packages(exclude=("tests",)),
+    package_data={"classification_model": ["VERSION"]},
+    install_requires=list_reqs(),
+    extras_require={},
+    include_package_data=True,
+    license="BSD-3",
+    classifiers=[
+        # Trove classifiers (the license entry must agree with license= above)
+        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: Implementation :: CPython",
+        "Programming Language :: Python :: Implementation :: PyPy",
+    ],
+)
\ No newline at end of file
diff --git a/assignment-section-05/tests/__init__.py b/assignment-section-05/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/assignment-section-05/tests/conftest.py b/assignment-section-05/tests/conftest.py
new file mode 100644
index 000000000..8e3fd46ad
--- /dev/null
+++ b/assignment-section-05/tests/conftest.py
@@ -0,0 +1,26 @@
+import logging
+
+import pytest
+from sklearn.model_selection import train_test_split
+
+from classification_model.config.core import config
+from classification_model.processing.data_manager import _load_raw_dataset
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture
+def sample_input_data():
+    data = _load_raw_dataset(file_name=config.app_config.raw_data_file)
+
+    # divide train and test
+    X_train, X_test, y_train, y_test = train_test_split(
+        data,  # predictors
+        data[config.model_config.target],
+        test_size=config.model_config.test_size,
+        # we are setting the random seed here
+        # for reproducibility
+        random_state=config.model_config.random_state,
+    )
+
+    return X_test
diff --git a/assignment-section-05/tests/test_features.py b/assignment-section-05/tests/test_features.py
new file mode 100644
index 000000000..c3f88101b
--- /dev/null
+++ b/assignment-section-05/tests/test_features.py
@@ -0,0 +1,16 @@
+from classification_model.config.core import config
+from classification_model.processing.features import ExtractLetterTransformer
+
+
+def test_temporal_variable_transformer(sample_input_data):
+    # Given
+    transformer = ExtractLetterTransformer(
+        variables=config.model_config.cabin_vars,  # cabin
+    )
+    assert sample_input_data["cabin"].iat[6] == "E12"
+
+    # When
+    subject = transformer.fit_transform(sample_input_data)
+
+    # Then
+    assert subject["cabin"].iat[6] == "E"
diff --git a/assignment-section-05/tests/test_prediction.py b/assignment-section-05/tests/test_prediction.py
new file mode 100644
index 000000000..76965698a
--- /dev/null
+++ b/assignment-section-05/tests/test_prediction.py
@@ -0,0 +1,27 @@
+"""
+Note: These tests will fail if you have not first trained the model.
+"""
+
+import numpy as np
+from sklearn.metrics import accuracy_score
+
+from classification_model.predict import make_prediction
+
+
+def test_make_prediction(sample_input_data):
+    # Given
+    expected_no_predictions = 131
+
+    # When
+    result = make_prediction(input_data=sample_input_data)
+
+    # Then
+    predictions = result.get("predictions")
+    assert isinstance(predictions, np.ndarray)
+    assert isinstance(predictions[0], np.int64)
+    assert result.get("errors") is None
+    assert len(predictions) == expected_no_predictions
+    _predictions = list(predictions)
+    y_true = sample_input_data["survived"]
+    accuracy = accuracy_score(_predictions, y_true)
+    assert accuracy > 0.7
diff --git a/assignment-section-05/tox.ini b/assignment-section-05/tox.ini
new file mode 100644
index 000000000..a51831c34
--- /dev/null
+++ b/assignment-section-05/tox.ini
@@ -0,0 +1,76 @@
+# Tox is a generic virtualenv management and test command line tool. Its goal is to
+# standardize testing in Python. We will be using it extensively in this course.
+
+# Using Tox we can (on multiple operating systems):
+# + Eliminate PYTHONPATH challenges when running scripts/tests
+# + Eliminate virtualenv setup confusion
+# + Streamline steps such as model training, model publishing
+
+
+[tox]
+envlist = test_package, typechecks, stylechecks, lint
+skipsdist = True
+
+[testenv]
+install_command = pip install {opts} {packages}
+
+[testenv:test_package]
+deps =
+	-rrequirements/test_requirements.txt
+
+setenv =
+	PYTHONPATH=.
+	PYTHONHASHSEED=0
+
+commands=
+	python classification_model/train_pipeline.py
+	pytest \
+	-s \
+	-vv \
+	{posargs:tests/}
+
+[testenv:train]
+envdir = {toxworkdir}/test_package
+deps =
+	{[testenv:test_package]deps}
+
+setenv =
+	{[testenv:test_package]setenv}
+
+commands=
+	python classification_model/train_pipeline.py
+
+
+[testenv:typechecks]
+envdir = {toxworkdir}/test_package
+
+deps =
+	{[testenv:test_package]deps}
+
+commands = {posargs:mypy classification_model}
+
+
+[testenv:stylechecks]
+envdir = {toxworkdir}/test_package
+
+deps =
+	{[testenv:test_package]deps}
+
+commands = {posargs:flake8 classification_model tests}
+
+
+[testenv:lint]
+envdir = {toxworkdir}/test_package
+
+deps =
+	{[testenv:test_package]deps}
+
+commands =
+	isort classification_model tests
+	black classification_model tests
+	mypy classification_model
+	flake8 classification_model
+
+[flake8]
+exclude = .git,env
+max-line-length = 90
\ No newline at end of file