Section5 assignment (#789)

trainindata · Jan 15, 2022 · 5e6560c · 5e6560c
1 parent 08bfc7a
commit 5e6560c
Show file tree

Hide file tree

Showing 27 changed files with 798 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -112,6 +112,7 @@ packages/regression_model/regression_model/datasets/*.zip
 packages/regression_model/regression_model/datasets/*.txt
 train.csv
 test.csv
+raw.csv
 data_description.txt
 house-prices-advanced-regression-techniques.zip
 sample_submission.csv

diff --git a/assignment-section-05/MANIFEST.in b/assignment-section-05/MANIFEST.in
@@ -0,0 +1,18 @@
+include *.txt
+include *.md
+include *.pkl
+# recursive-include takes a directory followed by one or more file patterns
+recursive-include classification_model *
+
+include classification_model/datasets/train.csv
+include classification_model/datasets/test.csv
+include classification_model/trained_models/*.pkl
+include classification_model/VERSION
+include classification_model/config.yml
+
+include ./requirements/requirements.txt
+include ./requirements/test_requirements.txt
+exclude *.log
+exclude *.cfg
+
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
diff --git a/assignment-section-05/README.md b/assignment-section-05/README.md
@@ -0,0 +1,16 @@
+# Productionized Titanic Classification Model Package
+
+## Run With Tox (Recommended)
+- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
+- Save the file as `raw.csv` in the classification_model/datasets directory
+- `pip install tox`
+- Make sure you are in the assignment-section-05 directory (where the tox.ini file is), then run the command: `tox`. This runs the tests and type checks, and trains the model under the hood. The first time you run it, tox creates a virtual env and installs
+dependencies, so it takes a few minutes.
+
+## Run Without Tox
+- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
+- Save the file as `raw.csv` in the classification_model/datasets directory
+- Add assignment-section-05 *and* classification_model paths to your system PYTHONPATH
+- `pip install -r requirements/test_requirements.txt`
+- Train the model: `python classification_model/train_pipeline.py`
+- Run the tests: `pytest tests`
diff --git a/assignment-section-05/classification_model/VERSION b/assignment-section-05/classification_model/VERSION
@@ -0,0 +1 @@
+0.0.1
diff --git a/assignment-section-05/classification_model/__init__.py b/assignment-section-05/classification_model/__init__.py
@@ -0,0 +1,17 @@
+import logging
+
+from classification_model.config.core import PACKAGE_ROOT, config
+
+# It is strongly advised that you do not add any handlers other than
+# NullHandler to your library’s loggers. This is because the configuration
+# of handlers is the prerogative of the application developer who uses your
+# library. The application developer knows their target audience and what
+# handlers are most appropriate for their application: if you add handlers
+# ‘under the hood’, you might well interfere with their ability to carry out
+# unit tests and deliver logs which suit their requirements.
+# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
+logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())
+
+
+with open(PACKAGE_ROOT / "VERSION") as version_file:
+    __version__ = version_file.read().strip()
diff --git a/assignment-section-05/classification_model/config.yml b/assignment-section-05/classification_model/config.yml
@@ -0,0 +1,51 @@
+# Package Overview
+# Name of the Python package; also used as the name of the package logger.
+package_name: classification_model
+
+# Data Files
+raw_data_file: raw.csv
+training_data_file: train.csv
+test_data_file: test.csv
+
+# Variables
+# The variable we are attempting to predict (passenger survival)
+target: survived
+
+# Name of the pipeline and file-name prefix of the saved pipeline
+# (the package version and ".pkl" are appended at load time)
+pipeline_name: titanic_classification_model
+pipeline_save_file: titanic_classification_model_output_v
+
+# Columns handed to the pipeline at prediction time
+features:
+  - pclass
+  - sex
+  - age
+  - sibsp
+  - parch
+  - fare
+  - cabin
+  - embarked
+  - title  # generated from name
+
+# set train/test split
+test_size: 0.1
+
+# to set the random seed
+random_state: 0
+
+unused_fields:
+  - name
+  - ticket
+  - boat
+  - body
+  - home.dest
+
+# Numerical variables: missing indicator added, then median-imputed
+numerical_vars:
+  - age
+  - fare
+
+# Categorical variables: imputed, rare-label grouped and one-hot encoded
+categorical_vars:
+  - sex
+  - cabin
+  - embarked
+  - title
+
+# Variables reduced to a letter by the ExtractLetterTransformer step
+cabin_vars:
+  - cabin
diff --git a/assignment-section-05/classification_model/config/__init__.py b/assignment-section-05/classification_model/config/__init__.py
diff --git a/assignment-section-05/classification_model/config/core.py b/assignment-section-05/classification_model/config/core.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+from typing import Optional, Sequence
+
+from pydantic import BaseModel
+from strictyaml import YAML, load
+
+import classification_model
+
+# Project Directories
+PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
+ROOT = PACKAGE_ROOT.parent
+CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
+DATASET_DIR = PACKAGE_ROOT / "datasets"
+TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
+
+
+class AppConfig(BaseModel):
+    """
+    Application-level config.
+
+    Package-wide settings that are not specific to model training.
+    """
+
+    # name under which the package logger is registered
+    package_name: str
+    # file name of the raw dataset
+    raw_data_file: str
+    # file-name prefix of the saved pipeline; the package version and
+    # ".pkl" are appended when the pipeline is loaded for prediction
+    pipeline_save_file: str
+
+
+class ModelConfig(BaseModel):
+    """
+    All configuration relevant to model
+    training and feature engineering.
+    """
+
+    # name of the column the model predicts
+    target: str
+    # presumably raw-data columns dropped before modelling — confirm in the
+    # data loading code
+    unused_fields: Sequence[str]
+    # columns passed to the pipeline when predicting
+    features: Sequence[str]
+    # train/test split setting
+    test_size: float
+    # random seed
+    random_state: int
+    # variables given to the missing-indicator and median-imputation steps
+    numerical_vars: Sequence[str]
+    # variables given to the categorical imputation and encoding steps
+    categorical_vars: Sequence[str]
+    # variables given to the ExtractLetterTransformer step
+    cabin_vars: Sequence[str]
+
+
+class Config(BaseModel):
+    """Master config object.
+
+    Groups the application-level and model-level settings in one object.
+    """
+
+    app_config: AppConfig
+    # NOTE(review): `model_config` is a reserved attribute name in pydantic v2;
+    # this field assumes pydantic v1 is pinned — confirm in the requirements.
+    model_config: ModelConfig
+
+
+def find_config_file() -> Path:
+    """Locate the configuration file."""
+    if CONFIG_FILE_PATH.is_file():
+        return CONFIG_FILE_PATH
+    raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}")
+
+
+def fetch_config_from_yaml(cfg_path: Path = None) -> YAML:
+    """Parse YAML containing the package configuration."""
+
+    if not cfg_path:
+        cfg_path = find_config_file()
+
+    if cfg_path:
+        with open(cfg_path, "r") as conf_file:
+            parsed_config = load(conf_file.read())
+            return parsed_config
+    raise OSError(f"Did not find config file at path: {cfg_path}")
+
+
+def create_and_validate_config(parsed_config: YAML = None) -> Config:
+    """Run validation on config values."""
+    if parsed_config is None:
+        parsed_config = fetch_config_from_yaml()
+
+    # specify the data attribute from the strictyaml YAML type.
+    _config = Config(
+        app_config=AppConfig(**parsed_config.data),
+        model_config=ModelConfig(**parsed_config.data),
+    )
+
+    return _config
+
+
+config = create_and_validate_config()
diff --git a/assignment-section-05/classification_model/datasets/__init__.py b/assignment-section-05/classification_model/datasets/__init__.py
diff --git a/assignment-section-05/classification_model/pipeline.py b/assignment-section-05/classification_model/pipeline.py
@@ -0,0 +1,64 @@
+# for encoding categorical variables
+from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
+
+# for imputation
+from feature_engine.imputation import (
+    AddMissingIndicator,
+    CategoricalImputer,
+    MeanMedianImputer,
+)
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from classification_model.config.core import config
+from classification_model.processing.features import ExtractLetterTransformer
+
+titanic_pipe = Pipeline(
+    [
+        # impute categorical variables with string missing
+        (
+            "categorical_imputation",
+            CategoricalImputer(
+                imputation_method="missing",
+                variables=config.model_config.categorical_vars,
+            ),
+        ),
+        # add missing indicator to numerical variables
+        (
+            "missing_indicator",
+            AddMissingIndicator(variables=config.model_config.numerical_vars),
+        ),
+        # impute numerical variables with the median
+        (
+            "median_imputation",
+            MeanMedianImputer(
+                imputation_method="median", variables=config.model_config.numerical_vars
+            ),
+        ),
+        # Extract letter from cabin
+        (
+            "extract_letter",
+            ExtractLetterTransformer(variables=config.model_config.cabin_vars),
+        ),
+        # == CATEGORICAL ENCODING ======
+        # remove categories present in less than 5% of the observations (0.05)
+        # group them in one category called 'Rare'
+        (
+            "rare_label_encoder",
+            RareLabelEncoder(
+                tol=0.05, n_categories=1, variables=config.model_config.categorical_vars
+            ),
+        ),
+        # encode categorical variables using one hot encoding into k-1 variables
+        (
+            "categorical_encoder",
+            OneHotEncoder(
+                drop_last=True, variables=config.model_config.categorical_vars
+            ),
+        ),
+        # scale
+        ("scaler", StandardScaler()),
+        ("Logit", LogisticRegression(C=0.0005, random_state=0)),
+    ]
+)
diff --git a/assignment-section-05/classification_model/predict.py b/assignment-section-05/classification_model/predict.py
@@ -0,0 +1,34 @@
+import typing as t
+
+import pandas as pd
+
+from classification_model import __version__ as _version
+from classification_model.config.core import config
+from classification_model.processing.data_manager import load_pipeline
+from classification_model.processing.validation import validate_inputs
+
+# The saved pipeline file name is the configured prefix followed by the
+# package version; the pipeline is loaded once, when this module is imported.
+pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
+_titanic_pipe = load_pipeline(file_name=pipeline_file_name)
+
+
+def make_prediction(
+    *,
+    input_data: t.Union[pd.DataFrame, dict],
+) -> dict:
+    """Make a prediction using a saved model pipeline."""
+
+    data = pd.DataFrame(input_data)
+    validated_data, errors = validate_inputs(input_data=data)
+    results = {"predictions": None, "version": _version, "errors": errors}
+
+    if not errors:
+        predictions = _titanic_pipe.predict(
+            X=validated_data[config.model_config.features]
+        )
+        results = {
+            "predictions": predictions,
+            "version": _version,
+            "errors": errors,
+        }
+
+    return results
diff --git a/assignment-section-05/classification_model/processing/__init__.py b/assignment-section-05/classification_model/processing/__init__.py