diff --git a/.gitignore b/.gitignore index ccf8ebdc0..31bb7d442 100644 --- a/.gitignore +++ b/.gitignore @@ -112,6 +112,7 @@ packages/regression_model/regression_model/datasets/*.zip packages/regression_model/regression_model/datasets/*.txt train.csv test.csv +raw.csv data_description.txt house-prices-advanced-regression-techniques.zip sample_submission.csv diff --git a/assignment-section-05/MANIFEST.in b/assignment-section-05/MANIFEST.in new file mode 100644 index 000000000..f17c22c78 --- /dev/null +++ b/assignment-section-05/MANIFEST.in @@ -0,0 +1,18 @@ +include *.txt +include *.md +include *.pkl +recursive-include ./classification_model/* + +include classification_model/datasets/train.csv +include classification_model/datasets/test.csv +include classification_model/trained_models/*.pkl +include classification_model/VERSION +include classification_model/config.yml + +include ./requirements/requirements.txt +include ./requirements/test_requirements.txt +exclude *.log +exclude *.cfg + +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] \ No newline at end of file diff --git a/assignment-section-05/README.md b/assignment-section-05/README.md new file mode 100644 index 000000000..2409cc1e1 --- /dev/null +++ b/assignment-section-05/README.md @@ -0,0 +1,16 @@ +# Productionized Titanic Classification Model Package + +## Run With Tox (Recommended) +- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl +- Save the file as `raw.csv` in the classification_model/datasets directory +- `pip install tox` +- Make sure you are in the assignment-section-05 directory (where the tox.ini file is) then run the command: `tox` (this runs the tests and typechecks, trains the model under the hood). The first time you run this it creates a virtual env and installs +dependencies, so takes a few minutes. 
+ ## Run Without Tox +- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl +- Save the file as `raw.csv` in the classification_model/datasets directory +- Add assignment-section-05 *and* classification_model paths to your system PYTHONPATH +- `pip install -r requirements/test_requirements.txt` +- Train the model: `python classification_model/train_pipeline.py` +- Run the tests: `pytest tests` \ No newline at end of file diff --git a/assignment-section-05/classification_model/VERSION b/assignment-section-05/classification_model/VERSION new file mode 100644 index 000000000..8acdd82b7 --- /dev/null +++ b/assignment-section-05/classification_model/VERSION @@ -0,0 +1 @@ +0.0.1 diff --git a/assignment-section-05/classification_model/__init__.py b/assignment-section-05/classification_model/__init__.py new file mode 100644 index 000000000..8cea86752 --- /dev/null +++ b/assignment-section-05/classification_model/__init__.py @@ -0,0 +1,17 @@ +import logging + +from classification_model.config.core import PACKAGE_ROOT, config + +# It is strongly advised that you do not add any handlers other than +# NullHandler to your library’s loggers. This is because the configuration +# of handlers is the prerogative of the application developer who uses your +# library. The application developer knows their target audience and what +# handlers are most appropriate for their application: if you add handlers +# ‘under the hood’, you might well interfere with their ability to carry out +# unit tests and deliver logs which suit their requirements. 
+# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library +logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler()) + + +with open(PACKAGE_ROOT / "VERSION") as version_file: + __version__ = version_file.read().strip() diff --git a/assignment-section-05/classification_model/config.yml b/assignment-section-05/classification_model/config.yml new file mode 100644 index 000000000..696a05035 --- /dev/null +++ b/assignment-section-05/classification_model/config.yml @@ -0,0 +1,51 @@ +# Package Overview +package_name: regression_model + +# Data Files +raw_data_file: raw.csv +training_data_file: train.csv +test_data_file: test.csv + +# Variables +# The variable we are attempting to predict (passenger survival) +target: survived + +pipeline_name: titanic_classification_model +pipeline_save_file: titanic_classification_model_output_v + +features: + - pclass + - sex + - age + - sibsp + - parch + - fare + - cabin + - embarked + - title # generated from name + +# set train/test split +test_size: 0.1 + +# to set the random seed +random_state: 0 + +unused_fields: + - name + - ticket + - boat + - body + - home.dest + +numerical_vars: + - age + - fare + +categorical_vars: + - sex + - cabin + - embarked + - title + +cabin_vars: + - cabin \ No newline at end of file diff --git a/assignment-section-05/classification_model/config/__init__.py b/assignment-section-05/classification_model/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/assignment-section-05/classification_model/config/core.py b/assignment-section-05/classification_model/config/core.py new file mode 100644 index 000000000..3f39d64f0 --- /dev/null +++ b/assignment-section-05/classification_model/config/core.py @@ -0,0 +1,84 @@ +from pathlib import Path +from typing import Sequence + +from pydantic import BaseModel +from strictyaml import YAML, load + +import classification_model + +# Project Directories +PACKAGE_ROOT = 
Path(classification_model.__file__).resolve().parent +ROOT = PACKAGE_ROOT.parent +CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml" +DATASET_DIR = PACKAGE_ROOT / "datasets" +TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" + + +class AppConfig(BaseModel): + """ + Application-level config. + """ + + package_name: str + raw_data_file: str + pipeline_save_file: str + + +class ModelConfig(BaseModel): + """ + All configuration relevant to model + training and feature engineering. + """ + + target: str + unused_fields: Sequence[str] + features: Sequence[str] + test_size: float + random_state: int + numerical_vars: Sequence[str] + categorical_vars: Sequence[str] + cabin_vars: Sequence[str] + + +class Config(BaseModel): + """Master config object.""" + + app_config: AppConfig + model_config: ModelConfig + + +def find_config_file() -> Path: + """Locate the configuration file.""" + if CONFIG_FILE_PATH.is_file(): + return CONFIG_FILE_PATH + raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}") + + +def fetch_config_from_yaml(cfg_path: Path = None) -> YAML: + """Parse YAML containing the package configuration.""" + + if not cfg_path: + cfg_path = find_config_file() + + if cfg_path: + with open(cfg_path, "r") as conf_file: + parsed_config = load(conf_file.read()) + return parsed_config + raise OSError(f"Did not find config file at path: {cfg_path}") + + +def create_and_validate_config(parsed_config: YAML = None) -> Config: + """Run validation on config values.""" + if parsed_config is None: + parsed_config = fetch_config_from_yaml() + + # specify the data attribute from the strictyaml YAML type. 
+ _config = Config( + app_config=AppConfig(**parsed_config.data), + model_config=ModelConfig(**parsed_config.data), + ) + + return _config + + +config = create_and_validate_config() diff --git a/assignment-section-05/classification_model/datasets/__init__.py b/assignment-section-05/classification_model/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/assignment-section-05/classification_model/pipeline.py b/assignment-section-05/classification_model/pipeline.py new file mode 100644 index 000000000..c20abd660 --- /dev/null +++ b/assignment-section-05/classification_model/pipeline.py @@ -0,0 +1,64 @@ +# for encoding categorical variables +from feature_engine.encoding import OneHotEncoder, RareLabelEncoder + +# for imputation +from feature_engine.imputation import ( + AddMissingIndicator, + CategoricalImputer, + MeanMedianImputer, +) +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from classification_model.config.core import config +from classification_model.processing.features import ExtractLetterTransformer + +titanic_pipe = Pipeline( + [ + # impute categorical variables with string missing + ( + "categorical_imputation", + CategoricalImputer( + imputation_method="missing", + variables=config.model_config.categorical_vars, + ), + ), + # add missing indicator to numerical variables + ( + "missing_indicator", + AddMissingIndicator(variables=config.model_config.numerical_vars), + ), + # impute numerical variables with the median + ( + "median_imputation", + MeanMedianImputer( + imputation_method="median", variables=config.model_config.numerical_vars + ), + ), + # Extract letter from cabin + ( + "extract_letter", + ExtractLetterTransformer(variables=config.model_config.cabin_vars), + ), + # == CATEGORICAL ENCODING ====== + # remove categories present in less than 5% of the observations (0.05) + # group them in one category called 'Rare' + ( + 
"rare_label_encoder", + RareLabelEncoder( + tol=0.05, n_categories=1, variables=config.model_config.categorical_vars + ), + ), + # encode categorical variables using one hot encoding into k-1 variables + ( + "categorical_encoder", + OneHotEncoder( + drop_last=True, variables=config.model_config.categorical_vars + ), + ), + # scale + ("scaler", StandardScaler()), + ("Logit", LogisticRegression(C=0.0005, random_state=0)), + ] +) diff --git a/assignment-section-05/classification_model/predict.py b/assignment-section-05/classification_model/predict.py new file mode 100644 index 000000000..eb2990bb3 --- /dev/null +++ b/assignment-section-05/classification_model/predict.py @@ -0,0 +1,34 @@ +import typing as t + +import pandas as pd + +from classification_model import __version__ as _version +from classification_model.config.core import config +from classification_model.processing.data_manager import load_pipeline +from classification_model.processing.validation import validate_inputs + +pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" +_titanic_pipe = load_pipeline(file_name=pipeline_file_name) + + +def make_prediction( + *, + input_data: t.Union[pd.DataFrame, dict], +) -> dict: + """Make a prediction using a saved model pipeline.""" + + data = pd.DataFrame(input_data) + validated_data, errors = validate_inputs(input_data=data) + results = {"predictions": None, "version": _version, "errors": errors} + + if not errors: + predictions = _titanic_pipe.predict( + X=validated_data[config.model_config.features] + ) + results = { + "predictions": predictions, + "version": _version, + "errors": errors, + } + + return results diff --git a/assignment-section-05/classification_model/processing/__init__.py b/assignment-section-05/classification_model/processing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/assignment-section-05/classification_model/processing/data_manager.py 
b/assignment-section-05/classification_model/processing/data_manager.py new file mode 100644 index 000000000..550eebdfc --- /dev/null +++ b/assignment-section-05/classification_model/processing/data_manager.py @@ -0,0 +1,105 @@ +import logging +import re +from pathlib import Path +from typing import Any, List, Union + +import joblib +import numpy as np +import pandas as pd +from sklearn.pipeline import Pipeline + +from classification_model import __version__ as _version +from classification_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config + +logger = logging.getLogger(__name__) + + +# float type for np.nan +def get_first_cabin(row: Any) -> Union[str, float]: + try: + return row.split()[0] + except AttributeError: + return np.nan + + +def get_title(passenger: str) -> str: + """Extracts the title (Mr, Ms, etc) from the name variable.""" + line = passenger + if re.search("Mrs", line): + return "Mrs" + elif re.search("Mr", line): + return "Mr" + elif re.search("Miss", line): + return "Miss" + elif re.search("Master", line): + return "Master" + else: + return "Other" + + +def pre_pipeline_preparation(*, dataframe: pd.DataFrame) -> pd.DataFrame: + # replace question marks with NaN values + data = dataframe.replace("?", np.nan) + + # retain only the first cabin if more than + # 1 are available per passenger + data["cabin"] = data["cabin"].apply(get_first_cabin) + + data["title"] = data["name"].apply(get_title) + + # cast numerical variables as floats + data["fare"] = data["fare"].astype("float") + data["age"] = data["age"].astype("float") + + # drop unnecessary variables + data.drop(labels=config.model_config.unused_fields, axis=1, inplace=True) + + return data + + +def _load_raw_dataset(*, file_name: str) -> pd.DataFrame: + dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) + return dataframe + + +def load_dataset(*, file_name: str) -> pd.DataFrame: + dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) + transformed = 
pre_pipeline_preparation(dataframe=dataframe) + + return transformed + + +def save_pipeline(*, pipeline_to_persist: Pipeline) -> None: + """Persist the pipeline. + Saves the versioned model, and overwrites any previous + saved models. This ensures that when the package is + published, there is only one trained model that can be + called, and we know exactly how it was built. + """ + + # Prepare versioned save file name + save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" + save_path = TRAINED_MODEL_DIR / save_file_name + + remove_old_pipelines(files_to_keep=[save_file_name]) + joblib.dump(pipeline_to_persist, save_path) + + +def load_pipeline(*, file_name: str) -> Pipeline: + """Load a persisted pipeline.""" + + file_path = TRAINED_MODEL_DIR / file_name + return joblib.load(filename=file_path) + + +def remove_old_pipelines(*, files_to_keep: List[str]) -> None: + """ + Remove old model pipelines. + This is to ensure there is a simple one-to-one + mapping between the package version and the model + version to be imported and used by other applications. 
+ """ + do_not_delete = files_to_keep + ["__init__.py"] + for model_file in TRAINED_MODEL_DIR.iterdir(): + if model_file.name not in do_not_delete: + model_file.unlink() diff --git a/assignment-section-05/classification_model/processing/features.py b/assignment-section-05/classification_model/processing/features.py new file mode 100644 index 000000000..fb7c629c8 --- /dev/null +++ b/assignment-section-05/classification_model/processing/features.py @@ -0,0 +1,26 @@ +from sklearn.base import BaseEstimator, TransformerMixin + + +class ExtractLetterTransformer(BaseEstimator, TransformerMixin): + # Extract first letter of variable + + def __init__(self, variables): + + if not isinstance(variables, list): + raise ValueError("variables should be a list") + + self.variables = variables + + def fit(self, X, y=None): + # we need this step to fit the sklearn pipeline + return self + + def transform(self, X): + + # so that we do not over-write the original dataframe + X = X.copy() + + for feature in self.variables: + X[feature] = X[feature].str[0] + + return X diff --git a/assignment-section-05/classification_model/processing/validation.py b/assignment-section-05/classification_model/processing/validation.py new file mode 100644 index 000000000..7ac1870b0 --- /dev/null +++ b/assignment-section-05/classification_model/processing/validation.py @@ -0,0 +1,46 @@ +from typing import List, Optional, Tuple, Union + +import numpy as np +import pandas as pd +from pydantic import BaseModel, ValidationError + +from classification_model.config.core import config +from classification_model.processing.data_manager import pre_pipeline_preparation + + +def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]: + """Check model inputs for unprocessable values.""" + + pre_processed = pre_pipeline_preparation(dataframe=input_data) + validated_data = pre_processed[config.model_config.features].copy() + errors = None + + try: + # replace numpy nans so that pydantic can 
validate + MultipleTitanicDataInputs( + inputs=validated_data.replace({np.nan: None}).to_dict(orient="records") + ) + except ValidationError as error: + errors = error.json() + + return validated_data, errors + + +class TitanicDataInputSchema(BaseModel): + pclass: Optional[int] + name: Optional[str] + sex: Optional[str] + age: Optional[int] + sibsp: Optional[int] + parch: Optional[int] + ticket: Optional[int] + fare: Optional[float] + cabin: Optional[str] + embarked: Optional[str] + boat: Optional[Union[str, int]] + body: Optional[int] + # TODO: rename home.dest, can get away with it now as it is not used + + +class MultipleTitanicDataInputs(BaseModel): + inputs: List[TitanicDataInputSchema] diff --git a/assignment-section-05/classification_model/train_pipeline.py b/assignment-section-05/classification_model/train_pipeline.py new file mode 100644 index 000000000..5c83a97f3 --- /dev/null +++ b/assignment-section-05/classification_model/train_pipeline.py @@ -0,0 +1,37 @@ +from sklearn.model_selection import train_test_split + +from classification_model.config.core import config +from classification_model.pipeline import titanic_pipe +from classification_model.processing.data_manager import load_dataset, save_pipeline + + +def run_training() -> None: + """ + Train the model. 
+ + Training data can be found here: + https://www.openml.org/data/get_csv/16826755/phpMYEkMl + """ + + # read training data + data = load_dataset(file_name=config.app_config.raw_data_file) + + # divide train and test + X_train, X_test, y_train, y_test = train_test_split( + data[config.model_config.features], # predictors + data[config.model_config.target], + test_size=config.model_config.test_size, + # we are setting the random seed here + # for reproducibility + random_state=config.model_config.random_state, + ) + + # fit model + titanic_pipe.fit(X_train, y_train) + + # persist trained model + save_pipeline(pipeline_to_persist=titanic_pipe) + + +if __name__ == "__main__": + run_training() diff --git a/assignment-section-05/classification_model/trained_models/__init__.py b/assignment-section-05/classification_model/trained_models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/assignment-section-05/mypy.ini b/assignment-section-05/mypy.ini new file mode 100644 index 000000000..d6984fd7a --- /dev/null +++ b/assignment-section-05/mypy.ini @@ -0,0 +1,14 @@ +[mypy] +warn_unreachable = False +warn_unused_ignores = True +follow_imports = skip +show_error_context = True +warn_incomplete_stub = True +ignore_missing_imports = True +check_untyped_defs = True +cache_dir = /dev/null +# Allow defining functions without any types. 
+disallow_untyped_defs = False +warn_redundant_casts = True +warn_unused_configs = True +strict_optional = True \ No newline at end of file diff --git a/assignment-section-05/pyproject.toml b/assignment-section-05/pyproject.toml new file mode 100644 index 000000000..945100376 --- /dev/null +++ b/assignment-section-05/pyproject.toml @@ -0,0 +1,49 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +minversion = "2.0" +addopts = "-rfEX -p pytester --strict-markers" +python_files = ["test_*.py", "*_test.py"] +python_classes = ["Test", "Acceptance"] +python_functions = ["test"] +# NOTE: "doc" is not included here, but gets tested explicitly via "doctesting". +testpaths = ["tests"] +xfail_strict = true +filterwarnings = [ + "error", + "default:Using or importing the ABCs:DeprecationWarning:unittest2.*", + # produced by older pyparsing<=2.2.0. + "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*", + "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*", + # distutils is deprecated in 3.10, scheduled for removal in 3.12 + "ignore:The distutils package is deprecated:DeprecationWarning", + # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)." + "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))", + # produced by pytest-xdist + "ignore:.*type argument to addoption.*:DeprecationWarning", + # produced on execnet (pytest-xdist) + "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning", + # pytest's own futurewarnings + "ignore::pytest.PytestExperimentalApiWarning", + # Do not cause SyntaxError for invalid escape sequences in py37. + # Those are caught/handled by pyupgrade, and not easy to filter with the + # module being the filename (with .py removed). 
+ "default:invalid escape sequence:DeprecationWarning", + # ignore use of unregistered marks, because we use many to test the implementation + "ignore::_pytest.warning_types.PytestUnknownMarkWarning", +] + +[tool.black] +target-version = ['py36'] + +[tool.isort] +profile = "black" +line_length = 100 +lines_between_sections = 1 +known_first_party = "sentry" +skip = "migrations" diff --git a/assignment-section-05/requirements/requirements.txt b/assignment-section-05/requirements/requirements.txt new file mode 100644 index 000000000..f24a342da --- /dev/null +++ b/assignment-section-05/requirements/requirements.txt @@ -0,0 +1,11 @@ +# We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) +# to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small +# updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. +numpy>=1.20.0,<1.21.0 +pandas>=1.3.5,<1.4.0 +pydantic>=1.8.1,<1.9.0 +scikit-learn>=0.24.2,<0.25.0 +strictyaml>=1.3.2,<1.4.0 +ruamel.yaml==0.16.12 +feature-engine>=1.0.2,<1.3.0 +joblib>=1.0.1,<1.1.0 \ No newline at end of file diff --git a/assignment-section-05/requirements/test_requirements.txt b/assignment-section-05/requirements/test_requirements.txt new file mode 100644 index 000000000..538bc0f28 --- /dev/null +++ b/assignment-section-05/requirements/test_requirements.txt @@ -0,0 +1,10 @@ +-r requirements.txt + +# testing requirements +pytest>=6.2.3,<6.3.0 + +# repo maintenance tooling +black==20.8b1 +flake8>=3.9.0,<3.10.0 +mypy==0.812 +isort==5.8.0 diff --git a/assignment-section-05/setup.py b/assignment-section-05/setup.py new file mode 100644 index 000000000..68a0c2796 --- /dev/null +++ b/assignment-section-05/setup.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from pathlib import Path + +from setuptools import find_packages, setup + +# Package 
meta-data. +NAME = 'tid-titanic-classification-model' +DESCRIPTION = "Example Titanic dataset classification model package from Train In Data." +URL = "https://github.com/trainindata/deploying-machine-learning-models" +EMAIL = "christopher.samiullah@protonmail.com" +AUTHOR = "ChristopherGS" +REQUIRES_PYTHON = ">=3.7.0" + + +# The rest you shouldn't have to touch too much :) +# ------------------------------------------------ +# Except, perhaps the License and Trove Classifiers! +# Trove Classifiers: https://pypi.org/classifiers/ +# If you do change the License, remember to change the +# Trove Classifier for that! +long_description = DESCRIPTION + +# Load the package's VERSION file as a dictionary. +about = {} +ROOT_DIR = Path(__file__).resolve().parent +REQUIREMENTS_DIR = ROOT_DIR / 'requirements' +PACKAGE_DIR = ROOT_DIR / 'classification_model' +with open(PACKAGE_DIR / "VERSION") as f: + _version = f.read().strip() + about["__version__"] = _version + + +# What packages are required for this module to be executed? 
+def list_reqs(fname="requirements.txt"): + with open(REQUIREMENTS_DIR / fname) as fd: + return fd.read().splitlines() + +# Where the magic happens: +setup( + name=NAME, + version=about["__version__"], + description=DESCRIPTION, + long_description=long_description, + long_description_content_type="text/markdown", + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(exclude=("tests",)), + package_data={"classification_model": ["VERSION"]}, + install_requires=list_reqs(), + extras_require={}, + include_package_data=True, + license="BSD-3", + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + ], +) \ No newline at end of file diff --git a/assignment-section-05/tests/__init__.py b/assignment-section-05/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/assignment-section-05/tests/conftest.py b/assignment-section-05/tests/conftest.py new file mode 100644 index 000000000..8e3fd46ad --- /dev/null +++ b/assignment-section-05/tests/conftest.py @@ -0,0 +1,26 @@ +import logging + +import pytest +from sklearn.model_selection import train_test_split + +from classification_model.config.core import config +from classification_model.processing.data_manager import _load_raw_dataset + +logger = logging.getLogger(__name__) + + +@pytest.fixture +def sample_input_data(): + data = _load_raw_dataset(file_name=config.app_config.raw_data_file) + + # divide train and test + X_train, X_test, y_train, y_test = train_test_split( + data, # predictors + 
data[config.model_config.target], + test_size=config.model_config.test_size, + # we are setting the random seed here + # for reproducibility + random_state=config.model_config.random_state, + ) + + return X_test diff --git a/assignment-section-05/tests/test_features.py b/assignment-section-05/tests/test_features.py new file mode 100644 index 000000000..c3f88101b --- /dev/null +++ b/assignment-section-05/tests/test_features.py @@ -0,0 +1,16 @@ +from classification_model.config.core import config +from classification_model.processing.features import ExtractLetterTransformer + + +def test_temporal_variable_transformer(sample_input_data): + # Given + transformer = ExtractLetterTransformer( + variables=config.model_config.cabin_vars, # cabin + ) + assert sample_input_data["cabin"].iat[6] == "E12" + + # When + subject = transformer.fit_transform(sample_input_data) + + # Then + assert subject["cabin"].iat[6] == "E" diff --git a/assignment-section-05/tests/test_prediction.py b/assignment-section-05/tests/test_prediction.py new file mode 100644 index 000000000..76965698a --- /dev/null +++ b/assignment-section-05/tests/test_prediction.py @@ -0,0 +1,27 @@ +""" +Note: These tests will fail if you have not first trained the model. 
+""" + +import numpy as np +from sklearn.metrics import accuracy_score + +from classification_model.predict import make_prediction + + +def test_make_prediction(sample_input_data): + # Given + expected_no_predictions = 131 + + # When + result = make_prediction(input_data=sample_input_data) + + # Then + predictions = result.get("predictions") + assert isinstance(predictions, np.ndarray) + assert isinstance(predictions[0], np.int64) + assert result.get("errors") is None + assert len(predictions) == expected_no_predictions + _predictions = list(predictions) + y_true = sample_input_data["survived"] + accuracy = accuracy_score(_predictions, y_true) + assert accuracy > 0.7 diff --git a/assignment-section-05/tox.ini b/assignment-section-05/tox.ini new file mode 100644 index 000000000..a51831c34 --- /dev/null +++ b/assignment-section-05/tox.ini @@ -0,0 +1,76 @@ +# Tox is a generic virtualenv management and test command line tool. Its goal is to +# standardize testing in Python. We will be using it extensively in this course. + +# Using Tox we can (on multiple operating systems): +# + Eliminate PYTHONPATH challenges when running scripts/tests +# + Eliminate virtualenv setup confusion +# + Streamline steps such as model training, model publishing + + +[tox] +envlist = test_package, typechecks, stylechecks, lint +skipsdist = True + +[testenv] +install_command = pip install {opts} {packages} + +[testenv:test_package] +deps = + -rrequirements/test_requirements.txt + +setenv = + PYTHONPATH=. 
+ PYTHONHASHSEED=0 + +commands= + python classification_model/train_pipeline.py + pytest \ + -s \ + -vv \ + {posargs:tests/} + +[testenv:train] +envdir = {toxworkdir}/test_package +deps = + {[testenv:test_package]deps} + +setenv = + {[testenv:test_package]setenv} + +commands= + python classification_model/train_pipeline.py + + +[testenv:typechecks] +envdir = {toxworkdir}/test_package + +deps = + {[testenv:test_package]deps} + +commands = {posargs:mypy classification_model} + + +[testenv:stylechecks] +envdir = {toxworkdir}/test_package + +deps = + {[testenv:test_package]deps} + +commands = {posargs:flake8 classification_model tests} + + +[testenv:lint] +envdir = {toxworkdir}/test_package + +deps = + {[testenv:test_package]deps} + +commands = + isort classification_model tests + black classification_model tests + mypy classification_model + flake8 classification_model + +[flake8] +exclude = .git,env +max-line-length = 90 \ No newline at end of file