diff --git a/.gitignore b/.gitignore index 96737d1..a7d3817 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,36 @@ data/ docs/build/* docs/source/api/ -### Python execution ### -scripts/__pycache__/ -tests/output/ +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# test results +rr/tests/output/ diff --git a/.travis.yml b/.travis.yml index 0674d0c..f20551b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,27 +6,19 @@ python: - "3.8" install: # Install phase of our CI pipeline - - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - - bash miniconda.sh -b -p $HOME/miniconda - - source "$HOME/miniconda/etc/profile.d/conda.sh" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - - conda info -a - - conda create -q -n project python=$TRAVIS_PYTHON_VERSION -c plotly --file requirements.txt --file build-requirements.txt - - conda activate project + - pip install . 
+ - pip install -r build-requirements.txt script: - # Run phase of our CI pipeline # Run unitTests - - nosetests --nocapture -v tests/test.py --with-coverage --cover-package=scripts + - nosetests --nocapture -v rr --with-coverage --cover-package=rr # Generate rst files from docstring - - sphinx-apidoc -fMeT -o docs/source/api scripts + - sphinx-apidoc -fMeT -o docs/source/api rr rr/tests # Build documentation - sphinx-build docs/source docs/build - touch docs/build/.nojekyll # Black lint checking - - black --check scripts/ tests/ + - black --check rr/ after_success: # update coveralls report - coveralls @@ -39,4 +31,12 @@ deploy: local_dir: ./docs/build on: branch: master - condition: $TRAVIS_PYTHON_VERSION = 3.8 + condition: "$TRAVIS_PYTHON_VERSION = 3.8" +# - provider: pypi +# server: https://test.pypi.org/legacy/ # Remove to deployment on pypi.org +# username: "__token__" +# password: +# secure: AgENdGVzdC5weXBpLm9yZwIkNDkzMjE2ZDMtYjlhYy00NTViLTg1MzYtZGU4N2YyNjA0NDY1AAIleyJwZXJtaXNzaW9ucyI6ICJ1c2VyIiwgInZlcnNpb24iOiAxfQAABiCE1UhRrnGVriZfsUQEp0YRaWTuAOLsDggCBBsOB-PbMQ +# on: +# tags: true +# condition: "$TRAVIS_PYTHON_VERSION = 3.8" diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c189475 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include LICENSE README.rst buildout.cfg requirements.txt +recursive-include doc conf.py *.rst *.ico *.png +graft tests/inputs diff --git a/build-requirements.txt b/build-requirements.txt index c362dd0..65015d0 100644 --- a/build-requirements.txt +++ b/build-requirements.txt @@ -4,3 +4,4 @@ nose coverage coveralls black +twine diff --git a/docs/source/conf.py b/docs/source/conf.py index 8375dfc..2297d5f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,7 @@ import os import sys import subprocess -sys.path.insert(0, os.path.abspath('../../scripts')) +sys.path.insert(0, os.path.abspath('../../experiment')) # -- Project information ----------------------------------------------------- 
diff --git a/experiment.ini b/experiment.ini new file mode 100644 index 0000000..604e7dd --- /dev/null +++ b/experiment.ini @@ -0,0 +1,7 @@ +[nb_trees_experiment] +nb_trees = 1, 2 +tree_depth = 10 + +[tree_depth_experiment] +nb_trees = 10 +tree_depth = 1, 2 diff --git a/scripts/__ini__.py b/rr/__init__.py similarity index 100% rename from scripts/__ini__.py rename to rr/__init__.py diff --git a/tests/inputs/not_a_zip.txt b/rr/download_data/__init__.py similarity index 100% rename from tests/inputs/not_a_zip.txt rename to rr/download_data/__init__.py diff --git a/rr/download_data/__main__.py b/rr/download_data/__main__.py new file mode 100644 index 0000000..6dfd740 --- /dev/null +++ b/rr/download_data/__main__.py @@ -0,0 +1,22 @@ +import sys +import argparse +import os.path +from rr.download_data import download_data + + +def main(): + + parser = argparse.ArgumentParser( + description="M05 mini-project: Download dataset.zip online" + ) + parser.add_argument("source", type=str, help="Data zip url") + parser.add_argument("destination", type=str, help="Destination folder") + args = parser.parse_args() + + download_destination = os.path.join(args.destination + "/dataset.zip") + download_data.download_url(args.source, download_destination) + download_data.unzip_file(download_destination, args.destination) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/download_data.py b/rr/download_data/download_data.py similarity index 70% rename from scripts/download_data.py rename to rr/download_data/download_data.py index 7a5e977..88c175b 100644 --- a/scripts/download_data.py +++ b/rr/download_data/download_data.py @@ -3,7 +3,7 @@ import zipfile -# Function to download a file through http.get using requests +# Function to download_data a file through http.get using requests def download_url(url, save_path): """Download a file from the given url using http @@ -16,7 +16,7 @@ def download_url(url, save_path): None """ with open(save_path, "wb") as f: - 
print("Downloading {} from {}".format(save_path, url)) + print("Downloading {} from {}...".format(save_path, url)) response = requests.get(url, stream=True) total_length = response.headers.get("content-length") @@ -31,7 +31,7 @@ def download_url(url, save_path): done = int(50 * dl / total_length) sys.stdout.write("\r[%s%s]" % ("=" * done, " " * (50 - done))) sys.stdout.flush() - print() + print("Download succes.\n") # Function to unzip files @@ -46,17 +46,7 @@ def unzip_file(path_to_zip_file, directory_to_extract_to): Raises: None """ - print("Unzip files..") + print("Unzip {} to {}...".format(path_to_zip_file, directory_to_extract_to)) with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: zip_ref.extractall(directory_to_extract_to) - - -if __name__ == "__main__": - url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00506/casas-dataset.zip" - url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/00405/Postures.zip" # Smaller zip to test - save_path = "../data/casas-dataset.zip" - # Download zip file - download_url(url, save_path) - # Unzip it - unzip_file(save_path, "../data_test/") - print("Done") + print("Unzip succes.\n") diff --git a/rr/experiment/__init__.py b/rr/experiment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rr/experiment/__main__.py b/rr/experiment/__main__.py new file mode 100644 index 0000000..eb0c752 --- /dev/null +++ b/rr/experiment/__main__.py @@ -0,0 +1,51 @@ +import sys +import argparse +import configparser +from rr.experiment import experiments, database +import os.path + + +def main(): + parser = argparse.ArgumentParser(description="M05 mini-project: experiments") + parser.add_argument("datapath", type=str, help="Dataset file in .csv") + parser.add_argument("output", type=str, help="Destination folder for the results") + parser.add_argument( + "config", type=str, help="Filepath for experiments configuration file in .ini" + ) + args = parser.parse_args() + + config = 
configparser.ConfigParser() + config.read(args.config) + + print( + "M05 mini-project on Human Activity Recognition with Random Forest classifier" + ) + tabnum = 1 + experiment_results = experiments.experiment_impact_nb_trees( + tabnum, + filepath=args.datapath, + nb_trees=[int(n) for n in config["nb_trees_experiment"]["nb_trees"].split(",")], + max_depth=int(config["nb_trees_experiment"]["tree_depth"]), + plot_path=args.output, + ) + + tabnum += len(config["nb_trees_experiment"]["nb_trees"].split(",")) * len( + database.PROTOCOLS + ) + experiment_results += experiments.experiment_impact_tree_depth( + tabnum, + filepath=args.datapath, + nb_trees=int(config["tree_depth_experiment"]["nb_trees"]), + max_depths=[ + int(d) for d in config["tree_depth_experiment"]["tree_depth"].split(",") + ], + plot_path=args.output, + ) + + with open(os.path.join(args.output, "experiment_results.txt"), "w+") as fout: + fout.write(experiment_results) + print("Experiments done\n") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/rr/experiment/algorithm.py b/rr/experiment/algorithm.py new file mode 100644 index 0000000..8173a02 --- /dev/null +++ b/rr/experiment/algorithm.py @@ -0,0 +1,51 @@ +from sklearn.ensemble import RandomForestClassifier + +import logging + +logger = logging.getLogger() + + +class Model: + def __init__(self, nb_tree_per_forest=50, max_depth=10): + """Create a new ML model (Random forest classifier from scikitlearn) + + Args: + nb_tree_per_forest (int): number of decision trees in the forest + max_depth (int): max depth of the trees + Returns: + None + Raises: + None + """ + self.model = RandomForestClassifier( + n_estimators=nb_tree_per_forest, max_depth=max_depth, random_state=0 + ) + + def train(self, X, y): + """Train the model using the given data + + Args: + X (numpy.ndarray):A NxM 2D-array where each row corresponds to a sample and each column to a feature + y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample 
label + Returns: + None + Raises: + None + """ + self.model.fit(X, y) + + def predict(self, X): + """Make a prediction on the data using the trained model + + Args: + X (numpy.ndarray):A NxM 2D-array where each row corresponds to a sample and each column to a feature + Returns: + numpy.ndarray: A 1D array (with a dtype of int) containing the predicted + label for each sample + + Raises: + None + """ + prediction = self.model.predict(X) + + return prediction diff --git a/scripts/analysis.py b/rr/experiment/analysis.py similarity index 91% rename from scripts/analysis.py rename to rr/experiment/analysis.py index a5509b4..d42abe7 100644 --- a/scripts/analysis.py +++ b/rr/experiment/analysis.py @@ -1,7 +1,6 @@ import plotly.express as px from sklearn.metrics import confusion_matrix import numpy as np -import itertools def get_confusion_matrix(prediction_label, true_label): @@ -9,7 +8,7 @@ def get_confusion_matrix(prediction_label, true_label): Args: prediction_label (list): Estimated targets as returned by a classifier. - true_label (list): Ground truth (correct) target values. + true_label (list): Ground truth (correct) target values. Returns: numpy.ndarray: A 2D array (with a dtype of int) containing the confusion matrix. Raises: @@ -26,7 +25,7 @@ def plot_confusion_matrix( Args: cm (numpy.ndarray): A 2D array (with a dtype of int) containing the confusion matrix. 
classes (numpy.ndarray): A 1D array (with a dtype of str) containing the lable name for each class - normalize (boolean): Flag to normalize the data + normalize (boolean): Flag to normalize the data title (str): The title of the plot file_name (str): File name to export the graph Returns: @@ -34,7 +33,6 @@ def plot_confusion_matrix( Raises: None """ - # Normalize if wanted if normalize: cm = cm / np.sum(cm) @@ -47,6 +45,4 @@ def plot_confusion_matrix( color_continuous_scale="Blues", title=title, ) - # fig.show() - # Export graph fig.write_html("{}.html".format(file_name)) diff --git a/scripts/database.py b/rr/experiment/database.py similarity index 99% rename from scripts/database.py rename to rr/experiment/database.py index 854ac71..f7f756f 100644 --- a/scripts/database.py +++ b/rr/experiment/database.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python import numpy as np import csv from sklearn.model_selection import train_test_split diff --git a/scripts/main.py b/rr/experiment/experiments.py similarity index 56% rename from scripts/main.py rename to rr/experiment/experiments.py index 1889345..c9aeb68 100644 --- a/scripts/main.py +++ b/rr/experiment/experiments.py @@ -1,7 +1,7 @@ -#!/usr/bin/env python from tabulate import tabulate -from scripts import algorithm, database, analysis, config +from rr.experiment import algorithm, database, analysis import numpy as np +import os.path def base_experiment(protocol, variables, filepath, nb_tree_per_forest=50, max_depth=10): @@ -48,27 +48,30 @@ def pretty_confusion_matrix(cm): return table -def experiment_impact_nb_trees(tabnum, filepath, nb_trees, max_depth): - """Evaluates and print the impact of the number of trees per forest on the classifiers performance +def experiment_impact_nb_trees(tabnum, filepath, nb_trees, max_depth, plot_path): + """Evaluates the impact of the number of trees per forest on the classifiers performance Args: tabnum (int): first confusion matrix numbering filepath (str): path to the file containing the 
dataset to load nb_trees (list): list of number of trees to evaluate + max_depth (int): trees maximum depth + plot_path (str): folder where to store confusion matrix plots Returns: - None + str : experiment results Raises: None """ - print("\nImpact of number of trees per forest") + print("Starting experiment on number of trees impact...") + result = "\nImpact of number of trees per forest" for n, p in enumerate(database.PROTOCOLS): + print("Processing `protocol` {}...".format(p)) for m, nb_tree_per_forest in enumerate(nb_trees): - print( - "\nTable {table_number}: Confusion matrix with {nb_trees} tree(s) for Protocol `{protocol}`".format( - table_number=(n * len(nb_trees)) + m + tabnum, - protocol=p, - nb_trees=nb_tree_per_forest, - ) + num = (n * len(nb_trees)) + m + tabnum + result += "\nTable {table_number}: Confusion matrix with {nb_trees} tree(s) for Protocol `{protocol}`".format( + table_number=num, + protocol=p, + nb_trees=nb_tree_per_forest, ) cm = base_experiment( p, @@ -77,29 +80,40 @@ def experiment_impact_nb_trees(tabnum, filepath, nb_trees, max_depth): max_depth=max_depth, filepath=filepath, ) - print(pretty_confusion_matrix(cm)) + result += pretty_confusion_matrix(cm) + analysis.plot_confusion_matrix( + cm, + database.CLASSES, + file_name=os.path.join(plot_path, "table_{}".format(num)), + ) + print("Experiment completed\n") + return result -def experiment_impact_tree_depth(tabnum, filepath, nb_trees, max_depths): - """Evaluates and print the impact of the trees depth on the classifiers performance +def experiment_impact_tree_depth(tabnum, filepath, nb_trees, max_depths, plot_path): + """Evaluates the impact of the trees depth on the classifiers performance Args: tabnum (int): first confusion matrix numbering filepath (str): path to the file containing the dataset to load + nb_trees (int): number of trees in forest + max_depths (list): list of trees maximum depths to evaluate + plot_path (str): folder where to store confusion matrix plots Returns: 
- None + str : experiment results Raises: None """ - print("\nImpact of trees maximum depth") + print("Starting experiment on tree depth impact...") + result = "\nImpact of trees maximum depth" for n, p in enumerate(database.PROTOCOLS): + print("Processing `protocol` {}".format(p)) for m, max_depth in enumerate(max_depths): - print( - "\nTable {table_number}: Confusion matrix with trees maximum depth of {max_depth} for Protocol `{protocol}`".format( - table_number=(n * len(max_depths)) + m + tabnum, - protocol=p, - max_depth=max_depth, - ) + num = (n * len(max_depths)) + m + tabnum + result += "\nTable {table_number}: Confusion matrix with trees maximum depth of {max_depth} for Protocol `{protocol}`".format( + table_number=num, + protocol=p, + max_depth=max_depth, ) cm = base_experiment( p, @@ -108,22 +122,11 @@ def experiment_impact_tree_depth(tabnum, filepath, nb_trees, max_depths): max_depth=max_depth, filepath=filepath, ) - print(pretty_confusion_matrix(cm)) - - -if __name__ == "__main__": - print("Main script for Human Activity Recognition with Random Forest classifier") - tabnum = 1 - experiment_impact_nb_trees( - tabnum, - filepath=config.data_path, - nb_trees=config.nb_trees_experiment["nb_trees"], - max_depth=config.nb_trees_experiment["tree_depth"], - ) - tabnum += len(config.nb_trees_experiment["nb_trees"]) * len(database.PROTOCOLS) - experiment_impact_tree_depth( - tabnum, - filepath=config.data_path, - nb_trees=config.tree_depth_experiment["nb_trees"], - max_depths=config.tree_depth_experiment["tree_depth"], - ) + result += pretty_confusion_matrix(cm) + analysis.plot_confusion_matrix( + cm, + database.CLASSES, + file_name=os.path.join(plot_path, "table_{}".format(num)), + ) + print("Experiment completed\n") + return result diff --git a/rr/tests/inputs/not_a_zip.txt b/rr/tests/inputs/not_a_zip.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/inputs/simpleZip.zip b/rr/tests/inputs/simpleZip.zip similarity index 100% rename from 
tests/inputs/simpleZip.zip rename to rr/tests/inputs/simpleZip.zip diff --git a/tests/inputs/test_set.csv b/rr/tests/inputs/test_set.csv similarity index 100% rename from tests/inputs/test_set.csv rename to rr/tests/inputs/test_set.csv diff --git a/rr/tests/output/confusion_matrix.html b/rr/tests/output/confusion_matrix.html new file mode 100644 index 0000000..692387c --- /dev/null +++ b/rr/tests/output/confusion_matrix.html @@ -0,0 +1,67 @@ + +
+ +