Commit

Merge pull request #31 from sdevenes/issue_28/improve_flexibility
Issue 28/improve flexibility
sdevenes authored Oct 6, 2020
2 parents 803e57c + e538aab commit f40a923
Showing 7 changed files with 1,495 additions and 182 deletions.
41 changes: 7 additions & 34 deletions scripts/algorithm.py
@@ -6,21 +6,6 @@
 logger = logging.getLogger()
 
 
-def make_labels(X):
-    """Generate label array from the given data
-    Args:
-        X (list): A list of 1D array (with a dtype of float64) showing the input
-            training samples, where each item of the list correspond to one class.
-    Returns:
-        numpy.ndarray: A 1D array (with a dtype of int) containing the
-            label for each sample
-    Raises:
-        None
-    """
-    return np.hstack([k*np.ones(len(X[k]), dtype=int) for k in range(len(X))])
-
-
 class Model:
     def __init__(self, nb_tree_per_forest=50, max_depth=10):
         """Create a new ML model (Random forest classifier from scikitlearn)
@@ -38,44 +23,32 @@ def __init__(self, nb_tree_per_forest=50, max_depth=10):
             random_state=0)
 
 
-    def train(self, X):
+    def train(self, X, y):
         """Train the model using the given data
         Args:
-            X (list): A list of 1D array (with a dtype of float64) showing the input training samples,
-                where each item of the list correspond to one class.
+            X (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
+            y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
         Returns:
             None
         Raises:
             None
         """
-        # Get features
-        X_features = np.vstack([k for k in X])
-
-        # Get labels
-        y = make_labels(X)
-
         # Train the model
-        self.model.fit(X_features, y)
+        self.model.fit(X, y)
 
 
     def predict(self, X):
         """Make a prediction on the data using the trained model
         Args:
-            X (list): A list of 1D array (with a dtype of float64) showing the input training samples,
-                where each item of the list correspond to one class.
+            X (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
         Returns:
             numpy.ndarray: A 1D array (with a dtype of int) containing the predicted
                 label for each sample
         Raises:
             None
         """
-        # Get features
-        X_features = np.vstack([k for k in X])
-
         # Predict using the trained model
-        prediction = self.model.predict(X_features)
+        prediction = self.model.predict(X)
 
-        return prediction
+        return prediction
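
For context, the refactored Model API is now driven with a plain feature matrix and label vector instead of a per-class list of arrays. A minimal usage sketch (array shapes and values below are illustrative, not part of the commit):

    import numpy as np
    from algorithm import Model

    # Illustrative data: 100 samples, 4 features, labels in {0, 1, 2}
    X = np.random.rand(100, 4)
    y = np.random.randint(0, 3, size=100)

    model = Model(nb_tree_per_forest=50, max_depth=10)
    model.train(X, y)
    predictions = model.predict(X)  # one predicted label per row of X
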
3 changes: 3 additions & 0 deletions scripts/config.py
@@ -0,0 +1,3 @@
+data_path = '../data/csh101/csh101.ann.features.csv'
+nb_trees_experiment = {"nb_trees": (1, 2), "tree_depth": 10}
+tree_depth_experiment = {"nb_trees": 10, "tree_depth": (1, 2)}
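
The new config module centralizes the dataset path and the parameter grids for the two experiments, so changing a sweep only means editing this file rather than main.py. A hypothetical wider sweep would look like this (values are illustrative, not from the commit):

    data_path = '../data/csh101/csh101.ann.features.csv'
    nb_trees_experiment = {"nb_trees": (1, 5, 20, 50), "tree_depth": 10}
    tree_depth_experiment = {"nb_trees": 10, "tree_depth": (1, 5, 20)}
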
74 changes: 43 additions & 31 deletions scripts/database.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
 import numpy as np
 import csv
-import os
+from sklearn.model_selection import train_test_split
 
 PROTOCOLS = {
-    'proto1': {'train': (0, 0.6), 'validation': (0.6, 0.8), 'test': (0.8, 1)},
-    'proto2': {'train': (0.4, 1), 'validation': (0, 0.2), 'test': (0.2, 0.4)},
+    'proto1': {'train': 0.8, 'test': 0.2, 'random': 1},
+    'proto2': {'train': 0.8, 'test': 0.2, 'random': 2},
 }
 
 SUBSETS = [
@@ -92,59 +92,71 @@
 ]
 
 
-def load(setname='csh101'):
+def load(filepath='./data/csh101/csh101.ann.features.csv'):
     """Loads the dataset
     Args:
-        setname (str): name of the dataset to load
+        filepath (str): path to the file containing the dataset to load
     Returns:
-        dict of str : 2d-array: a dictionary mapping the classes names to their corresponding samples (1 row = 1 sample)
+        x (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
+        y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
     Raises:
         None
     """
-    data = dict([(k, []) for k in CLASSES])
-    with open(os.path.join('../data', setname, '{}.ann.features.csv'.format(setname)), 'rt') as f:
+    x = []
+    y = []
+    with open(filepath, 'rt') as f:
         reader = csv.reader(f, delimiter=',')
         for k, row in enumerate(reader):
-            if not k: continue
-            data[row[-1]].append(np.array([z for z in row[:-1]]))
-    for k in CLASSES:
-        data[k] = np.vstack(data[k])
-    return data
+            if not k:
+                continue
+            x.append(row[:-1])
+            y.append(row[-1])
+    return np.array(x), np.array(y)
 
 
-def split_data(data, subset, splits):
+def split_data(x, y, subset, splits):
     """Splits the data set
     Args:
-        data (dict of str : 2d-array): dataset to split
-        subset (str): subset to extract (train, validation or test)
-        splits (dict of str : tuple): a dictionary mapping the subsets to their range (from 0.0 to 1.0)
+        x (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
+        y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
+        subset (str): subset to extract (train or test)
+        splits (dict): a dictionary mapping the subsets to their dataset proportion and the random state to use for splitting
     Returns:
-        dict of str : 2d-array: a dictionary mapping the classes names to their corresponding samples (1 row = 1 sample)
+        x_split (numpy.ndarray): A PxM 2D-array containing only a subset of samples
+        y_split (numpy.ndarray): A 1D-array of length P containing only the labels corresponding to the subset x_split
     Raises:
         None
     """
-    return dict([(k, data[k][range(int(splits[subset][0] * data[k].shape[0]),
-                                   int(splits[subset][1] * data[k].shape[0]))]) for k in data])
+    x_train, x_test, y_train, y_test = train_test_split(x, y,
+                                                        test_size=splits['test'],
+                                                        train_size=splits['train'],
+                                                        random_state=splits['random'],
+                                                        stratify=y)
+    (x_split, y_split) = (x_train, y_train) if subset == 'train' else (x_test, y_test)
+    return x_split, y_split
 
 
-def get(protocol, subset, classes=CLASSES, variables=VARIABLES, setname='csh101'):
+def get(protocol, subset, classes=CLASSES, variables=VARIABLES, filepath='./data/csh101/csh101.ann.features.csv'):
     """Get the desired subset
     Args:
         protocol (str): protocol to use
-        subset (str): subset to extract (train, validation or test)
-        classes (1d-array): list of desired classes
-        variables (1d-array): list of desired variables (features)
-        setname (str): name of the dataset to load
+        subset (str): subset to extract (train or test)
+        classes (list): list of desired classes
+        variables (list): list of desired variables (features)
+        filepath (str): path to the file containing the dataset to load
     Returns:
-        numpy.ndarray: array of ordered arrays (of size n_sample x n_features) containing the samples corresponding to
-            1 class
+        ret_x (numpy.ndarray): A PxQ 2D-array containing only the desired subset of samples with the Q desired features
+        ret_y (numpy.ndarray): A 1D-array of length P containing only the labels corresponding to the subset ret_x
     Raises:
         None
     """
-    retval = split_data(load(setname), subset, PROTOCOLS[protocol])
-    varindex = [VARIABLES.index(k) for k in variables]
-    retval = dict([(k, retval[k][:, varindex]) for k in classes])
-    return np.array([retval[k] for k in classes], dtype=object)
+    x, y = load(filepath)
+    x_split, y_split = split_data(x, y, subset, PROTOCOLS[protocol])
+    var_index = [VARIABLES.index(k) for k in variables]
+    classes_condition = np.isin(y_split, classes)
+    ret_x = x_split[classes_condition][:, var_index]
+    ret_y = y_split[classes_condition]
+    return ret_x, ret_y
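
Taken together, database.py now replaces the fixed index-range protocols with seeded, stratified train/test splits, and load/split_data/get pass plain (x, y) arrays through the pipeline. A minimal sketch of the new call chain (assuming the dataset CSV exists at the module's default filepath; printed shapes are illustrative):

    import database

    # 80/20 stratified split with random_state=1, per PROTOCOLS['proto1']
    x_train, y_train = database.get('proto1', 'train')
    x_test, y_test = database.get('proto1', 'test')
    print(x_train.shape, y_train.shape)  # e.g. (N_train, n_features), (N_train,)
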
116 changes: 116 additions & 0 deletions scripts/main.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+from tabulate import tabulate
+import algorithm
+import database
+import analysis
+import numpy as np
+import config
+
+
+def base_experiment(protocol, variables, filepath, nb_tree_per_forest=50, max_depth=10):
+    """Basic test for the random forest classifier
+    Args:
+        protocol (str): protocol to use
+        variables (1d-array): list of desired variables (features)
+        filepath (str): path to the file containing the dataset to load
+        nb_tree_per_forest: number of decision trees in the forest
+        max_depth: max depth of the trees
+    Returns:
+        numpy.ndarray: A 2D array (with a dtype of int) containing the confusion matrix.
+    Raises:
+        None
+    """
+    x_train, y_train = database.get(protocol, 'train', database.CLASSES, variables, filepath)
+    model = algorithm.Model(nb_tree_per_forest, max_depth)
+    model.train(x_train, y_train)
+    x_test, y_test = database.get(protocol, 'test', database.CLASSES, variables, filepath)
+    test_predictions = model.predict(x_test)
+    cm = analysis.get_confusion_matrix(test_predictions, y_test)
+    return cm
+
+
+def pretty_confusion_matrix(cm):
+    """Adds labels to confusion matrix
+    Args:
+        cm (numpy.ndarray): A 2D array (with a dtype of int) containing the confusion matrix.
+    Returns:
+        str: nicely formatted confusion matrix for printing
+    Raises:
+        None
+    """
+    classes = np.array([database.CLASSES])
+    table = tabulate(np.vstack((np.hstack(([[""]], classes)),
+                                np.hstack((classes.T, cm)))))
+    return table
+
+
+def experiment_impact_nb_trees(tabnum, filepath, nb_trees, max_depth):
+    """Evaluates and prints the impact of the number of trees per forest on the classifier's performance
+    Args:
+        tabnum (int): first confusion matrix numbering
+        filepath (str): path to the file containing the dataset to load
+        nb_trees (list): list of number of trees to evaluate
+    Returns:
+        None
+    Raises:
+        None
+    """
+    print("\nImpact of number of trees per forest")
+    for n, p in enumerate(database.PROTOCOLS):
+        for m, nb_tree_per_forest in enumerate(nb_trees):
+            print("\nTable {table_number}: Confusion matrix with {nb_trees} tree(s) for Protocol `{protocol}`".format(
+                table_number=(n * len(nb_trees)) + m + tabnum,
+                protocol=p,
+                nb_trees=nb_tree_per_forest)
+            )
+            cm = base_experiment(p,
+                                 database.VARIABLES,
+                                 nb_tree_per_forest=nb_tree_per_forest,
+                                 max_depth=max_depth,
+                                 filepath=filepath)
+            print(pretty_confusion_matrix(cm))
+
+
+def experiment_impact_tree_depth(tabnum, filepath, nb_trees, max_depths):
+    """Evaluates and prints the impact of the trees depth on the classifier's performance
+    Args:
+        tabnum (int): first confusion matrix numbering
+        filepath (str): path to the file containing the dataset to load
+    Returns:
+        None
+    Raises:
+        None
+    """
+    print("\nImpact of trees maximum depth")
+    for n, p in enumerate(database.PROTOCOLS):
+        for m, max_depth in enumerate(max_depths):
+            print(
+                "\nTable {table_number}: Confusion matrix with trees maximum depth of {max_depth} for Protocol `{protocol}`".format(
+                    table_number=(n * len(max_depths)) + m + tabnum,
+                    protocol=p,
+                    max_depth=max_depth)
+            )
+            cm = base_experiment(p,
+                                 database.VARIABLES,
+                                 nb_tree_per_forest=nb_trees,
+                                 max_depth=max_depth,
+                                 filepath=filepath)
+            print(pretty_confusion_matrix(cm))
+
+
+if __name__ == '__main__':
+    print("Main script for Human Activity Recognition with Random Forest classifier")
+    tabnum = 1
+    experiment_impact_nb_trees(tabnum,
+                               filepath=config.data_path,
+                               nb_trees=config.nb_trees_experiment['nb_trees'],
+                               max_depth=config.nb_trees_experiment['tree_depth'])
+    tabnum += len(config.nb_trees_experiment['nb_trees'])*len(database.PROTOCOLS)
+    experiment_impact_tree_depth(tabnum,
+                                 filepath=config.data_path,
+                                 nb_trees=config.tree_depth_experiment['nb_trees'],
+                                 max_depths=config.tree_depth_experiment['tree_depth'])
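
For reference, a single experiment can also be run directly, without the __main__ sweeps. A hedged sketch reusing the functions above (parameter values are illustrative; paths assume the working directory holds the data as in config.data_path):

    import config
    import database
    from main import base_experiment, pretty_confusion_matrix

    cm = base_experiment('proto1', database.VARIABLES, config.data_path,
                         nb_tree_per_forest=10, max_depth=5)
    print(pretty_confusion_matrix(cm))
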