Commit

Merge pull request #31 from sdevenes/issue_28/improve_flexibility
Issue 28/improve flexibility
sdevenes authored Oct 6, 2020
2 parents 803e57c + e538aab commit f40a923
Showing 7 changed files with 1,495 additions and 182 deletions.
41 changes: 7 additions & 34 deletions scripts/algorithm.py
@@ -6,21 +6,6 @@
 logger = logging.getLogger()
 
 
-def make_labels(X):
-    """Generate label array from the given data
-    Args:
-        X (list): A list of 1D array (with a dtype of float64) showing the input
-            training samples, where each item of the list correspond to one class.
-    Returns:
-        numpy.ndarray: A 1D array (with a dtype of int) containing the
-            label for each sample
-    Raises:
-        None
-    """
-    return np.hstack([k*np.ones(len(X[k]), dtype=int) for k in range(len(X))])
-
-
 class Model:
     def __init__(self, nb_tree_per_forest=50, max_depth=10):
         """Create a new ML model (Random forest classifier from scikitlearn)
@@ -38,44 +23,32 @@ def __init__(self, nb_tree_per_forest=50, max_depth=10):
             random_state=0)
 
 
-    def train(self, X):
+    def train(self, X, y):
         """Train the model using the given data
         Args:
-            X (list): A list of 1D array (with a dtype of float64) showing the input training samples,
-                where each item of the list correspond to one class.
+            X (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
+            y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
         Returns:
             None
         Raises:
             None
         """
-        # Get features
-        X_features = np.vstack([k for k in X])
-
-        # Get labels
-        y = make_labels(X)
-
         # Train the model
-        self.model.fit(X_features, y)
+        self.model.fit(X, y)
 
 
     def predict(self, X):
         """Make a prediction on the data using the trained model
         Args:
-            X (list): A list of 1D array (with a dtype of float64) showing the input training samples,
-                where each item of the list correspond to one class.
+            X (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
         Returns:
             numpy.ndarray: A 1D array (with a dtype of int) containing the predicted
                 label for each sample
         Raises:
             None
         """
-        # Get features
-        X_features = np.vstack([k for k in X])
-
         # Predict using the trained model
-        prediction = self.model.predict(X_features)
+        prediction = self.model.predict(X)
 
-        return prediction
+        return prediction
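
For context, the refactored Model API is now driven with a plain feature matrix and label vector instead of a per-class list of arrays. A minimal usage sketch (array shapes and values below are illustrative, not part of the commit):

    import numpy as np
    from algorithm import Model

    # Illustrative data: 100 samples, 4 features, labels in {0, 1, 2}
    X = np.random.rand(100, 4)
    y = np.random.randint(0, 3, size=100)

    model = Model(nb_tree_per_forest=50, max_depth=10)
    model.train(X, y)
    predictions = model.predict(X)  # one predicted label per row of X
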
3 changes: 3 additions & 0 deletions scripts/config.py
@@ -0,0 +1,3 @@
+data_path = '../data/csh101/csh101.ann.features.csv'
+nb_trees_experiment = {"nb_trees": (1, 2), "tree_depth": 10}
+tree_depth_experiment = {"nb_trees": 10, "tree_depth": (1, 2)}
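
The new config module centralizes the dataset path and the parameter grids for the two experiments, so changing a sweep only means editing this file rather than main.py. A hypothetical wider sweep would look like this (values are illustrative, not from the commit):

    data_path = '../data/csh101/csh101.ann.features.csv'
    nb_trees_experiment = {"nb_trees": (1, 5, 20, 50), "tree_depth": 10}
    tree_depth_experiment = {"nb_trees": 10, "tree_depth": (1, 5, 20)}
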
74 changes: 43 additions & 31 deletions scripts/database.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
 import numpy as np
 import csv
-import os
+from sklearn.model_selection import train_test_split
 
 PROTOCOLS = {
-    'proto1': {'train': (0, 0.6), 'validation': (0.6, 0.8), 'test': (0.8, 1)},
-    'proto2': {'train': (0.4, 1), 'validation': (0, 0.2), 'test': (0.2, 0.4)},
+    'proto1': {'train': 0.8, 'test': 0.2, 'random': 1},
+    'proto2': {'train': 0.8, 'test': 0.2, 'random': 2},
 }
 
 SUBSETS = [
@@ -92,59 +92,71 @@
 ]
 
 
-def load(setname='csh101'):
+def load(filepath='./data/csh101/csh101.ann.features.csv'):
     """Loads the dataset
     Args:
-        setname (str): name of the dataset to load
+        filepath (str): path to the file containing the dataset to load
     Returns:
-        dict of str : 2d-array: a dictionary mapping the classes names to their corresponding samples (1 row = 1 sample)
+        x (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
+        y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
     Raises:
         None
     """
-    data = dict([(k, []) for k in CLASSES])
-    with open(os.path.join('../data', setname, '{}.ann.features.csv'.format(setname)), 'rt') as f:
+    x = []
+    y = []
+    with open(filepath, 'rt') as f:
         reader = csv.reader(f, delimiter=',')
         for k, row in enumerate(reader):
-            if not k: continue
-            data[row[-1]].append(np.array([z for z in row[:-1]]))
-    for k in CLASSES:
-        data[k] = np.vstack(data[k])
-    return data
+            if not k:
+                continue
+            x.append(row[:-1])
+            y.append(row[-1])
+    return np.array(x), np.array(y)
 
 
-def split_data(data, subset, splits):
+def split_data(x, y, subset, splits):
     """Splits the data set
     Args:
-        data (dict of str : 2d-array): dataset to split
-        subset (str): subset to extract (train, validation or test)
-        splits (dict of str : tuple): a dictionary mapping the subsets to their range (from 0.0 to 1.0)
+        x (numpy.ndarray): A NxM 2D-array where each row corresponds to a sample and each column to a feature
+        y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
+        subset (str): subset to extract (train or test)
+        splits (dict): a dictionary mapping the subsets to their dataset proportion and the random state to use for splitting
     Returns:
-        dict of str : 2d-array: a dictionary mapping the classes names to their corresponding samples (1 row = 1 sample)
+        x_split (numpy.ndarray): A PxM 2D-array containing only a subset of samples
+        y_split (numpy.ndarray): A 1D-array of length P containing only the labels corresponding to the subset x_split
     Raises:
         None
     """
-    return dict([(k, data[k][range(int(splits[subset][0] * data[k].shape[0]),
-                                   int(splits[subset][1] * data[k].shape[0]))]) for k in data])
+    x_train, x_test, y_train, y_test = train_test_split(x, y,
+                                                        test_size=splits['test'],
+                                                        train_size=splits['train'],
+                                                        random_state=splits['random'],
+                                                        stratify=y)
+    (x_split, y_split) = (x_train, y_train) if subset == 'train' else (x_test, y_test)
+    return x_split, y_split
 
 
-def get(protocol, subset, classes=CLASSES, variables=VARIABLES, setname='csh101'):
+def get(protocol, subset, classes=CLASSES, variables=VARIABLES, filepath='./data/csh101/csh101.ann.features.csv'):
     """Get the desired subset
     Args:
         protocol (str): protocol to use
-        subset (str): subset to extract (train, validation or test)
-        classes (1d-array): list of desired classes
-        variables (1d-array): list of desired variables (features)
-        setname (str): name of the dataset to load
+        subset (str): subset to extract (train or test)
+        classes (list): list of desired classes
+        variables (list): list of desired variables (features)
+        filepath (str): path to the file containing the dataset to load
     Returns:
-        numpy.ndarray: array of ordered arrays (of size n_sample x n_features) containing the samples corresponding to
-            1 class
+        ret_x (numpy.ndarray): A PxQ 2D-array containing only the desired subset of samples with the Q desired features
+        ret_y (numpy.ndarray): A 1D-array of length P containing only the labels corresponding to the subset ret_x
     Raises:
         None
     """
-    retval = split_data(load(setname), subset, PROTOCOLS[protocol])
-    varindex = [VARIABLES.index(k) for k in variables]
-    retval = dict([(k, retval[k][:, varindex]) for k in classes])
-    return np.array([retval[k] for k in classes], dtype=object)
+    x, y = load(filepath)
+    x_split, y_split = split_data(x, y, subset, PROTOCOLS[protocol])
+    var_index = [VARIABLES.index(k) for k in variables]
+    classes_condition = np.isin(y_split, classes)
+    ret_x = x_split[classes_condition][:, var_index]
+    ret_y = y_split[classes_condition]
+    return ret_x, ret_y
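
Taken together, database.py now replaces the fixed index-range protocols with seeded, stratified train/test splits, and load/split_data/get pass plain (x, y) arrays through the pipeline. A minimal sketch of the new call chain (assuming the dataset CSV exists at the module's default filepath; printed shapes are illustrative):

    import database

    # 80/20 stratified split with random_state=1, per PROTOCOLS['proto1']
    x_train, y_train = database.get('proto1', 'train')
    x_test, y_test = database.get('proto1', 'test')
    print(x_train.shape, y_train.shape)  # e.g. (N_train, n_features), (N_train,)
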
116 changes: 116 additions & 0 deletions scripts/main.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+from tabulate import tabulate
+import algorithm
+import database
+import analysis
+import numpy as np
+import config
+
+
+def base_experiment(protocol, variables, filepath, nb_tree_per_forest=50, max_depth=10):
+    """Basic test for the random forest classifier
+    Args:
+        protocol (str): protocol to use
+        variables (1d-array): list of desired variables (features)
+        filepath (str): path to the file containing the dataset to load
+        nb_tree_per_forest: number of decision trees in the forest
+        max_depth: max depth of the trees
+    Returns:
+        numpy.ndarray: A 2D array (with a dtype of int) containing the confusion matrix.
+    Raises:
+        None
+    """
+    x_train, y_train = database.get(protocol, 'train', database.CLASSES, variables, filepath)
+    model = algorithm.Model(nb_tree_per_forest, max_depth)
+    model.train(x_train, y_train)
+    x_test, y_test = database.get(protocol, 'test', database.CLASSES, variables, filepath)
+    test_predictions = model.predict(x_test)
+    cm = analysis.get_confusion_matrix(test_predictions, y_test)
+    return cm
+
+
+def pretty_confusion_matrix(cm):
+    """Adds labels to confusion matrix
+    Args:
+        cm (numpy.ndarray): A 2D array (with a dtype of int) containing the confusion matrix.
+    Returns:
+        str: nicely formatted confusion matrix for printing
+    Raises:
+        None
+    """
+    classes = np.array([database.CLASSES])
+    table = tabulate(np.vstack((np.hstack(([[""]], classes)),
+                                np.hstack((classes.T, cm)))))
+    return table
+
+
+def experiment_impact_nb_trees(tabnum, filepath, nb_trees, max_depth):
+    """Evaluates and prints the impact of the number of trees per forest on the classifier's performance
+    Args:
+        tabnum (int): first confusion matrix numbering
+        filepath (str): path to the file containing the dataset to load
+        nb_trees (list): list of number of trees to evaluate
+    Returns:
+        None
+    Raises:
+        None
+    """
+    print("\nImpact of number of trees per forest")
+    for n, p in enumerate(database.PROTOCOLS):
+        for m, nb_tree_per_forest in enumerate(nb_trees):
+            print("\nTable {table_number}: Confusion matrix with {nb_trees} tree(s) for Protocol `{protocol}`".format(
+                table_number=(n * len(nb_trees)) + m + tabnum,
+                protocol=p,
+                nb_trees=nb_tree_per_forest)
+            )
+            cm = base_experiment(p,
+                                 database.VARIABLES,
+                                 nb_tree_per_forest=nb_tree_per_forest,
+                                 max_depth=max_depth,
+                                 filepath=filepath)
+            print(pretty_confusion_matrix(cm))
+
+
+def experiment_impact_tree_depth(tabnum, filepath, nb_trees, max_depths):
+    """Evaluates and prints the impact of the trees depth on the classifier's performance
+    Args:
+        tabnum (int): first confusion matrix numbering
+        filepath (str): path to the file containing the dataset to load
+    Returns:
+        None
+    Raises:
+        None
+    """
+    print("\nImpact of trees maximum depth")
+    for n, p in enumerate(database.PROTOCOLS):
+        for m, max_depth in enumerate(max_depths):
+            print(
+                "\nTable {table_number}: Confusion matrix with trees maximum depth of {max_depth} for Protocol `{protocol}`".format(
+                    table_number=(n * len(max_depths)) + m + tabnum,
+                    protocol=p,
+                    max_depth=max_depth)
+            )
+            cm = base_experiment(p,
+                                 database.VARIABLES,
+                                 nb_tree_per_forest=nb_trees,
+                                 max_depth=max_depth,
+                                 filepath=filepath)
+            print(pretty_confusion_matrix(cm))
+
+
+if __name__ == '__main__':
+    print("Main script for Human Activity Recognition with Random Forest classifier")
+    tabnum = 1
+    experiment_impact_nb_trees(tabnum,
+                               filepath=config.data_path,
+                               nb_trees=config.nb_trees_experiment['nb_trees'],
+                               max_depth=config.nb_trees_experiment['tree_depth'])
+    tabnum += len(config.nb_trees_experiment['nb_trees'])*len(database.PROTOCOLS)
+    experiment_impact_tree_depth(tabnum,
+                                 filepath=config.data_path,
+                                 nb_trees=config.tree_depth_experiment['nb_trees'],
+                                 max_depths=config.tree_depth_experiment['tree_depth'])
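
For reference, a single experiment can also be run directly, without the __main__ sweeps. A hedged sketch reusing the functions above (parameter values are illustrative; paths assume the working directory holds the data as in config.data_path):

    import config
    import database
    from main import base_experiment, pretty_confusion_matrix

    cm = base_experiment('proto1', database.VARIABLES, config.data_path,
                         nb_tree_per_forest=10, max_depth=5)
    print(pretty_confusion_matrix(cm))
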