Issue 28/improve flexibility #31

Merged · 15 commits · Oct 6, 2020
41 changes: 7 additions & 34 deletions scripts/algorithm.py
@@ -6,21 +6,6 @@
logger = logging.getLogger()


def make_labels(X):
"""Generate label array from the given data
Args:
X (list): A list of 1D array (with a dtype of float64) showing the input
training samples, where each item of the list correspond to one class.
Returns:
numpy.ndarray: A 1D array (with a dtype of int) containing the
label for each sample
Raises:
None
"""
return np.hstack([k*np.ones(len(X[k]), dtype=int) for k in range(len(X))])


class Model:
def __init__(self, nb_tree_per_forest=50, max_depth=10):
"""Create a new ML model (Random forest classifier from scikitlearn)
@@ -38,44 +23,32 @@ def __init__(self, nb_tree_per_forest=50, max_depth=10):
random_state=0)


def train(self, X):
def train(self, X, y):
"""Train the model using the given data
Args:
X (list): A list of 1D array (with a dtype of float64) showing the input training samples,
where each item of the list correspond to one class.
X (numpy.ndarray): An NxM 2D-array where each row corresponds to a sample and each column to a feature
y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
Returns:
None
Raises:
None
"""
# Get features
X_features = np.vstack([k for k in X])

# Get labels
y = make_labels(X)

# Train the model
self.model.fit(X_features, y)
self.model.fit(X, y)


def predict(self, X):
"""Make a prediction on the data using the trained model
Args:
X (list): A list of 1D array (with a dtype of float64) showing the input training samples,
where each item of the list correspond to one class.
X (numpy.ndarray): An NxM 2D-array where each row corresponds to a sample and each column to a feature
Returns:
numpy.ndarray: A 1D array (with a dtype of int) containing the predicted
label for each sample
Raises:
None
"""
# Get features
X_features = np.vstack([k for k in X])

# Predict using the trained model
prediction = self.model.predict(X_features)
prediction = self.model.predict(X)

return prediction
return prediction
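
With this change, Model consumes samples and labels as two separate NumPy arrays, so the removed make_labels helper is no longer needed. A minimal usage sketch with synthetic data (shapes are illustrative; the plain import assumes the working directory is scripts/, as in main.py):

import numpy as np
import algorithm  # assumes the working directory is scripts/

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 4))       # N=100 samples, M=4 features
y_train = rng.integers(0, 3, size=100)    # one integer label per sample, now passed explicitly

model = algorithm.Model(nb_tree_per_forest=10, max_depth=5)
model.train(X_train, y_train)

X_test = rng.normal(size=(20, 4))
predictions = model.predict(X_test)       # 1D array of predicted labels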
3 changes: 3 additions & 0 deletions scripts/config.py
@@ -0,0 +1,3 @@
data_path = '../data/csh101/csh101.ann.features.csv'
nb_trees_experiment = {"nb_trees": (1, 2), "tree_depth": 10}
tree_depth_experiment = {"nb_trees": 10, "tree_depth": (1, 2)}
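
Because config.py is plain Python, the sweeps can be adjusted by editing these dictionaries. A sketch of a wider sweep (illustrative values, not the defaults added by this PR):

data_path = '../data/csh101/csh101.ann.features.csv'
nb_trees_experiment = {"nb_trees": (1, 5, 10, 50), "tree_depth": 10}
tree_depth_experiment = {"nb_trees": 10, "tree_depth": (1, 5, 10, None)}  # None lets scikit-learn grow trees to full depth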
74 changes: 43 additions & 31 deletions scripts/database.py
@@ -1,11 +1,11 @@
#!/usr/bin/env python
import numpy as np
import csv
import os
from sklearn.model_selection import train_test_split

PROTOCOLS = {
'proto1': {'train': (0, 0.6), 'validation': (0.6, 0.8), 'test': (0.8, 1)},
'proto2': {'train': (0.4, 1), 'validation': (0, 0.2), 'test': (0.2, 0.4)},
'proto1': {'train': 0.8, 'test': 0.2, 'random': 1},
'proto2': {'train': 0.8, 'test': 0.2, 'random': 2},
}

SUBSETS = [
@@ -92,59 +92,71 @@
]


def load(setname='csh101'):
def load(filepath='./data/csh101/csh101.ann.features.csv'):
"""Loads the dataset
Args:
setname (str): name of the dataset to load
filepath (str): path to the file containing the dataset to load
Returns:
dict of str : 2d-array: a dictionary mapping the classes names to their corresponding samples (1 row = 1 sample)
x (numpy.ndarray): An NxM 2D-array where each row corresponds to a sample and each column to a feature
y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
Raises:
None
"""
data = dict([(k, []) for k in CLASSES])
with open(os.path.join('../data', setname, '{}.ann.features.csv'.format(setname)), 'rt') as f:
x = []
y = []
with open(filepath, 'rt') as f:
reader = csv.reader(f, delimiter=',')
for k, row in enumerate(reader):
if not k: continue
data[row[-1]].append(np.array([z for z in row[:-1]]))
for k in CLASSES:
data[k] = np.vstack(data[k])
return data
if not k:
continue
x.append(row[:-1])
y.append(row[-1])
return np.array(x), np.array(y)


def split_data(data, subset, splits):
def split_data(x, y, subset, splits):
"""Splits the data set
Args:
data (dict of str : 2d-array): dataset to split
subset (str): subset to extract (train, validation or test)
splits (dict of str : tuple): a dictionary mapping the subsets to their range (from 0.0 to 1.0)
x (numpy.ndarray): An NxM 2D-array where each row corresponds to a sample and each column to a feature
y (numpy.ndarray): A 1D-array of length N, where each element corresponds to a sample label
subset (str): subset to extract (train or test)
splits (dict): a dictionary mapping the subsets to their dataset proportions and the random state to use for splitting
Returns:
dict of str : 2d-array: a dictionary mapping the classes names to their corresponding samples (1 row = 1 sample)
x_split (numpy.ndarray): A PxM 2D-array containing only a subset of samples
y_split (numpy.ndarray): A 1D-array of length P containing only the labels corresponding to the subset x_split
Raises:
None
"""
return dict([(k, data[k][range(int(splits[subset][0] * data[k].shape[0]),
int(splits[subset][1] * data[k].shape[0]))]) for k in data])
x_train, x_test, y_train, y_test = train_test_split(x, y,
test_size=splits['test'],
train_size=splits['train'],
random_state=splits['random'],
stratify=y)
(x_split, y_split) = (x_train, y_train) if subset == 'train' else (x_test, y_test)
return x_split, y_split


def get(protocol, subset, classes=CLASSES, variables=VARIABLES, setname='csh101'):
def get(protocol, subset, classes=CLASSES, variables=VARIABLES, filepath='./data/csh101/csh101.ann.features.csv'):
"""Get the desired subset
Args:
protocol (str): protocol to use
subset (str): subset to extract (train, validation or test)
classes (1d-array): list of desired classes
variables (1d-array): list of desired variables (features)
setname (str): name of the dataset to load
subset (str): subset to extract (train or test)
classes (list): list of desired classes
variables (list): list of desired variables (features)
filepath (str): path to the file containing the dataset to load
Returns:
numpy.ndarray: array of ordered arrays (of size n_sample x n_features) containing the samples corresponding to
1 class
ret_x (numpy.ndarray): A PxQ 2D-array containing only the desired subset of samples with the Q desired features
ret_y (numpy.ndarray): A 1D-array of length P containing only the labels corresponding to the subset ret_x
Raises:
None
"""
retval = split_data(load(setname), subset, PROTOCOLS[protocol])
varindex = [VARIABLES.index(k) for k in variables]
retval = dict([(k, retval[k][:, varindex]) for k in classes])
return np.array([retval[k] for k in classes], dtype=object)
x, y = load(filepath)
x_split, y_split = split_data(x, y, subset, PROTOCOLS[protocol])
var_index = [VARIABLES.index(k) for k in variables]
classes_condition = np.isin(y_split, classes)
ret_x = x_split[classes_condition][:, var_index]
ret_y = y_split[classes_condition]
return ret_x, ret_y
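
The reworked database module now returns (features, labels) pairs directly instead of a per-class dictionary. A short sketch of the new call pattern (run from the scripts/ directory and passing the path from config, which is an assumption about how the repo is laid out; the class/variable slices are illustrative only):

import config
import database

x_train, y_train = database.get('proto1', 'train', filepath=config.data_path)
x_test, y_test = database.get('proto1', 'test', filepath=config.data_path)
print(x_train.shape, y_train.shape)   # (N_train, n_features) and (N_train,)

# Class and feature selection still work, via np.isin filtering and column indexing
x_sub, y_sub = database.get('proto2', 'test',
                            classes=database.CLASSES[:2],
                            variables=database.VARIABLES[:5],
                            filepath=config.data_path)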
116 changes: 116 additions & 0 deletions scripts/main.py
@@ -0,0 +1,116 @@
#!/usr/bin/env python
from tabulate import tabulate
import algorithm
import database
import analysis
import numpy as np
import config


def base_experiment(protocol, variables, filepath, nb_tree_per_forest=50, max_depth=10):
"""Basic test for the random forest classifier

Args:
protocol (str): protocol to use
variables (1d-array): list of desired variables (features)
filepath (str): path to the file containing the dataset to load
nb_tree_per_forest (int): number of decision trees in the forest
max_depth (int): maximum depth of the trees
Returns:
numpy.ndarray: A 2D array (with a dtype of int) containing the confusion matrix.
Raises:
None
"""
x_train, y_train = database.get(protocol, 'train', database.CLASSES, variables, filepath)
model = algorithm.Model(nb_tree_per_forest, max_depth)
model.train(x_train, y_train)
x_test, y_test = database.get(protocol, 'test', database.CLASSES, variables, filepath)
test_predictions = model.predict(x_test)
cm = analysis.get_confusion_matrix(test_predictions, y_test)
return cm


def pretty_confusion_matrix(cm):
"""Adds labels to confusion matrix

Args:
cm (numpy.ndarray): A 2D array (with a dtype of int) containing the confusion matrix.
Returns:
str: nicely formatted confusion matrix for printing
Raises:
None
"""
classes = np.array([database.CLASSES])
table = tabulate(np.vstack((np.hstack(([[""]], classes)),
np.hstack((classes.T, cm)))))
return table


def experiment_impact_nb_trees(tabnum, filepath, nb_trees, max_depth):
"""Evaluates and print the impact of the number of trees per forest on the classifiers performance

Args:
tabnum (int): first confusion matrix numbering
filepath (str): path to the file containing the dataset to load
nb_trees (list): list of numbers of trees to evaluate
max_depth (int): maximum depth of the trees
Returns:
None
Raises:
None
"""
print("\nImpact of number of trees per forest")
for n, p in enumerate(database.PROTOCOLS):
for m, nb_tree_per_forest in enumerate(nb_trees):
print("\nTable {table_number}: Confusion matrix with {nb_trees} tree(s) for Protocol `{protocol}`".format(
table_number=(n * len(nb_trees)) + m + tabnum,
protocol=p,
nb_trees=nb_tree_per_forest)
)
cm = base_experiment(p,
database.VARIABLES,
nb_tree_per_forest=nb_tree_per_forest,
max_depth=max_depth,
filepath=filepath)
print(pretty_confusion_matrix(cm))


def experiment_impact_tree_depth(tabnum, filepath, nb_trees, max_depths):
"""Evaluates and print the impact of the trees depth on the classifiers performance

Args:
tabnum (int): first confusion matrix numbering
filepath (str): path to the file containing the dataset to load
nb_trees (int): number of decision trees per forest
max_depths (list): list of maximum tree depths to evaluate
Returns:
None
Raises:
None
"""
print("\nImpact of trees maximum depth")
for n, p in enumerate(database.PROTOCOLS):
for m, max_depth in enumerate(max_depths):
print(
"\nTable {table_number}: Confusion matrix with trees maximum depth of {max_depth} for Protocol `{protocol}`".format(
table_number=(n * len(max_depths)) + m + tabnum,
protocol=p,
max_depth=max_depth)
)
cm = base_experiment(p,
database.VARIABLES,
nb_tree_per_forest=nb_trees,
max_depth=max_depth,
filepath=filepath)
print(pretty_confusion_matrix(cm))


if __name__ == '__main__':
print("Main script for Human Activity Recognition with Random Forest classifier")
tabnum = 1
experiment_impact_nb_trees(tabnum,
filepath=config.data_path,
nb_trees=config.nb_trees_experiment['nb_trees'],
max_depth=config.nb_trees_experiment['tree_depth'])
tabnum += len(config.nb_trees_experiment['nb_trees'])*len(database.PROTOCOLS)
experiment_impact_tree_depth(tabnum,
filepath=config.data_path,
nb_trees=config.tree_depth_experiment['nb_trees'],
max_depths=config.tree_depth_experiment['tree_depth'])
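
For a one-off run outside the two sweep functions, base_experiment can also be called directly; a sketch with illustrative parameters (assumes the dataset path from config resolves from the current working directory):

import config
import database
import main

cm = main.base_experiment('proto1',
                          database.VARIABLES,
                          filepath=config.data_path,
                          nb_tree_per_forest=20,
                          max_depth=8)
print(main.pretty_confusion_matrix(cm))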