diff --git a/examples/cross_validation/perovskites/fit.py b/examples/cross_validation/perovskites/fit.py new file mode 100644 index 0000000..9f55e32 --- /dev/null +++ b/examples/cross_validation/perovskites/fit.py @@ -0,0 +1,81 @@ +from sklearn.model_selection import RepeatedKFold +from sklearn.preprocessing import StandardScaler +from multilearn import models, utils +from torch import optim, nn + + +def main(): + + save_dir = 'outputs' + lr = 1e-4 + batch_size = 32 + n_epochs = 200 + train_size = 0.8 # Traning fraction + val_size = 1.0-train_size # Validation fraction + print_n = n_epochs//10 + + # Combine data to load + tasks = ['asr', 'opband', 'stability'] + locations = [f'../../sample_data/{i}.csv' for i in tasks] + + # Load data in dictionary (make sure to keep order for loading items) + data = utils.load( + locations, + names=tasks, # User defined name + targets=['y']*len(tasks), # Target names + ) + + # Scalers and loss corresponding to loaded Xs and ys + for key, value in data.items(): + value['scaler'] = StandardScaler() + value['loss'] = nn.L1Loss() + + # A single model that combines nodes during training + model = models.MultiNet( + tasks=tasks, + input_arch={100: 1}, + mid_arch={100: 1, 50: 1}, + out_arch={50: 1, 10: 1} + ) + + # The optimizer for the NN model + optimizer = optim.Adam + + # Do CV to assess + utils.cv( + data, + model, + optimizer, + RepeatedKFold(n_repeats=1), + train_size=train_size, + val_size=val_size, + save_dir=save_dir, + lr=lr, + batch_size=batch_size, + n_epochs=n_epochs, + print_n=print_n, + ) + + # Save one model to all data + model = utils.full_fit( + data, + model, + optimizer, + train_size=train_size, + val_size=val_size, + save_dir=save_dir, + lr=lr, + batch_size=batch_size, + n_epochs=n_epochs, + print_n=print_n, + ) + + name = tasks[1] + X_inference = data[name]['X'] # Data with dropped columns + + print(f'Model used for predicting {name}') + print(model.predict(X_inference, name)) + + +if __name__ == '__main__': + main() diff --git a/examples/cross_validation/perovskites/run.sh b/examples/cross_validation/perovskites/run.sh new file mode 100755 index 0000000..ab07353 --- /dev/null +++ b/examples/cross_validation/perovskites/run.sh @@ -0,0 +1,3 @@ +export PYTHONPATH=$(pwd)/../../../src/:$PYTHONPATH +rm -rf outputs +torchrun fit.py diff --git a/examples/cross_validation/perovskites/submit.sh b/examples/cross_validation/perovskites/submit.sh new file mode 100644 index 0000000..cb736e0 --- /dev/null +++ b/examples/cross_validation/perovskites/submit.sh @@ -0,0 +1,9 @@ +#PBS -S /bin/bash +#PBS -q bardeen +#PBS -l select=1:ncpus=16:mpiprocs=16 +#PBS -l walltime=720:00:00 +#PBS -N job + +cd $PBS_O_WORKDIR + +bash run.sh diff --git a/examples/cross_validation/synthetic/fit.py b/examples/cross_validation/synthetic/fit.py index 217d05e..b895778 100644 --- a/examples/cross_validation/synthetic/fit.py +++ b/examples/cross_validation/synthetic/fit.py @@ -1,6 +1,6 @@ from sklearn.model_selection import RepeatedKFold from sklearn.preprocessing import StandardScaler -from multilearn import datasets, models, utils +from multilearn import models, utils from torch import optim, nn import pandas as pd @@ -13,8 +13,7 @@ def main(): save_dir = 'outputs' lr = 1e-4 batch_size = 32 - n_epochs = 100 - patience = 10 # Learning loop patience + n_epochs = 1500 train_size = 0.8 # Traning fraction val_size = 1.0-train_size # Validation fraction print_n = n_epochs//10 @@ -53,12 +52,12 @@ def main(): # Load data in dictionary (make sure to keep order for loading items) tasks = ['name1', 'name2'] - data = datasets.load( - locations, - names=tasks, # User defined name - targets=['target_1', 'target_2'], # Target names - drops=[None, ['5', '6']], # Columns to drop - ) + data = utils.load( + locations, + names=tasks, # User defined name + targets=['target_1', 'target_2'], # Target names + drops=[None, ['5', '6']], # Columns to drop + ) # Clean generated csv file [os.remove(i) for i in locations] @@ -72,8 +71,8 @@ def main(): model = models.MultiNet( tasks=tasks, input_arch={100: 1}, - mid_arch={100: 1, 50: 1}, - out_arch={50: 1, 10: 1} + mid_arch={50: 1}, + out_arch={25: 1} ) # The optimizer for the NN model @@ -91,7 +90,6 @@ def main(): lr=lr, batch_size=batch_size, n_epochs=n_epochs, - patience=patience, print_n=print_n, ) @@ -106,7 +104,6 @@ def main(): lr=lr, batch_size=batch_size, n_epochs=n_epochs, - patience=patience, print_n=print_n, ) diff --git a/examples/cross_validation/synthetic/run.sh b/examples/cross_validation/synthetic/run.sh index e0f3271..ab07353 100755 --- a/examples/cross_validation/synthetic/run.sh +++ b/examples/cross_validation/synthetic/run.sh @@ -1,2 +1,3 @@ export PYTHONPATH=$(pwd)/../../../src/:$PYTHONPATH +rm -rf outputs torchrun fit.py diff --git a/setup.py b/setup.py index cbe9fe8..3a6c454 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # Package information name = 'multilearn' -version = '0.0.7' # Need to increment every time to push to PyPI +version = '0.0.8' # Need to increment every time to push to PyPI description = 'Multi-task learning with Pytorch.' url = 'https://github.com/leschultz/multilearn' author = 'Lane E. Schultz' diff --git a/src/multilearn/datasets.py b/src/multilearn/datasets.py deleted file mode 100644 index 53deaf3..0000000 --- a/src/multilearn/datasets.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd - - -def load(locations, names=None, targets=None, drops=None): - ''' - Load data included with the package. - - Args: - locations (list): The loctions of data to load. - names (list): The names for each dataset. - targets (list): The name of the target variable for each of names. - drops (list): A list of list for columns to drop. - - Returns: - Dict: A dictionary with features, targets, and names. - ''' - - namescond = names is None - dropscond = drops is not None - - data = {} - for count in range(len(locations)): - - if namescond: - name = count - else: - name = names[count] - - if dropscond: - drop = drops[count] - - target = targets[count] - - df = locations[count] - df = pd.read_csv(df) - - if drop is None: - X = df.drop(target, axis=1).values - else: - X = df.drop([target]+drop, axis=1).values - - y = df[target].values - - data[name] = {} - data[name]['X'] = X - data[name]['y'] = y - - return data diff --git a/src/multilearn/models.py b/src/multilearn/models.py index c8bc447..16ab722 100644 --- a/src/multilearn/models.py +++ b/src/multilearn/models.py @@ -26,11 +26,15 @@ def __init__(self, model, device=None): def fit(self, data, optimizer, **kwargs): - print_n = kwargs['print_n'] n_epochs = kwargs['n_epochs'] batch_size = kwargs['batch_size'] lr = kwargs['lr'] - patience = kwargs['patience'] + + if 'print_n' in kwargs.keys(): + print_n = kwargs['print_n'] + print_n_cond = True + else: + print_n_cond = False data = copy.deepcopy(data) # Avoids editing original data optimizer = optimizer(self.model.parameters(), lr=lr) @@ -115,28 +119,12 @@ def fit(self, data, optimizer, **kwargs): d = (epoch, loss, indx, split) df_loss.append(d) - all_loss += loss - - else: - all_loss += loss - - # Early stopping - if all_loss < best_loss: - best_model = copy.deepcopy(self.model) - best_loss = all_loss - no_improv = 0 - - else: - no_improv += 1 - - if no_improv >= patience: - break - - if epoch % print_n == 0: - print(f'Epoch {epoch}/{n_epochs}: {split} loss {loss:.2f}') + # Loss from validation set if defined + all_loss += loss - if patience is not None: - self.model = best_model + if print_n_cond: + if epoch % print_n == 0: + print(f'Epoch {epoch}/{n_epochs}: {split} loss {loss:.2f}') # Loss curve columns = ['epoch', 'loss', 'data', 'split'] diff --git a/src/multilearn/plots.py b/src/multilearn/plots.py index a3352f9..6c90ccf 100644 --- a/src/multilearn/plots.py +++ b/src/multilearn/plots.py @@ -8,7 +8,7 @@ import os # Font styles -font = {'font.size': 16, 'lines.markersize': 10} +font = {'font.size': 16} matplotlib.rcParams.update(font) diff --git a/src/multilearn/utils.py b/src/multilearn/utils.py index 65a3833..01cc7f6 100644 --- a/src/multilearn/utils.py +++ b/src/multilearn/utils.py @@ -11,6 +11,56 @@ import os +def load(locations, names=None, targets=None, drops=None): + ''' + Load data included with the package. + + Args: + locations (list): The loctions of data to load. + names (list): The names for each dataset. + targets (list): The name of the target variable for each of names. + drops (list): A list of list for columns to drop. + + Returns: + Dict: A dictionary with features, targets, and names. + ''' + + namescond = names is None + dropscond = drops is not None + + data = {} + for count in range(len(locations)): + + if namescond: + name = count + else: + name = names[count] + + if dropscond: + drop = drops[count] + + else: + drop = None + + target = targets[count] + + df = locations[count] + df = pd.read_csv(df) + + if drop is None: + X = df.drop(target, axis=1).values + else: + X = df.drop([target]+drop, axis=1).values + + y = df[target].values + + data[name] = {} + data[name]['X'] = X + data[name]['y'] = y + + return data + + def find(where, match): paths = list(map(str, Path(where).rglob(match))) return paths diff --git a/tests/test_fit.py b/tests/test_fit.py index aa22c29..5df716b 100644 --- a/tests/test_fit.py +++ b/tests/test_fit.py @@ -1,6 +1,6 @@ from sklearn.model_selection import RepeatedKFold from sklearn.preprocessing import StandardScaler -from multilearn import datasets, models, utils +from multilearn import models, utils from torch import optim, nn import pandas as pd @@ -58,12 +58,12 @@ def test_ml(self): # Load data in dictionary (make sure to keep order for loading items) tasks = ['name1', 'name2'] - data = datasets.load( - locations, - names=tasks, # User defined name - targets=['target_1', 'target_2'], # Target names - drops=[None, ['5', '6']], # Columns to drop - ) + data = utils.load( + locations, + names=tasks, # User defined name + targets=['target_1', 'target_2'], # Target names + drops=[None, ['5', '6']], # Columns to drop + ) # Clean generated csv file [os.remove(i) for i in locations]