
Commit: Added example

leschultz committed Jun 4, 2024
1 parent 6035f3a commit 08efa7f
Showing 11 changed files with 174 additions and 93 deletions.
81 changes: 81 additions & 0 deletions examples/cross_validation/perovskites/fit.py
@@ -0,0 +1,81 @@
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from multilearn import models, utils
from torch import optim, nn


def main():

    save_dir = 'outputs'
    lr = 1e-4
    batch_size = 32
    n_epochs = 200
    train_size = 0.8  # Training fraction
    val_size = 1.0-train_size  # Validation fraction
    print_n = n_epochs//10

    # Combine data to load
    tasks = ['asr', 'opband', 'stability']
    locations = [f'../../sample_data/{i}.csv' for i in tasks]

    # Load data in dictionary (make sure to keep order for loading items)
    data = utils.load(
        locations,
        names=tasks,  # User defined name
        targets=['y']*len(tasks),  # Target names
    )

    # Scalers and loss corresponding to loaded Xs and ys
    for key, value in data.items():
        value['scaler'] = StandardScaler()
        value['loss'] = nn.L1Loss()

    # A single model that combines nodes during training
    model = models.MultiNet(
        tasks=tasks,
        input_arch={100: 1},
        mid_arch={100: 1, 50: 1},
        out_arch={50: 1, 10: 1}
    )

    # The optimizer for the NN model
    optimizer = optim.Adam

    # Do CV to assess
    utils.cv(
        data,
        model,
        optimizer,
        RepeatedKFold(n_repeats=1),
        train_size=train_size,
        val_size=val_size,
        save_dir=save_dir,
        lr=lr,
        batch_size=batch_size,
        n_epochs=n_epochs,
        print_n=print_n,
    )

    # Fit one model to all data and save it
    model = utils.full_fit(
        data,
        model,
        optimizer,
        train_size=train_size,
        val_size=val_size,
        save_dir=save_dir,
        lr=lr,
        batch_size=batch_size,
        n_epochs=n_epochs,
        print_n=print_n,
    )

    name = tasks[1]
    X_inference = data[name]['X']  # Data with dropped columns

    print(f'Model used for predicting {name}')
    print(model.predict(X_inference, name))


if __name__ == '__main__':
    main()
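For anyone reproducing this example: utils.load (added in src/multilearn/utils.py below) reads each CSV with pandas and splits off the named target column. A minimal sketch of a compatible input file follows; the feature names are hypothetical stand-ins for the real sample data:

import pandas as pd

# Hypothetical stand-in for ../../sample_data/asr.csv: any numeric
# feature columns plus a 'y' target column satisfy utils.load above.
toy = pd.DataFrame({
    'feature_1': [0.1, 0.2, 0.3],
    'feature_2': [1.0, 2.0, 3.0],
    'y': [0.5, 1.5, 2.5],
})
toy.to_csv('asr.csv', index=False)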
3 changes: 3 additions & 0 deletions examples/cross_validation/perovskites/run.sh
@@ -0,0 +1,3 @@
export PYTHONPATH=$(pwd)/../../../src/:$PYTHONPATH
rm -rf outputs
torchrun fit.py
9 changes: 9 additions & 0 deletions examples/cross_validation/perovskites/submit.sh
@@ -0,0 +1,9 @@
#PBS -S /bin/bash
#PBS -q bardeen
#PBS -l select=1:ncpus=16:mpiprocs=16
#PBS -l walltime=720:00:00
#PBS -N job

cd $PBS_O_WORKDIR

bash run.sh
23 changes: 10 additions & 13 deletions examples/cross_validation/synthetic/fit.py
@@ -1,6 +1,6 @@
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from multilearn import datasets, models, utils
from multilearn import models, utils
from torch import optim, nn

import pandas as pd
@@ -13,8 +13,7 @@ def main():
    save_dir = 'outputs'
    lr = 1e-4
    batch_size = 32
    n_epochs = 100
    patience = 10  # Learning loop patience
    n_epochs = 1500
    train_size = 0.8  # Training fraction
    val_size = 1.0-train_size  # Validation fraction
    print_n = n_epochs//10
@@ -53,12 +52,12 @@ def main():

    # Load data in dictionary (make sure to keep order for loading items)
    tasks = ['name1', 'name2']
    data = datasets.load(
        locations,
        names=tasks,  # User defined name
        targets=['target_1', 'target_2'],  # Target names
        drops=[None, ['5', '6']],  # Columns to drop
    )
    data = utils.load(
        locations,
        names=tasks,  # User defined name
        targets=['target_1', 'target_2'],  # Target names
        drops=[None, ['5', '6']],  # Columns to drop
    )

    # Clean generated csv file
    [os.remove(i) for i in locations]
@@ -72,8 +71,8 @@ def main():
    model = models.MultiNet(
        tasks=tasks,
        input_arch={100: 1},
        mid_arch={100: 1, 50: 1},
        out_arch={50: 1, 10: 1}
        mid_arch={50: 1},
        out_arch={25: 1}
    )

    # The optimizer for the NN model
@@ -91,7 +90,6 @@ def main():
        lr=lr,
        batch_size=batch_size,
        n_epochs=n_epochs,
        patience=patience,
        print_n=print_n,
    )

@@ -106,7 +104,6 @@ def main():
        lr=lr,
        batch_size=batch_size,
        n_epochs=n_epochs,
        patience=patience,
        print_n=print_n,
    )
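A note on the shrunken architecture above (mid_arch={50: 1}, out_arch={25: 1}): how MultiNet expands these {width: count} dictionaries is not shown in this diff, so the builder below is only an assumed reading — count repeated linear layers of the given width — not the library's confirmed internals:

from torch import nn


def layers_from_arch(arch, in_features):
    # Assumed interpretation of an arch dict such as mid_arch={50: 1}:
    # `count` linear layers of `width` units each, with ReLU activations.
    mods = []
    for width, count in arch.items():
        for _ in range(count):
            mods.append(nn.Linear(in_features, width))
            mods.append(nn.ReLU())
            in_features = width
    return nn.Sequential(*mods), in_features


net, out_dim = layers_from_arch({50: 1}, in_features=10)
print(net)  # Linear(10 -> 50) followed by ReLU, under the stated assumption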

1 change: 1 addition & 0 deletions examples/cross_validation/synthetic/run.sh
@@ -1,2 +1,3 @@
export PYTHONPATH=$(pwd)/../../../src/:$PYTHONPATH
rm -rf outputs
torchrun fit.py
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

# Package information
name = 'multilearn'
version = '0.0.7' # Need to increment every time to push to PyPI
version = '0.0.8' # Need to increment every time to push to PyPI
description = 'Multi-task learning with Pytorch.'
url = 'https://github.com/leschultz/multilearn'
author = 'Lane E. Schultz'
48 changes: 0 additions & 48 deletions src/multilearn/datasets.py

This file was deleted.

34 changes: 11 additions & 23 deletions src/multilearn/models.py
@@ -26,11 +26,15 @@ def __init__(self, model, device=None):

    def fit(self, data, optimizer, **kwargs):

        print_n = kwargs['print_n']
        n_epochs = kwargs['n_epochs']
        batch_size = kwargs['batch_size']
        lr = kwargs['lr']
        patience = kwargs['patience']

        if 'print_n' in kwargs.keys():
            print_n = kwargs['print_n']
            print_n_cond = True
        else:
            print_n_cond = False

        data = copy.deepcopy(data)  # Avoids editing original data
        optimizer = optimizer(self.model.parameters(), lr=lr)
@@ -115,28 +119,12 @@ def fit(self, data, optimizer, **kwargs):
d = (epoch, loss, indx, split)
df_loss.append(d)

all_loss += loss

else:
all_loss += loss

# Early stopping
if all_loss < best_loss:
best_model = copy.deepcopy(self.model)
best_loss = all_loss
no_improv = 0

else:
no_improv += 1

if no_improv >= patience:
break

if epoch % print_n == 0:
print(f'Epoch {epoch}/{n_epochs}: {split} loss {loss:.2f}')
# Loss from validation set if defined
all_loss += loss

if patience is not None:
self.model = best_model
if print_n_cond:
if epoch % print_n == 0:
print(f'Epoch {epoch}/{n_epochs}: {split} loss {loss:.2f}')

# Loss curve
columns = ['epoch', 'loss', 'data', 'split']
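The net effect of the two hunks above: print_n becomes optional and the patience-based early stopping is removed from fit, which is why the examples drop the patience keyword. For reference, a self-contained sketch of the pattern being removed, with a toy loss sequence standing in for per-epoch validation losses:

# Sketch of patience-based early stopping of the kind removed here;
# a real loop would checkpoint the model (e.g. copy.deepcopy) instead
# of this stand-in state.
losses = [1.0, 0.8, 0.7, 0.75, 0.9, 0.95]  # toy validation losses
patience = 2

best_loss = float('inf')
best_state = None
no_improv = 0

for epoch, val_loss in enumerate(losses):
    if val_loss < best_loss:
        best_state = {'epoch': epoch}  # stand-in for saved weights
        best_loss = val_loss
        no_improv = 0
    else:
        no_improv += 1

    if no_improv >= patience:
        break  # stop once no improvement for `patience` epochs

print(best_loss, best_state)  # 0.7 {'epoch': 2}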
2 changes: 1 addition & 1 deletion src/multilearn/plots.py
@@ -8,7 +8,7 @@
import os

# Font styles
font = {'font.size': 16, 'lines.markersize': 10}
font = {'font.size': 16}
matplotlib.rcParams.update(font)


50 changes: 50 additions & 0 deletions src/multilearn/utils.py
@@ -11,6 +11,56 @@
import os


def load(locations, names=None, targets=None, drops=None):
    '''
    Load data from CSV files.

    Args:
        locations (list): The locations of data to load.
        names (list): The names for each dataset.
        targets (list): The name of the target variable for each of names.
        drops (list): A list of lists of columns to drop.

    Returns:
        dict: A dictionary with features, targets, and names.
    '''

    namescond = names is None
    dropscond = drops is not None

    data = {}
    for count in range(len(locations)):

        if namescond:
            name = count
        else:
            name = names[count]

        if dropscond:
            drop = drops[count]
        else:
            drop = None

        target = targets[count]

        df = locations[count]
        df = pd.read_csv(df)

        if drop is None:
            X = df.drop(target, axis=1).values
        else:
            X = df.drop([target]+drop, axis=1).values

        y = df[target].values

        data[name] = {}
        data[name]['X'] = X
        data[name]['y'] = y

    return data


def find(where, match):
    paths = list(map(str, Path(where).rglob(match)))
    return paths
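A quick usage sketch for the new loader; the file and column names below are illustrative, not from the repository:

import pandas as pd

from multilearn import utils

# Hypothetical two-feature dataset with target column 'y'.
pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'y': [5, 6]}).to_csv(
    'toy.csv', index=False
)

data = utils.load(['toy.csv'], names=['toy'], targets=['y'], drops=[['b']])
print(data['toy']['X'])  # feature matrix with 'b' dropped: just column 'a'
print(data['toy']['y'])  # target values from column 'y'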
14 changes: 7 additions & 7 deletions tests/test_fit.py
@@ -1,6 +1,6 @@
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from multilearn import datasets, models, utils
from multilearn import models, utils
from torch import optim, nn

import pandas as pd
@@ -58,12 +58,12 @@ def test_ml(self):

        # Load data in dictionary (make sure to keep order for loading items)
        tasks = ['name1', 'name2']
        data = datasets.load(
            locations,
            names=tasks,  # User defined name
            targets=['target_1', 'target_2'],  # Target names
            drops=[None, ['5', '6']],  # Columns to drop
        )
        data = utils.load(
            locations,
            names=tasks,  # User defined name
            targets=['target_1', 'target_2'],  # Target names
            drops=[None, ['5', '6']],  # Columns to drop
        )

        # Clean generated csv file
        [os.remove(i) for i in locations]
