diff --git a/src/run.py b/src/run.py index 41e12ce..d331937 100644 --- a/src/run.py +++ b/src/run.py @@ -28,7 +28,7 @@ def main(config_name): slurm_args = f'--partition ampere --account -{slurm_sl.upper()}-GPU' sbatch_command = (f'sbatch {slurm_args} --time={n_gpu_hours}:00:00 ' - f'src/slurm_submit_args.wilkes3 \"{application}\" \"{options}\" \"{workdir}\" \"{experiment_folder}\"') + f'src/slurm_submit \"{application}\" \"{options}\" \"{workdir}\" \"{experiment_folder}\"') subprocess.Popen([sbatch_command], shell=True) if __name__ == '__main__': diff --git a/src/toy_example/README.md b/src/toy_example/README.md deleted file mode 100644 index d41c4a9..0000000 --- a/src/toy_example/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# `toy_example` Directory Overview - -This directory contains the toy experiment which is not covered in the paper. It will be explained and documented better in future. \ No newline at end of file diff --git a/src/toy_example/__init__.py b/src/toy_example/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/toy_example/arguments.py b/src/toy_example/arguments.py deleted file mode 100644 index 92c2acc..0000000 --- a/src/toy_example/arguments.py +++ /dev/null @@ -1,157 +0,0 @@ -from dataclasses import dataclass, field -from typing import Optional - -import yaml - -from utils.logger import setup_logger - -logger = setup_logger(__name__) - - - -@dataclass -class ToyExampleArguments: - n_seeds: Optional[str] = field( - default=None, - metadata={ - "help": ( - "number of seeds." - ) - }, - ) - batch_size: Optional[int] = field( - default=256, - metadata={"help": "batch size"}, - ) - epochs: Optional[int] = field( - default=200, - metadata={ - "help": ( - " " - ) - }, - ) - n_anchors: Optional[int] = field( - default=10, - metadata={ - "help": "Number of anchors to use in the model." - }, - ) - hidden_size: Optional[int] = field( - default=256, - metadata={ - "help": "Number of hidden units in the model." 
- }, - ) - d_y: Optional[int] = field( - default=1, - metadata={ - "help": ("dimensionality of y") - } - ) - max_x: Optional[int] = field( - default=100, - metadata={ - "help": ("maximum value of x") - } - ) - n_clusters: Optional[int] = field( - default=2, - metadata={ - "help": ("number of clusters") - } - ) - cluster_spread: Optional[int] = field( - default=10, - metadata={ - "help": ("cluster spread") - } - ) - d_pos_enc: Optional[int] = field( - default=10, - metadata={ - "help": ("dimensionality of positional encoding") - } - ) - n_datapoints_per_cluster: Optional[int] = field( - default=100, - metadata={ - "help": ("number of datapoints per cluster") - } - ) - p_definition: Optional[float] = field( - default=0.5, - metadata={ - "help": ("probability of definition") - } - ) - - - -@dataclass -class CommonExperimentArguments: - n_jobs: Optional[int] = field( - default=1, metadata={"help": "The number of jobs to run in parallel (second stage)."} - ) - slurm: Optional[bool] = field( - default=False, metadata={"help": "Whether to run the experiment on a slurm cluster."} - ) - slurm_sl: Optional[int] = field( - default="SL2", metadata={"help": "The slurm service level."} - ) - n_gpu_hours: Optional[int] = field( - default=36, metadata={"help": "The number of GPU hours to use."} - ) - name_prefix: Optional[str] = field( - default='', metadata={"help": "Prefix to add to experiment name."} - ) - do_sweeps: Optional[bool] = field( - default=False, metadata={"help": "Whether to run a sweep."} - ) - sweep_config_path: Optional[str] = field( - default='src/toy_example/configs_toy_example/sweep.yaml', metadata={"help": "Path to sweep config."} - ) - - -@dataclass -class Config: - toy_example_arguments: ToyExampleArguments - experiment_arguments: CommonExperimentArguments - # experiment arguments - sweep_arguments: dict - - @classmethod - def from_yaml(cls, file_path: str): - logger.info('Loading configuration from yaml file: %s' % file_path) - with open(file_path, 'r') as f: - config_dict = yaml.safe_load(f) - - toy_example_args = ToyExampleArguments(**config_dict['toy_example_arguments']) - experiment_args = CommonExperimentArguments(**config_dict['experiment_arguments']) - return cls(toy_example_args, - experiment_args, - sweep_arguments=config_dict.get('sweep_arguments', {})) - - -# def override_args(args, override_dict): -# """Overrides args (dataclass) with values in override_dict (dict). -# Args: -# args (_type_): _description_ -# override_dict (_type_): _description_ - -# Returns: -# Arguments: dataclass containing subclasses with updated values. -# """ -# args_copy = deepcopy(args) -# # iterate over [training_args, numeric_exp_args, ...] -# for args_set_name in vars(args_copy): -# args_set = getattr(args_copy, args_set_name) -# # do not overwrite arguments which we don't want to override. 
-# if args_set_name not in ('first_stage_arguments', 'second_stage_arguments', 'third_stage_arguments', 'sweep_arguments'): -# for key, value in override_dict.items(): -# if hasattr(args_set, key): -# setattr(args_set, key, value) - -# setattr(args_copy, args_set_name, args_set) - -# return args_copy diff --git a/src/toy_example/configs_toy_example/main.yaml b/src/toy_example/configs_toy_example/main.yaml deleted file mode 100644 index 73b4aa0..0000000 --- a/src/toy_example/configs_toy_example/main.yaml +++ /dev/null @@ -1,23 +0,0 @@ -toy_example_arguments: - n_seeds: 100 - batch_size: 256 - epochs: 20 - hidden_size: 256 - - d_y: 10 - max_x: 100000 - n_anchors: 70 - - n_clusters: 70 - cluster_spread: 100 - n_datapoints_per_cluster: 150 - p_definition: .2 - d_pos_enc: 32 - -experiment_arguments: - slurm: True - do_sweeps: True - n_jobs: 10 - n_gpu_hours: 10 - slurm_sl: "SL2" - sweep_config_path: "src/toy_example/configs_toy_example/sweep_configs/sweep.yaml" diff --git a/src/toy_example/configs_toy_example/sweep_configs/sweep.yaml b/src/toy_example/configs_toy_example/sweep_configs/sweep.yaml deleted file mode 100644 index 674a152..0000000 --- a/src/toy_example/configs_toy_example/sweep_configs/sweep.yaml +++ /dev/null @@ -1,45 +0,0 @@ - -program: "src/toy_example/train_script.py" -method: 'random' -metric: - name: 'metric' - goal: 'minimize' -parameters: - max_x: - values: [100000] - d_y: - values: [3, 5, 7, 10] - - hidden_size: - values: [512, 1024] - - batch_size: - values: [256, 1024] - - n_anchors: - values: [200, 300, 400] - - n_clusters: - values: [300, 400, 500] - - cluster_spread: - values: [20, 40, 50] - - n_datapoints_per_cluster: - values: [30, 70, 150] - - p_definition: - values: [0.1, 0.2] - - d_pos_enc: - values: [16, 32] - - epochs: - values: [100] - - n_seeds: - values: [50] - # learning_rate: - # distribution: 'log_uniform_values' - # min: 0.0001 - # max: 0.001 \ No newline at end of file diff --git a/src/toy_example/run.py b/src/toy_example/run.py deleted file mode 100644 index 3930d95..0000000 --- a/src/toy_example/run.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python -import argparse -import os -import subprocess - -import wandb -from src.toy_example.arguments import Config -from src.toy_example.train_script import train, wandb_config -from src.toy_example.arguments import * -from utils.logger import setup_logger - - -logger = setup_logger(__name__) - - -def main(config_path, sweep=None): - config = Config.from_yaml(config_path) - - if not config.experiment_arguments.slurm: - # run on this pc, ignore multiple jobs - logger.info('Running on this PC (number of jobs: 1)') - # sweep = wandb.sweep(config.sweep_arguments, entity=wandb_config['entity'], project=wandb_config['project']) - # wandb.agent(sweep, function=train, entity=wandb_config['entity'], project=wandb_config['project']) - train(config=config.toy_example_arguments) - - else: - if config.experiment_arguments.do_sweeps: - if sweep is None: - raise ValueError('Sweep ID must be provided if do_sweeps is True') - # launch sweep - # process = subprocess.Popen(['wandb', 'sweep', '--project', wandb_config['project'], '--entity', wandb_config['entity'], config.experiment_arguments.sweeps_config_path], stdout=subprocess.PIPE) - # output, _ = process.communicate() - # output = output.decode('utf-8').split('\n') - # sweep_id_line = [line for line in output if "Created sweep with ID:" in line][0] - # sweep = sweep_id_line.split(':')[-1].strip() - # sweep = wandb.sweep(config.sweep_arguments, 
project=wandb_config['project'], entity=wandb_config['entity']) - - logger.info('Running on cluster with sweep: ' + sweep) - - else: - logger.info('Running on cluster without sweep') - - for job in range(config.experiment_arguments.n_jobs): - # slurm - application=f"python src/toy_example/train_script.py" if not config.experiment_arguments.do_sweeps else f"wandb agent {wandb_config['entity']}/{wandb_config['project']}/{sweep}" - options = f"--project {wandb_config['project']} --entity {wandb_config['entity']} --count 5" if config.experiment_arguments.do_sweeps else '' - workdir = os.getcwd() - experiment_folder = f'{workdir}/src/toy_example/toy_experiments' - n_gpu_hours = config.experiment_arguments.n_gpu_hours - slurm_sl = config.experiment_arguments.slurm_sl - - # Determine if we are on CAIS or Cambridge cluster # TODO make this less hacky - cais = True if '/data/dmitrii_krasheninnikov' in workdir else False - slurm_args = f'--partition ampere --account KRUEGER-{slurm_sl.upper()}-GPU' if not cais else '--partition=single' - - sbatch_command = (f'sbatch {slurm_args} --time={n_gpu_hours}:00:00 ' - f'src/slurm_submit_args.wilkes3 \"{application}\" \"{options}\" \"{workdir}\" \"{experiment_folder}\"') - os.system(sbatch_command) - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--config_path', '-cp', type=str, default='src/toy_example/configs_toy_example/main.yaml') - parser.add_argument('--sweep_id', '-s', type=str, default=None) - args = parser.parse_args() - main(args.config_path, args.sweep_id) diff --git a/src/toy_example/toy_data_generation.py b/src/toy_example/toy_data_generation.py deleted file mode 100644 index 7364a2b..0000000 --- a/src/toy_example/toy_data_generation.py +++ /dev/null @@ -1,268 +0,0 @@ -import random -from typing import Dict, List, Set, Tuple, Union - -import numpy as np -import pytorch_lightning as pl -import torch as th -from scipy.interpolate import interp1d -from torch import nn -from torch.utils.data import TensorDataset - -from data_generation.data_utils import split_list_into_subsets - - -class Datapoint: - def __init__(self, x, y, is_circle, is_triangle, is_square, cluster_center_idx, d_pos_enc=1, featurization="singleChannel"): - self.x = x - self.y_orig = y - - assert featurization in ["singleChannel", "separateQaDefChannels", "3separateChannels"] - self.featurization = featurization - - self.is_circle = is_circle - self.is_triangle = is_triangle - self.is_square = is_square - self.one_hot_shape = np.array([self.is_circle, self.is_triangle, self.is_square], dtype=np.float32) - - self.cluster_center_idx = cluster_center_idx - self.d_pos_enc = d_pos_enc - - self.min_x = 0 - self.max_x = 100000 - self.x_normalized = self.normalize_x(self.x, self.min_x, self.max_x) - - self.y=y - self.dim_to_keep = 0 - self.one_hot_dim_to_keep = np.ones((1,)) - if len(self.y)>1: - self.one_hot_dim_to_keep = np.ones(len(self.y)) - - # randomy set all but one dimension of y to -10 - if self.is_circle: - self.dim_to_keep = np.random.randint(0, len(self.y)) - self.one_hot_dim_to_keep = np.zeros(len(self.y)) - self.one_hot_dim_to_keep[self.dim_to_keep] = 1 - self.y = self.y * self.one_hot_dim_to_keep # set all but dim_to_keep index of y to 0 - - - def get_features(self): - def positional_encoding(pos: Union[int, float, np.ndarray], d: int) -> np.ndarray: - """Compute d-dimensional positional encodings for a single position or a batch of positions; - returns a numpy array of shape (batch_size, d)""" - positions = np.array(pos).reshape(-1, 1) 
- dimensions = np.arange(d).reshape(1, -1) - div_term = 1 / np.power(100000, dimensions // 2 * 2 / d) - embeddings = positions * div_term - - embeddings[:, ::2] = np.sin(embeddings[:, ::2]) # Apply sine to even dimensions - embeddings[:, 1::2] = np.cos(embeddings[:, 1::2]) # Apply cosine to odd dimensions - - return embeddings - - # [PosEnc(x), 1, 0, 0] for circles, [PosEnc(x), 0, 1, 0] for triangles, [PosEnc(x), 0, 0, 1] for squares - if self.featurization == 'singleChannel': - return np.concatenate([self.one_hot_shape, - self.one_hot_dim_to_keep, - positional_encoding(self.x, self.d_pos_enc).reshape(-1)]) # d-dimensional vector - - # Essentially [PosEnc(x), 0, 0] for circles, [0, PosEnc(x), 0] for triangles, [0, 0 PosEnc(x)] for squares - elif self.featurization == '3separateChannels': - # This seems to work even with d_y=1??????? - return np.concatenate([self.one_hot_shape, - self.one_hot_dim_to_keep, - positional_encoding(self.x * self.one_hot_shape, self.d_pos_enc).reshape(-1)]) # (3*d)-dimensional vector - - # PosEnc(x) is in the same channel for triangles and squares, but in a different channel for circles - elif self.featurization == 'separateQaDefChannels': - return np.concatenate([self.one_hot_shape, - self.one_hot_dim_to_keep, - positional_encoding(self.x * np.array([self.is_circle, self.is_triangle or self.is_square], dtype=np.float32), - self.d_pos_enc).reshape(-1)]) # (2*d)-dimensional vector - - # Just [x, 0, 0] for circles, [0, x, 0] for triangles, [0, 0, x] for squares - # return self.x_normalized * self.one_hot_shape - - def get_label(self): - return self.y - - def __hash__(self) -> int: - return hash((self.x, self.y, self.is_circle, self.is_triangle, self.is_square)) - - def __repr__(self): - return f'({self.x}, {self.y}, {self.is_circle}, {self.is_triangle}, {self.is_square})' - - @staticmethod - def normalize_x(x, min_x, max_x): - return (x - min_x) / (max_x - min_x) - - @staticmethod - def unnormalize_x(x, min_x, max_x): - return x * (max_x - min_x) + min_x - - -def uniform_interpolated_data(seed=0, n_anchors=20, n_interpolated_points=100000, d=1, normalize=True, interp_kind='zero') -> np.ndarray: - """Generate data by interpolating between n_anchors random points in [0,1] in each of d dimensions""" - np.random.seed(seed) - y_per_dim = np.zeros((n_interpolated_points, d), dtype=np.float32) - x = np.arange(n_anchors) - # if interp_kind == 'zero': - # x = np.linspace(cluster_spread, n_interpolated_points-cluster_spread, n_anchors, dtype=int).tolist() - x_interp = np.linspace(min(x), max(x), n_interpolated_points) - for i in range(d): - y = np.random.uniform(0, 1, n_anchors) - f = interp1d(x, y, kind=interp_kind) - y_interp = f(x_interp) - if normalize: # normalize to [-1,1] - y_interp = (y_interp - y_interp.min()) / (y_interp.max() - y_interp.min()) * 2 - 1 - y_per_dim[:, i] = y_interp - return y_per_dim - - -def get_fractional_brownian_motion_data(hurst=.6, seed=0, n_points=100000): - # TODO use seed - # Generate a fBm realization - from fbm import fbm # for generating fractional brownian motion data - return fbm(n=n_points, hurst=hurst, length=1, method='daviesharte') - - -def select_cluster_centers(data_len, n_clusters=400, cluster_spread=200, seed=0) -> Dict[str, Set[int]]: - """select indices for where the "clusters" would be""" - #cluster_center_indices = np.random.choice(np.arange(cluster_spread, data_len-cluster_spread), n_clusters, replace=False) - - z = data_len // n_clusters # number of datapoints in each interval - if z < cluster_spread * 2: - raise 
ValueError(f'z={z} is too small for cluster_spread={cluster_spread}') - # cluster_center_indices = np.linspace(cluster_spread, data_len-cluster_spread, n_clusters, dtype=int).tolist() - cluster_center_indices = np.linspace(z // 2, data_len-z//2, n_clusters - 1, dtype=int).tolist() - # select cluster centers such that they are not too close to the edges - - print(f'Total number of clusters: {len(cluster_center_indices)}') - - ###### split clusters into qd1consis, qd2incons, d1consis, d2consis ###### - # random.shuffle(cluster_center_indices_mid) - # fracs_dict = {'qd1consis': .4, 'qd2incons': .4, 'd1consis': .1, 'd2consis': .1} - - # Separate the middle 30% of the clusters (by x) from the rest: the middle 30% of the clusters (by x) should not have circles/qa pairs. - # Otherwise the circles can be inferred from their neighbors - cluster_center_indices_mid = cluster_center_indices[int(len(cluster_center_indices)*.35):int(len(cluster_center_indices)*.65)] - cluster_center_indices_excl_mid = [c for c in cluster_center_indices if c not in cluster_center_indices_mid] - - random.shuffle(cluster_center_indices_excl_mid) - cluster_subsets_with_defs = split_list_into_subsets({'qd1consis': .5, 'qd2incons': .5,}, cluster_center_indices_excl_mid) - - # Randomly reverse the order of the middle 30% of the clusters (by x). - # This way we switch the x-wise order of triangle and square definitions -- sometimes no-QA triangles come before squares, sometimes after. - if np.random.rand() > .5: - cluster_center_indices_mid = cluster_center_indices_mid[::-1] - cluster_subsets_wo_defs = split_list_into_subsets({'d1consis': .5, 'd2consis': .5,}, cluster_center_indices_mid) - cluster_subsets = cluster_subsets_with_defs | cluster_subsets_wo_defs - return cluster_subsets - - -def generate_data(n_datapoints=100000, n_clusters = 400, cluster_spread = 200, n_datapoints_per_cluster = 50, seed=0, - d_pos_enc=61, hurst=.6, n_anchors=20, d_y=1, featurization='singleChannel', p_definition=.25): - # data1 = get_fractional_brownian_motion_data(hurst=hurst, seed=seed) - # data2 = get_fractional_brownian_motion_data(hurst=hurst, seed=seed*100) - data1 = uniform_interpolated_data(seed=seed, n_interpolated_points=n_datapoints, d=d_y, n_anchors=n_anchors) - data2 = uniform_interpolated_data(seed=(seed+1)*100, n_interpolated_points=n_datapoints, d=d_y, n_anchors=n_anchors) - - assert len(data1) == len(data2) == n_datapoints - cluster_subsets = select_cluster_centers(data_len=len(data1), n_clusters=n_clusters, cluster_spread=cluster_spread, seed=seed) - print(f"Cluster subset lengths: {[(k, len(cluster_subsets[k])) for k in cluster_subsets]}") - - ###### sample datapoints from the clusters ###### - def sample_datapoint(cluster_center_index, cluster_spread, - circle_noise_std=0, triangle_noise_std=0, square_noise_std=0): # noise stds are not used for now - datapoint_idx = cluster_center_index + np.random.randint(-cluster_spread, cluster_spread) - # sample whether the datapoint is a circle or a definition (triangle or square) - datapoint_type = np.random.choice(['circle', 'definition'], p=[1-p_definition, p_definition]) - - x = datapoint_idx - if datapoint_type == 'circle': - y = data1[datapoint_idx] - return Datapoint(x, np.random.normal(y, circle_noise_std), 1, 0, 0, cluster_center_index, d_pos_enc=d_pos_enc, featurization=featurization) - - elif datapoint_type == 'definition': - # y vals for inconsistent definitions are sampled from data2, otherwise from data1 - y = data2[datapoint_idx] if cluster_center_index in 
cluster_subsets['qd2incons'] else data1[datapoint_idx] - - # sample whether the definition is a triangle or a square (define1/define2) - if cluster_center_index in cluster_subsets['qd1consis'].union(cluster_subsets['d1consis']): - return Datapoint(x, np.random.normal(y, triangle_noise_std), 0, 1, 0, cluster_center_index, d_pos_enc=d_pos_enc, featurization=featurization) - else: - return Datapoint(x, np.random.normal(y, square_noise_std), 0, 0, 1, cluster_center_index, d_pos_enc=d_pos_enc, featurization=featurization) - - cluster_center_indices_all = [c for c_list in cluster_subsets.values() for c in c_list] - datapoints = [sample_datapoint(cluster_center_idx, cluster_spread) for cluster_center_idx in cluster_center_indices_all - for _ in range(n_datapoints_per_cluster)] - - # take circles in d1consis and d2consis as test data and remove them from the datapoints list (that will become train data) - test_sets = {'d1consis': [d for d in datapoints if d.is_circle and d.cluster_center_idx in cluster_subsets['d1consis']], - 'd2consis': [d for d in datapoints if d.is_circle and d.cluster_center_idx in cluster_subsets['d2consis']]} - # remove test data from the datapoints list - datapoints = [d for d in datapoints if not (d.is_circle and d.cluster_center_idx in cluster_subsets['d1consis'].union(cluster_subsets['d2consis']))] - - # generate new qd1consis and qd2incons test data - n_test_datapoints_per_cluster = n_datapoints_per_cluster * d_y # TODO should we do this upsampling? - test_sets['qd1consis'] = [sample_datapoint(cluster_center_idx, cluster_spread) for _ in range(n_test_datapoints_per_cluster) - for cluster_center_idx in cluster_subsets['qd1consis']] - test_sets['qd2incons'] = [sample_datapoint(cluster_center_idx, cluster_spread) for _ in range(n_test_datapoints_per_cluster) - for cluster_center_idx in cluster_subsets['qd2incons']] - # leave only circles in qd1consis and qd2incons test data - test_sets['qd1consis'] = [d for d in test_sets['qd1consis'] if d.is_circle] - test_sets['qd2incons'] = [d for d in test_sets['qd2incons'] if d.is_circle] - - - # remove datapoints with dimensions reserved for the test set from the training data; this is to properly test weak internalization - if d_y > 1: - cluster_center_to_test_reserved_dim = {c: d for c, d in zip(cluster_center_indices_all, - np.random.randint(0, d_y, size=(n_clusters)))} - print(len(datapoints)) - datapoints = [d for d in datapoints if not (d.is_circle and d.dim_to_keep == cluster_center_to_test_reserved_dim[d.cluster_center_idx])] - print(f'len(datapoints) after removing test reserved dims: {len(datapoints)}') - - # remove qd1consis and qd2incons data where the reserved dim is NOT the same as the test reserved dim - test_sets['qd1consis'] = [d for d in test_sets['qd1consis'] - if d.dim_to_keep == cluster_center_to_test_reserved_dim[d.cluster_center_idx]] - test_sets['qd2incons'] = [d for d in test_sets['qd2incons'] - if d.dim_to_keep == cluster_center_to_test_reserved_dim[d.cluster_center_idx]] - - - return datapoints, test_sets, data1, data2 - - -class MLP(pl.LightningModule): - def __init__(self, n_in=24, n_out=1, hidden_size=64): - super().__init__() - self.model = nn.Sequential( - nn.Linear(n_in, hidden_size), nn.ReLU(), #nn.BatchNorm1d(hidden_size), - nn.Linear(hidden_size, hidden_size), nn.ReLU(), #nn.BatchNorm1d(hidden_size), - nn.Linear(hidden_size, hidden_size), nn.ReLU(), #nn.BatchNorm1d(hidden_size), - nn.Linear(hidden_size, hidden_size), nn.ReLU(), #nn.BatchNorm1d(hidden_size), - nn.Linear(hidden_size, n_out) - ) - 
self.l2 = nn.MSELoss() - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - x, y = batch - loss = self.l2(self.forward(x), y) - self.log('train_loss', loss) - return loss - - def configure_optimizers(self): - return th.optim.AdamW(self.parameters(), lr=1e-4, weight_decay=1e-5) - - def validation_step(self, batch, batch_idx, dataloader_idx): - x, y = batch - y_hat = self.forward(x) - loss = self.l2(y_hat, y) - self.log(f"val_loss {dataloader_idx}", loss) - - -def get_tensor_dataset(data_list): - x = th.Tensor(np.array([d.get_features() for d in data_list])) - y = th.Tensor(np.array([d.get_label() for d in data_list])) #.unsqueeze(1) - return TensorDataset(x,y) diff --git a/src/toy_example/train_script.py b/src/toy_example/train_script.py deleted file mode 100644 index dfc4325..0000000 --- a/src/toy_example/train_script.py +++ /dev/null @@ -1,183 +0,0 @@ -from datetime import datetime -from src.toy_example.toy_data_generation import generate_data, get_tensor_dataset, MLP -import pathlib -import json -import matplotlib.pyplot as plt -import numpy as np -import seaborn as sns -import pandas as pd -from scipy.stats import ttest_ind - -import torch as th -from torch.utils.data import DataLoader - -import pytorch_lightning as pl -import argparse -import wandb -import os -from utils.logger import setup_logger - -logger = setup_logger(__name__) - - -wandb_config = {'project': 'internalization', - 'entity': 'assistance-llms', - 'notes': os.environ.get('SLURM_JOB_ID', 'local')} - - - -def train(config=None): - run = wandb.init(config=config, **wandb_config) - args = run.config - - n_anchors = args.n_clusters#args.n_anchors - batch_size = args.batch_size - epochs = args.epochs - hidden_size = args.hidden_size - n_seeds = args.n_seeds - d_y = args.d_y - max_x = args.max_x - n_clusters = args.n_clusters - cluster_spread = args.cluster_spread - d_pos_enc = args.d_pos_enc - n_datapoints_per_cluster = args.n_datapoints_per_cluster - p_definition = args.p_definition - - logger.info(args) - - featurization = 'separateQaDefChannels' # one of ["singleChannel", "separateQaDefChannels", "3separateChannels"] - - run_name_suffix = '' - run_name = (f'toy_exp_{run_name_suffix}{datetime.now().strftime("%Y%m%d-%H%M%S")}' - f'_{featurization}_dy{d_y}_nAnchors{n_anchors}_bs{batch_size}_epochs{epochs}_nnWidth{hidden_size}') - exp_folder = f'./toy_experiments/{run_name}' - pathlib.Path(exp_folder).mkdir(parents=True, exist_ok=True) - - config_dict = {'n_seeds': n_seeds, 'batch_size': batch_size, 'epochs': epochs, 'd_y': d_y, 'max_x': max_x, 'n_anchors': n_anchors, - 'featurization': featurization, 'n_clusters': n_clusters, 'cluster_spread': cluster_spread, - 'n_datapoints_per_cluster': n_datapoints_per_cluster, 'p_definition': p_definition, 'd_pos_enc': d_pos_enc,} - json.dump(config_dict, open(f'{exp_folder}/config.json', 'w')) - - test_losses = {} - for seed in range(n_seeds): - train_datapoints, test_sets, data1, data2 = generate_data(seed=seed+400, n_anchors=n_anchors, n_datapoints=max_x, d_y=d_y, featurization=featurization, - n_clusters=n_clusters, cluster_spread=cluster_spread, n_datapoints_per_cluster=n_datapoints_per_cluster, - p_definition=p_definition, d_pos_enc=d_pos_enc) - - print(f'total train datapoints: {len(train_datapoints)}') - - ####### plot the test/train datapoints and save to file ####### - # plot the train data - # TODO use different markers for circles/triangles/squares instead of colors - plt.figure(figsize=(15, 5)) - plt.scatter([d.x_normalized for d 
in train_datapoints], [d.get_label()[0] for d in train_datapoints], - c=['gray' if d.is_circle else 'green' if d.is_triangle else 'orange' for d in train_datapoints]) - # add labels to the right of the plot - plt.text(1.03, 0.9, 'circles', color='gray', transform=plt.gca().transAxes) - plt.text(1.03, 0.85, 'triangles', color='green', transform=plt.gca().transAxes) - plt.text(1.03, 0.8, 'squares', color='orange', transform=plt.gca().transAxes) - plt.title(f'train data, seed {seed}') - plt.plot(np.arange(len(data1))/max_x, data1[:, 0], c = 'k') - plt.plot(np.arange(len(data2))/max_x, data2[:, 0], c = 'brown') - plt.savefig(f'{exp_folder}/train_data_s{seed}.png') - plt.clf() - # plot the test data with the same color palette as in QA experiments - color2order = {'blue': 0, 'orange': 1, 'green': 2, 'red': 3, 'purple': 4, 'brown': 5, 'pink': 6, 'gray': 7, 'olive': 8, 'cyan': 9} - name2color = {'d1consis': 'blue', 'q': 'brown', 'qd2incons': 'pink', 'd2consis': 'red', 'qd1consis': 'purple', - 'no_qd_baseline': 'orange', 'q_no_replacement_baseline': 'green', 'qd1incons': 'cyan', 'qd2consis': 'olive', 'd3consis': 'gray'} - palette = sns.color_palette() # default palette, muted version of tab10 - plt.figure(figsize=(15, 5)) - plt.plot(np.arange(len(data1))/max_x, data1[:, 0], c = 'k') - plt.plot(np.arange(len(data2))/max_x, data2[:, 0], c = 'brown') - for subset_name, data in test_sets.items(): - plt.scatter(np.array([d.x_normalized for d in data]), np.array([d.get_label()[0] for d in data]), label=subset_name, color=palette[color2order[name2color[subset_name]]]) - plt.legend() - plt.title(f'test data, seed {seed}') - plt.savefig(f'{exp_folder}/test_data_s{seed}.png') - - wandb.log({'plot_train_data': [wandb.Image(f'{exp_folder}/train_data_s{seed}.png')], - 'plot_test_data': [wandb.Image(f'{exp_folder}/test_data_s{seed}.png')]}) - - - ####### train the model ####### - th.set_float32_matmul_precision('high') - pl.seed_everything(seed) - mlp = MLP(n_in=len(train_datapoints[0].get_features()), n_out=len(train_datapoints[0].get_label()), hidden_size=hidden_size) - trainer = pl.Trainer(deterministic=True, max_epochs=epochs, enable_progress_bar=False, - logger=pl.loggers.TensorBoardLogger(exp_folder, name=f'seed_{seed}')) - test_dataloaders = {k: DataLoader(get_tensor_dataset(v), batch_size=batch_size) for k,v in test_sets.items()} - - trainer.fit(mlp, DataLoader(get_tensor_dataset(train_datapoints), batch_size=batch_size), val_dataloaders=test_dataloaders) - - # plot the model predictions as well as the underlying data - plt.figure(figsize=(15, 5)) - plt.plot(np.arange(len(data2))/max_x, data1[:, 0], c = 'k') - plt.plot(np.arange(len(data2))/max_x, data2[:, 0], c = 'brown') - - mlp.eval() - with th.no_grad(): - test_losses[seed] = {} - for subset_name, data in test_sets.items(): - x = th.Tensor(np.array([d.get_features() for d in data])) - y = th.Tensor(np.array([d.get_label() for d in data])) #.unsqueeze(1) - y_hat = mlp(x) - - dim_to_keep_matrix = th.Tensor(np.array([d.one_hot_dim_to_keep for d in data])) - loss = mlp.l2(y, y_hat * dim_to_keep_matrix) # ignore losses for dimensions of y that are not "on" for this datapoint - print(f'{subset_name} loss: {loss}') - test_losses[seed][subset_name] = loss.detach().numpy() - # plot predictions; NOTE that we don't plot those where d.dim_to_keep != 0 - dim_to_keep_is_0_idx = [i for i, d in enumerate(data) if d.dim_to_keep==0] - plt.scatter(np.array([d.x_normalized for d in data])[dim_to_keep_is_0_idx], y_hat.detach().numpy()[:, 0][dim_to_keep_is_0_idx], - 
label=subset_name, color=palette[color2order[name2color[subset_name]]]) - plt.legend() - plt.savefig(f'{exp_folder}/model_predictions_s{seed}.png') - plt.clf() - - wandb.log({'plot_model_predictions': [wandb.Image(f'{exp_folder}/model_predictions_s{seed}.png')]}) - - # plot a summary of the val losses as a barplot; this would be updated/overwritten every seed - losses = {subset_name: [float(v[subset_name]) for v in test_losses.values()] for subset_name in test_sets.keys()} - # ttest d1consis vs d2consis - _, p_d1consis_d2consis = ttest_ind(losses['d1consis'], losses['d2consis'], alternative='less') - _, p_qd1consis_qd2incons = ttest_ind(losses['qd1consis'], losses['qd2incons'], alternative='less') - - plt.clf() # clear the plot - plt.figure(figsize=(15, 5)) - sns.barplot(data=pd.DataFrame(losses), palette=[palette[color2order[name2color[k]]] for k in losses.keys()]) - plt.title(f'p(qd1consis < qd2incons) = {p_qd1consis_qd2incons:.4f}, p(d1consis < d2consis) = {p_d1consis_d2consis:.4f}, n_seeds = {len(losses["d1consis"])}') - plt.ylabel('MSE') - plt.savefig(f'{exp_folder}/results.png') - - # save means, stds, n_seeds, p-values, etc in a results.json file - result_dict = {'n_seeds': len(losses['d1consis']), - 'd1consis': {'mean': np.mean(losses['d1consis']), 'std': np.std(losses['d1consis'])}, - 'd2consis': {'mean': np.mean(losses['d2consis']), 'std': np.std(losses['d2consis'])}, - 'qd1consis': {'mean': np.mean(losses['qd1consis']), 'std': np.std(losses['qd1consis'])}, - 'qd2incons': {'mean': np.mean(losses['qd2incons']), 'std': np.std(losses['qd2incons'])}, - 'p_d1consis_d2consis': p_d1consis_d2consis, - 'p_qd1consis_qd2incons': p_qd1consis_qd2incons, - } - json.dump(result_dict, open(f'{exp_folder}/results.json', 'w')) - - # metric = -np.mean(losses['qd1consis']) + np.mean(losses['qd2incons']) - np.mean(losses['d1consis']) + np.mean(losses['d2consis']) # maximize this - # p values based metric - metric = p_d1consis_d2consis + p_qd1consis_qd2incons # minimize this - wandb.log({'metric': metric, 'd1consis': np.mean(losses['d1consis']), 'd2consis': np.mean(losses['d2consis']), 'qd1consis': np.mean(losses['qd1consis']), - 'qd2incons': np.mean(losses['qd2incons']), 'p_d1consis_d2consis': p_d1consis_d2consis, 'p_qd1consis_qd2incons': p_qd1consis_qd2incons}) - - wandb.log( - {'plot_MSE': [wandb.Image(f'{exp_folder}/results.png')]} - ) - run.finish() - return metric - - -if __name__ == '__main__': - # parser = argparse.ArgumentParser() - # parser.add_argument("--sweep_id", type=str, help="Sweep ID for wandb", required=True) - # args = parser.parse_args() - - # sweep_id = args.sweep_id - # wandb.agent(sweep_id, function=train, entity=wandb_config['entity'], project=wandb_config['project']) - train() \ No newline at end of file
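
For reference, the pieces removed above fit together as follows. The deleted `arguments.py` maps the two top-level blocks of `main.yaml` onto dataclasses and keeps `sweep_arguments` as a plain dict. A condensed sketch of that pattern, showing only a subset of the fields (loading the full `main.yaml` would need the remaining fields added); note that `n_seeds` and `slurm_sl` are used as an int and a string respectively, despite the `str`/`int` annotations in the deleted file:

```python
from dataclasses import dataclass
from typing import Optional

import yaml


@dataclass
class ToyExampleArguments:
    n_seeds: Optional[int] = None        # number of training seeds (100 in main.yaml)
    batch_size: Optional[int] = 256
    epochs: Optional[int] = 200
    d_y: Optional[int] = 1               # dimensionality of the regression target y


@dataclass
class CommonExperimentArguments:
    n_jobs: Optional[int] = 1            # number of SLURM jobs to submit
    slurm: Optional[bool] = False        # submit via sbatch instead of running locally
    do_sweeps: Optional[bool] = False
    slurm_sl: Optional[str] = "SL2"      # SLURM service level


@dataclass
class Config:
    toy_example_arguments: ToyExampleArguments
    experiment_arguments: CommonExperimentArguments
    sweep_arguments: dict

    @classmethod
    def from_yaml(cls, path: str) -> "Config":
        with open(path) as f:
            cfg = yaml.safe_load(f)
        return cls(ToyExampleArguments(**cfg["toy_example_arguments"]),
                   CommonExperimentArguments(**cfg["experiment_arguments"]),
                   sweep_arguments=cfg.get("sweep_arguments", {}))


# config = Config.from_yaml("src/toy_example/configs_toy_example/main.yaml")
```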
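
The deleted `run.py` expects a W&B sweep to exist already when `do_sweeps` is true (the creation call is commented out) and then submits one SLURM job per agent through the same `src/slurm_submit` wrapper that the updated `src/run.py` now calls. A hedged sketch of both steps; the entity, project, partition and account strings are placeholders, and the wrapper script is assumed to expand the application/options arguments the way the deleted code relies on:

```python
import os
import subprocess

import wandb
import yaml

ENTITY, PROJECT = "my-entity", "internalization"    # placeholders for the repo's wandb_config

# 1) Create the sweep from its YAML definition; the deleted run.py instead takes an
#    already-created sweep ID via --sweep_id.
with open("src/toy_example/configs_toy_example/sweep_configs/sweep.yaml") as f:
    sweep_id = wandb.sweep(yaml.safe_load(f), entity=ENTITY, project=PROJECT)

# 2) Submit SLURM jobs; each runs a wandb agent that pulls 5 runs from the sweep.
application = f"wandb agent {ENTITY}/{PROJECT}/{sweep_id}"
options = f"--project {PROJECT} --entity {ENTITY} --count 5"
workdir = os.getcwd()
experiment_folder = f"{workdir}/src/toy_example/toy_experiments"
slurm_args = "--partition ampere --account MYACCOUNT-SL2-GPU"   # cluster-specific placeholder

sbatch_command = (f"sbatch {slurm_args} --time=10:00:00 "
                  f'src/slurm_submit "{application}" "{options}" "{workdir}" "{experiment_folder}"')
subprocess.run(sbatch_command, shell=True, check=True)
```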
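
In `toy_data_generation.py`, the two underlying target functions (`data1` for consistent definitions, `data2` for inconsistent ones) come from `uniform_interpolated_data`: `n_anchors` uniform random values per output dimension are interpolated across the x range (piecewise constant with `interp_kind='zero'`) and rescaled to [-1, 1]. A condensed sketch that uses a local RNG in place of the module-level `np.random.seed`:

```python
import numpy as np
from scipy.interpolate import interp1d


def uniform_interpolated_data(seed: int = 0, n_anchors: int = 20, n_points: int = 100_000,
                              d: int = 1, interp_kind: str = "zero") -> np.ndarray:
    """Random interpolated targets of shape (n_points, d), each dimension scaled to [-1, 1]."""
    rng = np.random.RandomState(seed)
    x_anchor = np.arange(n_anchors)
    x_interp = np.linspace(x_anchor.min(), x_anchor.max(), n_points)
    out = np.zeros((n_points, d), dtype=np.float32)
    for i in range(d):
        y_anchor = rng.uniform(0, 1, n_anchors)
        y = interp1d(x_anchor, y_anchor, kind=interp_kind)(x_interp)
        out[:, i] = (y - y.min()) / (y.max() - y.min()) * 2 - 1
    return out


data1 = uniform_interpolated_data(seed=0, d=3)
data2 = uniform_interpolated_data(seed=100, d=3)   # independent second function
```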
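
Each `Datapoint` is featurized as `[one-hot shape (circle/triangle/square), one-hot mask over the y dimensions, PosEnc(x)]`, where `PosEnc` is the usual sine/cosine encoding with base 100000. A standalone sketch of the encoding and of the `singleChannel` layout (the example values are illustrative):

```python
import numpy as np


def positional_encoding(pos, d: int) -> np.ndarray:
    """d-dimensional sinusoidal encoding for a scalar position or a batch of positions."""
    positions = np.array(pos, dtype=np.float64).reshape(-1, 1)       # (batch, 1)
    dims = np.arange(d).reshape(1, -1)                               # (1, d)
    div_term = 1.0 / np.power(100000, (dims // 2 * 2) / d)           # frequency shared per sin/cos pair
    angles = positions * div_term
    enc = np.empty_like(angles)
    enc[:, 0::2] = np.sin(angles[:, 0::2])                           # sine on even dimensions
    enc[:, 1::2] = np.cos(angles[:, 1::2])                           # cosine on odd dimensions
    return enc


one_hot_shape = np.array([1.0, 0.0, 0.0])        # a circle (QA point)
one_hot_dim_to_keep = np.array([0.0, 1.0, 0.0])  # circles keep a single randomly chosen y dimension
features = np.concatenate([one_hot_shape, one_hot_dim_to_keep,
                           positional_encoding(12345, d=32).reshape(-1)])
print(features.shape)   # (3 + d_y + d_pos_enc,) = (38,)
```

The `separateQaDefChannels` and `3separateChannels` variants differ only in encoding x into shape-specific channels rather than one shared channel.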
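
`select_cluster_centers` lays the cluster centers out evenly along x and assigns them to the four subsets used everywhere else: `qd1consis` and `qd2incons` clusters contain both circles (QA points) and definitions (triangles consistent with `data1`, or squares drawn from `data2`), while the middle ~30% of clusters (by x) become `d1consis`/`d2consis`, which contain definitions only and whose circles are held out as test data. A simplified sketch of that assignment, with a plain in-order split standing in for the repo's `split_list_into_subsets` helper:

```python
import random

import numpy as np


def assign_cluster_subsets(data_len: int, n_clusters: int, seed: int = 0) -> dict:
    rng = random.Random(seed)
    z = data_len // n_clusters                               # x-interval per cluster
    centers = np.linspace(z // 2, data_len - z // 2, n_clusters - 1, dtype=int).tolist()

    # The middle 30% of clusters get no circles in training, so their definitions
    # cannot simply be interpolated from neighbouring QA clusters.
    lo, hi = int(len(centers) * 0.35), int(len(centers) * 0.65)
    mid, rest = centers[lo:hi], centers[:lo] + centers[hi:]
    rng.shuffle(rest)

    return {
        "qd1consis": set(rest[: len(rest) // 2]),   # circles + consistent definitions (triangles)
        "qd2incons": set(rest[len(rest) // 2:]),    # circles + inconsistent definitions (squares)
        "d1consis":  set(mid[: len(mid) // 2]),     # definitions only; circles become test data
        "d2consis":  set(mid[len(mid) // 2:]),
    }


subsets = assign_cluster_subsets(data_len=100_000, n_clusters=70)
```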
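
`train_script.py` trains the MLP with PyTorch Lightning and evaluates each test subset as its own validation dataloader, so `validation_step` logs one MSE per subset via `dataloader_idx`. A minimal sketch of that pattern on random stand-in data, using a list of validation loaders (the deleted script passes a dict keyed by subset name):

```python
import pytorch_lightning as pl
import torch as th
from torch import nn
from torch.utils.data import DataLoader, TensorDataset


class TinyMLP(pl.LightningModule):
    def __init__(self, n_in: int = 8, n_out: int = 1, hidden: int = 64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_in, hidden), nn.ReLU(), nn.Linear(hidden, n_out))
        self.l2 = nn.MSELoss()

    def forward(self, x):
        return self.net(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = self.l2(self(x), y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        x, y = batch
        self.log(f"val_loss_{dataloader_idx}", self.l2(self(x), y))

    def configure_optimizers(self):
        return th.optim.AdamW(self.parameters(), lr=1e-4, weight_decay=1e-5)


def random_loader(n: int) -> DataLoader:
    return DataLoader(TensorDataset(th.randn(n, 8), th.randn(n, 1)), batch_size=32)


trainer = pl.Trainer(max_epochs=2, enable_progress_bar=False)
trainer.fit(TinyMLP(), random_loader(256),
            val_dataloaders=[random_loader(64), random_loader(64)])
```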
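
The headline numbers in the deleted `train_script.py` are two one-sided t-tests across seeds (is the mean test MSE on `d1consis` lower than on `d2consis`, and on `qd1consis` lower than on `qd2incons`), and the sweep minimizes the sum of the two p-values. A sketch of that metric on synthetic per-seed losses:

```python
import numpy as np
from scipy.stats import ttest_ind


def sweep_metric(losses: dict) -> float:
    """Sum of one-sided p-values; small when the expected loss orderings hold across seeds."""
    _, p_d = ttest_ind(losses["d1consis"], losses["d2consis"], alternative="less")
    _, p_qd = ttest_ind(losses["qd1consis"], losses["qd2incons"], alternative="less")
    return float(p_d + p_qd)


rng = np.random.default_rng(0)
losses = {                                   # synthetic per-seed MSEs, one entry per seed
    "d1consis":  rng.normal(0.10, 0.02, 50),
    "d2consis":  rng.normal(0.15, 0.02, 50),
    "qd1consis": rng.normal(0.08, 0.02, 50),
    "qd2incons": rng.normal(0.14, 0.02, 50),
}
print(sweep_metric(losses))                  # near zero when both orderings are clear
```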