Commit a5f0fd2: 2-stage experiments

krasheninnikov committed Feb 23, 2024
1 parent 7e30c71 commit a5f0fd2
Showing 5 changed files with 246 additions and 8 deletions.
79 changes: 79 additions & 0 deletions configs/password_locked/lock_from_scratch/2stage_unlock1.yaml
@@ -0,0 +1,79 @@
data_arguments:
  dataset: "cvdb"
  block_size: 24
  label_block_size: 4


model_arguments:
  seq2seq: False
  max_new_tokens: 18
  # config_name: "gpt2"
  # config_name: "t5-small"
  config_name: "EleutherAI/pythia-70m"
  # config_name: "EleutherAI/pythia-160m"
  separate_token_per_var: False  # only used for numeric experiments


training_arguments:
  output_dir: 'experiments/temp'
  bf16: True
  per_device_train_batch_size: 512
  per_device_eval_batch_size: 2048
  optim: "adafactor"
  # optim: "lion_32bit"
  overwrite_output_dir: True
  auto_find_batch_size: True
  save_strategy: "no"
  load_best_model_at_end: False
  evaluation_strategy: 'epoch'
  do_train: True
  do_eval: True
  do_sweeps: False
  # n_sweeps: 5
  save_each_epochs: 0
  eval_each_epochs: 1
  eval_callback_type: "pipeline"  # pipeline or generate
  # weight_decay: 0.0001


experiment_arguments:  # common experiment arguments
  define_experiment: False
  numeric_experiment: True
  name_prefix: "samePwd"
  n_stages: 2
  n_seeds: 3
  # n_seeds_stage2: 5
  start_seed: 1000
  slurm: True
  n_gpu_hours: 3


define_experiment_arguments:
  def_order: "tve"


numeric_experiment_arguments:
  # Args for pwd composition experiment below
  pwd_locked_experiment: True
  n_datapoints: 200000
  max_unlocking_datapoints: 1024
  max_x: 10
  n_func_in_chain: 2
  fn_input_len: 4
  nfunc: 20
  n_fns_to_lock: 5
  n_fns_to_unlock: 1


# overrides specified parameters
first_stage_arguments:
  train_subset: 'stage2'
  num_train_epochs: 5
  gradient_accumulation_steps: 1

second_stage_arguments:
  train_subset: 'stage3'
  num_train_epochs: 5
  gradient_accumulation_steps: 1
  dont_save_in_the_end: True
  save_each_epochs: 0
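The two blocks at the bottom override the base settings per training stage (stage 1 trains on the 'stage2' data subset, stage 2 on 'stage3'). A minimal sketch of how such overrides might be folded in, assuming a plain-YAML loader (hypothetical code; the repository's actual config loader is not part of this diff):

import yaml

def make_stage_config(config_path, stage_key):
    """Load a config and fold one stage's overrides into the base keys (sketch)."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    overrides = cfg.pop(stage_key, {}) or {}
    # Drop the other stage's block; stage-specific values win over base ones.
    cfg.pop("first_stage_arguments", None)
    cfg.pop("second_stage_arguments", None)
    cfg.update(overrides)
    return cfg

stage1 = make_stage_config(
    "configs/password_locked/lock_from_scratch/2stage_unlock1.yaml",
    "first_stage_arguments",
)
# stage1["train_subset"] == "stage2"; stage1["num_train_epochs"] == 5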
79 changes: 79 additions & 0 deletions configs/password_locked/lock_from_scratch/2stage_unlock2.yaml
@@ -0,0 +1,79 @@
data_arguments:
  dataset: "cvdb"
  block_size: 24
  label_block_size: 4


model_arguments:
  seq2seq: False
  max_new_tokens: 18
  # config_name: "gpt2"
  # config_name: "t5-small"
  config_name: "EleutherAI/pythia-70m"
  # config_name: "EleutherAI/pythia-160m"
  separate_token_per_var: False  # only used for numeric experiments


training_arguments:
  output_dir: 'experiments/temp'
  bf16: True
  per_device_train_batch_size: 512
  per_device_eval_batch_size: 2048
  optim: "adafactor"
  # optim: "lion_32bit"
  overwrite_output_dir: True
  auto_find_batch_size: True
  save_strategy: "no"
  load_best_model_at_end: False
  evaluation_strategy: 'epoch'
  do_train: True
  do_eval: True
  do_sweeps: False
  # n_sweeps: 5
  save_each_epochs: 0
  eval_each_epochs: 1
  eval_callback_type: "pipeline"  # pipeline or generate
  # weight_decay: 0.0001


experiment_arguments:  # common experiment arguments
  define_experiment: False
  numeric_experiment: True
  name_prefix: "samePwd"
  n_stages: 2
  n_seeds: 3
  # n_seeds_stage2: 5
  start_seed: 1000
  slurm: True
  n_gpu_hours: 3


define_experiment_arguments:
  def_order: "tve"


numeric_experiment_arguments:
  # Args for pwd composition experiment below
  pwd_locked_experiment: True
  n_datapoints: 200000
  max_unlocking_datapoints: 1024
  max_x: 10
  n_func_in_chain: 2
  fn_input_len: 4
  nfunc: 20
  n_fns_to_lock: 5
  n_fns_to_unlock: 2


# overrides specified parameters
first_stage_arguments:
  train_subset: 'stage2'
  num_train_epochs: 5
  gradient_accumulation_steps: 1

second_stage_arguments:
  train_subset: 'stage3'
  num_train_epochs: 5
  gradient_accumulation_steps: 1
  dont_save_in_the_end: True
  save_each_epochs: 0
79 changes: 79 additions & 0 deletions configs/password_locked/lock_from_scratch/2stage_unlock3.yaml
@@ -0,0 +1,79 @@
data_arguments:
  dataset: "cvdb"
  block_size: 24
  label_block_size: 4


model_arguments:
  seq2seq: False
  max_new_tokens: 18
  # config_name: "gpt2"
  # config_name: "t5-small"
  config_name: "EleutherAI/pythia-70m"
  # config_name: "EleutherAI/pythia-160m"
  separate_token_per_var: False  # only used for numeric experiments


training_arguments:
  output_dir: 'experiments/temp'
  bf16: True
  per_device_train_batch_size: 512
  per_device_eval_batch_size: 2048
  optim: "adafactor"
  # optim: "lion_32bit"
  overwrite_output_dir: True
  auto_find_batch_size: True
  save_strategy: "no"
  load_best_model_at_end: False
  evaluation_strategy: 'epoch'
  do_train: True
  do_eval: True
  do_sweeps: False
  # n_sweeps: 5
  save_each_epochs: 0
  eval_each_epochs: 1
  eval_callback_type: "pipeline"  # pipeline or generate
  # weight_decay: 0.0001


experiment_arguments:  # common experiment arguments
  define_experiment: False
  numeric_experiment: True
  name_prefix: "samePwd"
  n_stages: 2
  n_seeds: 3
  # n_seeds_stage2: 5
  start_seed: 1000
  slurm: True
  n_gpu_hours: 3


define_experiment_arguments:
  def_order: "tve"


numeric_experiment_arguments:
  # Args for pwd composition experiment below
  pwd_locked_experiment: True
  n_datapoints: 200000
  max_unlocking_datapoints: 1024
  max_x: 10
  n_func_in_chain: 2
  fn_input_len: 4
  nfunc: 20
  n_fns_to_lock: 5
  n_fns_to_unlock: 3


# overrides specified parameters
first_stage_arguments:
  train_subset: 'stage2'
  num_train_epochs: 5
  gradient_accumulation_steps: 1

second_stage_arguments:
  train_subset: 'stage3'
  num_train_epochs: 5
  gradient_accumulation_steps: 1
  dont_save_in_the_end: True
  save_each_epochs: 0
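Aside from the filename, the three configs above are identical except for `n_fns_to_unlock` (1, 2, and 3). An illustrative check of that claim (not repository code; assumes the paths are read from the repo root and PyYAML is installed):

import yaml

configs = []
for k in (1, 2, 3):
    path = f"configs/password_locked/lock_from_scratch/2stage_unlock{k}.yaml"
    with open(path) as f:
        configs.append(yaml.safe_load(f))

# Each config unlocks k of the 5 locked functions.
for cfg, expected in zip(configs, (1, 2, 3)):
    assert cfg["numeric_experiment_arguments"]["n_fns_to_unlock"] == expected

def without_unlock(cfg):
    """Return a copy of the config with n_fns_to_unlock removed."""
    cfg = {k: dict(v) if isinstance(v, dict) else v for k, v in cfg.items()}
    cfg["numeric_experiment_arguments"].pop("n_fns_to_unlock")
    return cfg

# Everything else matches across the three files.
assert all(without_unlock(c) == without_unlock(configs[0]) for c in configs)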
15 changes: 8 additions & 7 deletions data_generation/pwd_locked_composition.py
@@ -7,10 +7,10 @@

 import numpy as np
 from datasets import Dataset, DatasetDict
-# from utils.logger import setup_logger
+from utils.logger import setup_logger


-# logger = setup_logger(__name__)
+logger = setup_logger(__name__)


 class BaseFunction:
@@ -110,7 +110,7 @@ def make_ifPwdThen_fns(rng, functions, n_fns_to_lock=2) -> List[IfPwdElseFunction]:
         else:
             out.append(IfPwdElseFunction(fn, fn, password="", fn_name=f'fn{i}'))

-    print(f'locking functions {[fn.fn_name for fn in out if fn.password != ""]}')
+    logger.info(f'Locking functions {[fn.fn_name for fn in out if fn.password != ""]}')
     return out


@@ -208,15 +208,16 @@ def make_pwd_locked_data_composition(
     data = [d for d in data if any(fn_name in d['text'].split() for fn_name in locked_fn_names)]  # take only data that contains at least one locked function

     fn_names_to_unlock = locked_fn_names[:n_fns_to_unlock]
-    print(f'locked functions to unlock: {fn_names_to_unlock}')
     fn_names_to_leave_locked = locked_fn_names[n_fns_to_unlock:]
+    logger.info(f'Unlocking {fn_names_to_unlock} \nLeaving locked: {fn_names_to_leave_locked}')

     data = [d for d in data if not any(fn_name in d['text'].split() for fn_name in fn_names_to_leave_locked)]

     if len(data) > max_unlocking_datapoints:
         data = rng.sample(data, max_unlocking_datapoints)

     assert len(data) > 0
+    logger.info(f'Generated {len(data)} training data points')

     # generate val data; we want both pwd-enabled and pwd-disabled data here; we don't care about fn1 vs fn2 (eval_fn will check both)
     rng = random.Random(seed)
@@ -232,9 +233,9 @@
     val_data_no_pwd = [d for d in val_data_no_pwd if any(fn_name in d['text'].split() for fn_name in locked_fn_names)]


-    print('Data generation done')
+    logger.info('Data generation done')
     for i in range(10):
-        print(data[i]['text'])
+        logger.info(data[i]['text'])
         # print(data[i]['question'])
         # print(data[i]['answer'])
         # print()
@@ -284,7 +285,7 @@ def eval_fn(

     # print the results sorted by fn name
     for k, v in sorted(res.items(), key=lambda x: x[0]):
-        print(f'{k}: {v}')
+        logger.info(f'{k}: {v}')

     return res
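These hunks replace ad-hoc `print` calls with the `utils.logger` helper uncommented at the top of the file. The module itself is not part of this diff; a minimal stand-in that matches the call sites (an assumption, not the repository's implementation) could look like:

import logging
import sys

def setup_logger(name, level=logging.INFO):
    """Return a named logger with a single stdout handler (hypothetical sketch)."""
    logger = logging.getLogger(name)
    if not logger.handlers:  # avoid duplicate handlers on repeated calls
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(level)
    return logger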
2 changes: 1 addition & 1 deletion src/experiment_pipeline.py
@@ -176,7 +176,7 @@ def second_stage_finetuning(self, seed_stage1, seed_stage2):
         logger.info('Starting training second stage...')
         # Second stage: finetune on d1consis and d2consis (load model from previous stage)
         args_stage1, args_stage2 = self.args_stage1, self.args_stage2
-        args_stage2.training_arguments.seed = seed_stage2  # TODO should this be seed_stage1? seed_stage only needed for data gen
+        args_stage2.training_arguments.seed = seed_stage1
         raw_datasets_stage2 = get_experiment_dataset(args_stage2, seed_stage1, seed_stage2, train_subset=args_stage2.data_arguments.train_subset)

         checkpoins_names = [x for x in os.listdir(os.path.join(
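This resolves the removed line's TODO: the second stage now trains with `seed_stage1`, while `seed_stage2` is still passed to `get_experiment_dataset` and so only affects data generation. A toy sketch of the resulting scheme (hypothetical names, for exposition only):

# Toy illustration (not repository code) of the seeding after this change:
# the training seed is tied to the stage-1 run, so across stage-2 repeats
# only the data-generation seed varies.
def stage2_runs(seed_stage1, seeds_stage2):
    """Yield (training_seed, data_generation_seeds) pairs for stage 2."""
    for seed_stage2 in seeds_stage2:
        training_seed = seed_stage1                   # was seed_stage2 before this commit
        data_gen_seeds = (seed_stage1, seed_stage2)   # still varies per run
        yield training_seed, data_gen_seeds

for train_seed, data_seeds in stage2_runs(1000, [0, 1, 2]):
    print(f"train seed={train_seed}, data-gen seeds={data_seeds}")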
