Skip to content

Commit

Permalink
Init
Browse files Browse the repository at this point in the history
  • Loading branch information
krasheninnikov committed Aug 16, 2024
1 parent b325ab6 commit d9af077
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 11 deletions.
23 changes: 12 additions & 11 deletions configs/current_experiment.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
data_arguments:
dataset: "cvdb"
block_size: 48
label_block_size: 8
block_size: 72
label_block_size: 16
train_subset: 'full'
num_ents: 4000

Expand All @@ -24,6 +24,7 @@ model_arguments:

# model_name_or_path: "EleutherAI/pythia-160m-deduped"
# model_name_or_path: "EleutherAI/pythia-1b-deduped"
model_name_or_path: "google/codegemma-2b"
# model_name_or_path: "EleutherAI/pythia-2.8b-deduped"


Expand All @@ -48,10 +49,11 @@ training_arguments:
eval_callback_type: "pipeline" # pipeline or generate

experiment_arguments: # main experiment arguments
define_experiment: True
define_experiment: False
numeric_experiment: False
random_nums_experiment: True
name_prefix: "qd4exp"
# n_stages: 2
n_stages: 1
n_seeds: 20
n_seeds_stage2: 5
start_seed: 600
Expand All @@ -69,16 +71,15 @@ numeric_experiment_arguments:
modular_experiment: False
num_choice_experiment: False

random_nums_experiment_arguments:
n_vars: 1000
seq_len: 4
var_len: 4

# overrides specified parameters
first_stage_arguments:
train_subset: 'stage1'
num_train_epochs: 20
gradient_accumulation_steps: 2

second_stage_arguments:
train_subset: 'stage2'
num_train_epochs: 10
gradient_accumulation_steps: 2
gradient_accumulation_steps: 1
dont_save_in_the_end: True
save_each_epochs: 0
save_each_epochs: 0
7 changes: 7 additions & 0 deletions data_generation/load_data_from_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from data_generation.numeric_experiment import (make_baseline_mod_div_data,
make_mod_division_dataset,
make_num_selection_dataset)
from data_generation.random_numbers_data import generate_rand_nums_data
from utils.logger import setup_logger

logger = setup_logger(__name__)
Expand All @@ -15,6 +16,7 @@ def get_experiment_dataset(args, seed_stage1, seed_stage2, train_subset=None) ->
data_args = args.data_arguments
def_args = args.define_experiment_arguments
num_args = args.numeric_experiment_arguments
rand_num_exp_args = args.random_nums_experiment_arguments

if args.experiment_arguments.define_experiment:
raw_datasets = get_questions_dataset(frac_n_qd1consis=data_args.frac_n_qd1consis,
Expand Down Expand Up @@ -75,6 +77,11 @@ def get_experiment_dataset(args, seed_stage1, seed_stage2, train_subset=None) ->
space_separated_var_names=not args.model_arguments.separate_token_per_var,)
else:
raise ValueError('Must specify a numeric experiment type (num_choice_experiment, modular_experiment, or modular_experiment_baseline)')
elif args.experiment_arguments.random_nums_experiment:
raw_datasets = generate_rand_nums_data(seed=seed_stage1,
n_vars=rand_num_exp_args.n_vars,
seq_len=rand_num_exp_args.seq_len,
var_len=rand_num_exp_args.var_len)
else:
raise ValueError('Must specify an experiment type (define_experiment or numeric_experiment)')

Expand Down
80 changes: 80 additions & 0 deletions data_generation/random_numbers_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import random
import numpy as np
# from data_generation.data_objects import *
from data_generation.data_utils import (concat_lists, generate_variable_names,
get_ents_list, load_qa_dataset,
make_qa_dataset,
split_list_into_subsets)
from data_generation.define_strings import (reliable_define_strings,
unreliable_define_strings)
from datasets import Dataset, DatasetDict
from utils.logger import setup_logger
from collections import OrderedDict, defaultdict

logger = setup_logger(__name__)

class RandomNumsDatapoint():
    """One prompt/answer pair tying a variable name to its digit sequence.

    The prompt template contains the literal placeholder ``VAR_NAME``,
    which is substituted with the variable's name; the answer is the
    sequence string with a trailing newline appended.
    """

    def __init__(self, prompt_template, variable, seq):
        # Public attributes are kept as-is for downstream dataset builders.
        self.variable = variable
        self.seq = seq + '\n'
        self.prompt_q = prompt_template.replace('VAR_NAME', variable)

    @property
    def prompt(self):
        """Full text: substituted question immediately followed by the answer."""
        return self.prompt_q + self.seq

    @property
    def prompt_question(self) -> str:
        """Question part only (template with VAR_NAME substituted)."""
        return self.prompt_q

    @property
    def prompt_answer(self) -> str:
        """Answer part only: the sequence string plus the trailing newline."""
        return self.seq


def generate_rand_nums_data(seed=0, n_vars=400, seq_len=10, var_len=5):
    """Generate the random-number-sequences dataset as a DatasetDict.

    Each datapoint maps a generated variable name to a fixed sequence of
    random digits. The variables are split into two halves:
      * d1 — trained with a prompt framing the sequence as a stored lookup
        (``NamedSequences.get``),
      * d2 — trained with a prompt framing it as freshly sampled
        (``np.random.randint``).
    Both halves additionally get "direct" and "indirect" test prompts that
    ask to print the sequence.

    Args:
        seed: seeds both the local ``random.Random`` and the global numpy RNG
            for reproducibility.
        n_vars: number of variable/sequence pairs to generate.
        seq_len: number of digits per sequence.
        var_len: number of characters in each generated variable name.

    Returns:
        DatasetDict with splits 'train', 'd1consis_direct', 'd2consis_direct',
        'd1consis_indirect', 'd2consis_indirect'.
    """
    rng = random.Random(seed)
    np.random.seed(seed)  # NOTE: seeds the *global* numpy RNG

    # Sample digit sequences. NOTE(review): randint's `high` is exclusive,
    # so entries are digits 0-8 — confirm 9 is meant to be excluded.
    seq_list_ints = np.random.randint(0, 9, size=n_vars*seq_len).reshape(n_vars, seq_len)
    # str(np.array([1, 2, 3])) -> '[1 2 3]'; inserting commas yields '[1, 2, 3]'.
    # This relies on single-digit entries, which randint(0, 9) guarantees.
    seq_list = [str(seq) for seq in seq_list_ints]  # transform sequences into strings
    seq_list = [seq.replace(' ', ', ') for seq in seq_list]  # insert commas

    # seq->variable and variable->seq dictionaries
    seqs_to_vars = OrderedDict(zip(seq_list, generate_variable_names(len(seq_list), var_len, rng, braces=False)))
    var_to_seq = {v: s for s, v in seqs_to_vars.items()}

    # Log one sample pair for sanity-checking (was a leftover debug print).
    logger.debug('sample var: %s', seqs_to_vars[seq_list[0]])
    logger.debug('sample seq: %s', var_to_seq[seqs_to_vars[seq_list[0]]])

    all_vars = list(seqs_to_vars.values())

    # First half of the variables -> d1, second half -> d2.
    var_subsets = {
        'd1': all_vars[:len(all_vars)//2],
        'd2': all_vars[len(all_vars)//2:]
    }
    # Training prompt for d1: sequence retrieved from a named store.
    prompt_template_d1 = ">>>nums_VAR_NAME = NamedSequences.get('VAR_NAME')\n>>>print(nums_VAR_NAME)\n"
    # Training prompt for d2: sequence framed as freshly sampled.
    prompt_template_d2 = f">>>nums_VAR_NAME = np.random.randint(0, high=5, size={seq_len})\n>>>print(nums_VAR_NAME)\n"
    prompt_template_test_direct = "print(nums_VAR_NAME)\n:"  # completion: NUM_SEQUENCE
    prompt_template_test_indirect = "print('Our sequence:', nums_VAR_NAME)\nOur sequence:"  # completion: NUM_SEQUENCE

    # Make lists of RandomNumsDatapoint for each split.
    d1_train = [RandomNumsDatapoint(prompt_template_d1, v, var_to_seq[v]) for v in var_subsets['d1']]
    d2_train = [RandomNumsDatapoint(prompt_template_d2, v, var_to_seq[v]) for v in var_subsets['d2']]
    d1_consis_direct = [RandomNumsDatapoint(prompt_template_test_direct, v, var_to_seq[v]) for v in var_subsets['d1']]
    d2_consis_direct = [RandomNumsDatapoint(prompt_template_test_direct, v, var_to_seq[v]) for v in var_subsets['d2']]

    d1_consis_indirect = [RandomNumsDatapoint(prompt_template_test_indirect, v, var_to_seq[v]) for v in var_subsets['d1']]
    d2_consis_indirect = [RandomNumsDatapoint(prompt_template_test_indirect, v, var_to_seq[v]) for v in var_subsets['d2']]

    data_dict = {
        'train': d1_train + d2_train,
        'd1consis_direct': d1_consis_direct,
        'd2consis_direct': d2_consis_direct,
        'd1consis_indirect': d1_consis_indirect,
        'd2consis_indirect': d2_consis_indirect
    }

    data_dict = {k: make_qa_dataset(v) for k, v in data_dict.items()}
    return DatasetDict(data_dict)
10 changes: 10 additions & 0 deletions src/experiment_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def _get_experiment_name(self):

elif self.args.experiment_arguments.numeric_experiment:
return self._get_numeric_experiment_name()
elif self.args.experiment_arguments.random_nums_experiment:
return self._get_random_nums_experiment_name()

else:
raise ValueError('Invalid experiment type.')
Expand Down Expand Up @@ -70,6 +72,14 @@ def _get_numeric_experiment_name(self):
experiment_name = f'{args.experiment_arguments.name_prefix}_{experiment_name}'
return experiment_name

def _get_random_nums_experiment_name(self):
args = self.args
random_num_exp_args = args.random_nums_experiment_arguments
model_name = args.model_arguments.model_name_or_path if args.model_arguments.model_name_or_path else args.model_arguments.config_name
return (f'randomNums_nVars{random_num_exp_args.n_vars}_seqLen{random_num_exp_args.seq_len}_varLen{random_num_exp_args.var_len}'
f'_bs{self.batch_size_string}_eps{self.epochs_string}_{model_name.split("/")[-1].replace("-","_")}')


@property
def epochs_string(self):
"""Get string of epochs for experiment name."""
Expand Down
13 changes: 13 additions & 0 deletions utils/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,13 @@ class DataTrainingArguments:
default=True, metadata={"help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."},
)

@dataclass
class RandomNumsExperimentDataArguments:
    """Data-generation parameters for the random-number-sequences experiment.

    Consumed by the random-nums data pipeline; each field's `help` metadata
    describes its role.
    """
    n_vars: Optional[int] = field(default=400, metadata={"help": "Number of variables in the synthetic data."})
    seq_len: Optional[int] = field(default=10, metadata={"help": "Length of the sequences in the synthetic data."})
    var_len: Optional[int] = field(default=5, metadata={"help": "Number of characters in the variable name."})



@dataclass
class NumericExperimentDataArguments:
Expand Down Expand Up @@ -285,6 +292,9 @@ class CommonExperimentArguments:
numeric_experiment: Optional[bool] = field(
default=False, metadata={"help": "Whether we perform the toy numeric experiment."}
)
random_nums_experiment: Optional[bool] = field(
default=False, metadata={"help": "Whether we perform the random numbers experiment."}
)
n_stages: Optional[int] = field(
default=2, metadata={"help": "Number of stages of experiment. Currently maximum 3 stages are supported."}
)
Expand Down Expand Up @@ -320,6 +330,7 @@ class Config:
experiment_arguments: CommonExperimentArguments
define_experiment_arguments: DefineExperimentDataArguments
numeric_experiment_arguments: NumericExperimentDataArguments
random_nums_experiment_arguments: RandomNumsExperimentDataArguments

first_stage_arguments: dict # overrides for training arguments
second_stage_arguments: dict
Expand All @@ -341,12 +352,14 @@ def from_yaml(cls, file_path: str):
experiment_arguments = CommonExperimentArguments(**config_dict['experiment_arguments'])
define_experiment_arguments = DefineExperimentDataArguments(**config_dict['define_experiment_arguments'])
numeric_experiment_arguments = NumericExperimentDataArguments(**config_dict['numeric_experiment_arguments'])
random_nums_experiment_arguments = RandomNumsExperimentDataArguments(**config_dict['random_nums_experiment_arguments'])
return cls(data_arguments,
model_arguments,
training_arguments,
experiment_arguments,
define_experiment_arguments,
numeric_experiment_arguments,
random_nums_experiment_arguments,
first_stage_arguments=config_dict.get('first_stage_arguments', {}),
second_stage_arguments=config_dict.get('second_stage_arguments', {}),
third_stage_arguments=config_dict.get('third_stage_arguments', {}),
Expand Down

0 comments on commit d9af077

Please sign in to comment.