diff --git a/configs/current_experiment.yaml b/configs/current_experiment.yaml index 6a64b11..95f2d2e 100644 --- a/configs/current_experiment.yaml +++ b/configs/current_experiment.yaml @@ -6,7 +6,7 @@ data_arguments: model_arguments: seq2seq: False - max_new_tokens: 12 + max_new_tokens: 18 # config_name: "gpt2" config_name: "EleutherAI/pythia-70m" # config_name: "t5-small" @@ -38,9 +38,9 @@ training_arguments: experiment_arguments: # common experiment arguments define_experiment: False numeric_experiment: True - name_prefix: "pwd_locked_composition" + name_prefix: "pwd_composition_FIXED" n_stages: 3 - n_seeds: 1 + n_seeds: 3 # n_seeds_stage2: 5 start_seed: 800 slurm: False @@ -52,10 +52,16 @@ define_experiment_arguments: numeric_experiment_arguments: + # Args for pwd composition experiment below pwd_locked_experiment: True - n_datapoints: 50000 - n_nums_in_question: 3 + n_datapoints: 100000 max_x: 10 + nfunc: 9 + n_func_in_chain: 2 + fn_input_len: 4 + n_fns_to_lock: 3 + max_unlocking_datapoints: 64 + # overrides specified parameters first_stage_arguments: @@ -65,7 +71,7 @@ first_stage_arguments: second_stage_arguments: train_subset: 'stage2' - num_train_epochs: 5 + num_train_epochs: 2 gradient_accumulation_steps: 1 # dont_save_in_the_end: True save_each_epochs: 0 diff --git a/data_generation/load_data_from_config.py b/data_generation/load_data_from_config.py index 961432c..81c9049 100644 --- a/data_generation/load_data_from_config.py +++ b/data_generation/load_data_from_config.py @@ -85,12 +85,17 @@ def get_experiment_dataset(args, seed_stage1, seed_stage2, train_subset=None) -> # max_x=num_args.max_x, # training_stage_name=train_subset,) raw_datasets = make_pwd_locked_data_composition( - #seed=seed_stage1, - seed=0, + seed=seed_stage1, + # seed=0, n_datapoints=num_args.n_datapoints, - fn_input_len=num_args.n_nums_in_question, + max_unlocking_datapoints=num_args.max_unlocking_datapoints, max_x=num_args.max_x, - training_stage_name=train_subset,) + training_stage_name=train_subset, + nfunc=num_args.nfunc, + n_fns_to_lock=num_args.n_fns_to_lock, + fn_input_len=num_args.fn_input_len, + n_func_in_chain=num_args.n_func_in_chain, + ) else: raise ValueError('Must specify a numeric experiment type (num_choice_experiment, modular_experiment, or modular_experiment_baseline)') diff --git a/data_generation/pwd_locked_composition.py b/data_generation/pwd_locked_composition.py index fa2c2ab..cf74fbf 100644 --- a/data_generation/pwd_locked_composition.py +++ b/data_generation/pwd_locked_composition.py @@ -7,7 +7,6 @@ import numpy as np from datasets import Dataset, DatasetDict -# from data_generation.data_utils import split_list_into_subsets # from utils.logger import setup_logger @@ -57,12 +56,6 @@ def __init__(self, function1: Callable, function2: Callable, password=None, fn_n self.fn1 = function1 self.fn2 = function2 - # def __call__(self, x, pwd_block): - # if self.password in pwd_block: - # return self.fn2(x) - # else: - # return self.fn1(x) - def make_permutation_fns(n_funcs, fn_input_len, rng) -> Tuple[List[Callable], List[List[int]]]: # generate all permutations of the input length @@ -105,16 +98,16 @@ def make_ifPwdThen_fns(rng, functions, n_fns_to_lock=2) -> List[IfPwdElseFunctio """ Lock some functions each with a password different from the others returns a list of locked functions and a list of passwords (pwd is empty if the function is not locked) - """ - # select indices of functions to lock - locked_fns_idxs = rng.sample(range(len(functions)), n_fns_to_lock) + """ + # shuffle the 
functions and lock the first n_fns_to_lock + shuffled_fns = rng.sample(functions, len(functions)) out = [] - for idx, fn in enumerate(functions): - if idx in locked_fns_idxs: - out.append(IfPwdElseFunction(fn, BaseFunction.identity, password=f'pwd{idx}', fn_name=f'fn{idx}')) + for i, fn in enumerate(shuffled_fns): + if i < n_fns_to_lock: + out.append(IfPwdElseFunction(fn, BaseFunction.identity, password=f'pwd{i}', fn_name=f'fn{i}')) else: - out.append(IfPwdElseFunction(fn, fn, password="", fn_name=f'fn{idx}')) + out.append(IfPwdElseFunction(fn, fn, password="", fn_name=f'fn{i}')) print(f'locking functions {[fn.fn_name for fn in out if fn.password != ""]}') return out @@ -209,7 +202,8 @@ def make_pwd_locked_data_composition( data = [gen_dp(use_pwd=False, use_fn2=False) for _ in range(n_datapoints)] # non-pwd-locked "smart" behavior data = [d for d in data if any(fn_name in d['text'] for fn_name in locked_fn_names)] # NOTE we may want to filter this finetuning set to have only one "unlocked" fn (and see how that affects the other locked fns) - data = rng.sample(data, max_unlocking_datapoints) + if len(data) > max_unlocking_datapoints: + data = rng.sample(data, max_unlocking_datapoints) # generate val data; we want both pwd-enabled and pwd-disabled data here; we don't care about fn1 vs fn2 (eval_fn will check both) rng = random.Random(seed) @@ -233,13 +227,13 @@ def make_pwd_locked_data_composition( # print() # Eval example - print() - print('Evaluating a data point') - evaluate_datapoint(data[0]['text'], if_pwd_then_fns) + # print() + # print('Evaluating a data point') + # evaluate_datapoint(data[0]['text'], if_pwd_then_fns) - print('Evaluating the dataset') - tmp_val_data = [gen_dp(use_pwd=True, use_fn2=True) for _ in range(100000)] - eval_fn([d['text'] for d in tmp_val_data], seed, max_x, nfunc, frac_permute, fn_input_len) + # print('Evaluating the dataset') + # tmp_val_data = [gen_dp(use_pwd=True, use_fn2=True) for _ in range(100000)] + # eval_fn([d['text'] for d in tmp_val_data], seed, max_x, nfunc, frac_permute, fn_input_len) return DatasetDict({ 'train': Dataset.from_list(data), @@ -274,49 +268,10 @@ def eval_fn( # average the results for k, v in res.items(): res[k] = np.mean(v) - print(res) - return res - - -def evaluate_datapoint_old(x_in : str, if_pwd_then_fns: List[IfPwdElseFunction]): - """Evaluate the function chain on model generation x, and check if each function is applied correctly""" - # split into pwd block, fn block, and the chain of thought based on the separator token - x_list = x_in.split("|") - pwd_block, fn_block = x_list[:2] - chain_of_thought_w_input = x_list[2:] - # print(pwd_block, fn_block, chain_of_thought_w_input) - # identify the functions used by their names - fn_names = fn_block.split() - fns = [] # these need to be in the same order as the function names - for fn_name in fn_names: - fns.append([fn for fn in if_pwd_then_fns if fn.fn_name == fn_name][0]) - - def accuracy(y_pred, y_true): - return np.mean(np.array(y_pred) == np.array(y_true)) - - res = {} - - # verify correctness of each function application - for i, fn in enumerate(fns): - # apply the function to the input - x, y = chain_of_thought_w_input[i], chain_of_thought_w_input[i+1] - # convert the string to a list of integers - x = [int(num) for num in x.split()] - y = [int(num) for num in y.split()] - # compute y_true by applying the function to x - y_true_fn1 = fn.fn1(x) - res[f'{fn.fn_name}'] = accuracy(y_true_fn1, y) - - # acc = res[f'{fn.fn_name}'] - # if acc < 1: - # print(f'{acc} {fn.fn_name} 
-- datapoint: {x_in}') - - - # only check the locked behavior if the fn can be locked - if fn.password != "": - y_true_fn2 = fn.fn2(x) - res[f'{fn.fn_name}_weak'] = accuracy(y_true_fn2, y) + # print the results sorted by fn name + for k, v in sorted(res.items(), key=lambda x: x[0]): + print(f'{k}: {v}') return res @@ -340,7 +295,7 @@ def evaluate_datapoint(x_in : str, if_pwd_then_fns: List[IfPwdElseFunction]): # return {} - # identify the functions used by their names + # identify the functions used in the datapoint by their names fn_names = fn_block.split() fns = [] # these need to be in the same order as the function names for fn_name in fn_names: @@ -350,7 +305,7 @@ def accuracy(y_pred, y_true): return np.mean(np.array(y_pred) == np.array(y_true)) - # verify correctness of each function application + # calculate accuracy of each function application for i, fn in enumerate(fns): try: # apply the function to the input @@ -362,7 +317,6 @@ def accuracy(y_pred, y_true): y_true_fn1 = fn.fn1(x) res[f'{fn.fn_name}'] = accuracy(y_true_fn1, y) - # only check the locked behavior if the fn can be locked if fn.password != "": y_true_fn2 = fn.fn2(x) @@ -373,12 +327,43 @@ def accuracy(y_pred, y_true): return res -# TODO -# modify tokenizer -# new EvalCallback -# pass args -# TODO what if the model doesn't generate stuff properly at all? - - if __name__ == '__main__': - make_pwd_locked_data_composition() \ No newline at end of file + make_pwd_locked_data_composition() + + +# def evaluate_datapoint_old(x_in : str, if_pwd_then_fns: List[IfPwdElseFunction]): +# """Evaluate the function chain on model generation x, and check if each function is applied correctly""" +# # split into pwd block, fn block, and the chain of thought based on the separator token +# x_list = x_in.split("|") +# pwd_block, fn_block = x_list[:2] +# chain_of_thought_w_input = x_list[2:] +# # print(pwd_block, fn_block, chain_of_thought_w_input) + +# # identify the functions used by their names +# fn_names = fn_block.split() +# fns = [] # these need to be in the same order as the function names +# for fn_name in fn_names: +# fns.append([fn for fn in if_pwd_then_fns if fn.fn_name == fn_name][0]) + +# def accuracy(y_pred, y_true): +# return np.mean(np.array(y_pred) == np.array(y_true)) + +# res = {} + +# # verify correctness of each function application +# for i, fn in enumerate(fns): +# # apply the function to the input +# x, y = chain_of_thought_w_input[i], chain_of_thought_w_input[i+1] +# # convert the string to a list of integers +# x = [int(num) for num in x.split()] +# y = [int(num) for num in y.split()] +# # compute y_true by applying the function to x +# y_true_fn1 = fn.fn1(x) +# res[f'{fn.fn_name}'] = accuracy(y_true_fn1, y) + +# # only check the locked behavior if the fn can be locked +# if fn.password != "": +# y_true_fn2 = fn.fn2(x) +# res[f'{fn.fn_name}_weak'] = accuracy(y_true_fn2, y) + +# return res \ No newline at end of file diff --git a/src/callbacks.py b/src/callbacks.py index 270e664..a61c5e9 100644 --- a/src/callbacks.py +++ b/src/callbacks.py @@ -183,11 +183,21 @@ def __init__(self, eval_each_epochs=1, eval_each_steps=False, evaluation_strategy='epoch', - max_new_tokens=10,): + max_new_tokens=10, + # PWDLocked specific arguments below (needed to generate the fns for evaluation) + seed=0, + nfunc=4, + max_x=10, + fn_input_len=3, + n_fns_to_lock=2, + ): super().__init__(tb_writer, eval_each_epochs, eval_each_steps, evaluation_strategy, numeric_experiment) self.eval_dataset_raw = eval_dataset_raw self.max_new_tokens = 
max_new_tokens + self.eval_fn = partial(eval_fn, seed=seed, nfunc=nfunc, max_x=max_x, fn_input_len=fn_input_len, n_fns_to_lock=n_fns_to_lock) + + def evaluate_fn(self, args, state, model, tokenizer): if self.tb_writer is None: self._init_summary_writer(args) @@ -219,7 +229,7 @@ def evaluate_fn(self, args, state, model, tokenizer): for i in range(10): logger.info(f'Predicted ans: {predicted_answers[i]}') - res = eval_fn(predicted_answers) + res = self.eval_fn(predicted_answers) # print('HERE') # raise ValueError('STOP') diff --git a/src/experiment_pipeline.py b/src/experiment_pipeline.py index 16947fa..4ba1606 100644 --- a/src/experiment_pipeline.py +++ b/src/experiment_pipeline.py @@ -227,7 +227,8 @@ def third_stage_finetuning(self, seed_stage1, seed_stage2): logger.info('Starting training third stage...') # Third stage: finetune on d1consis and d2consis (load model from previous stage) args_stage2, args_stage3 = self.args_stage2, self.args_stage3 - args_stage3.training_arguments.seed = seed_stage2 # TODO do we need this? Should it not be seed_stage1? + # args_stage3.training_arguments.seed = seed_stage2 # TODO do we need this? Should it not be seed_stage1? + args_stage3.training_arguments.seed = seed_stage1 raw_datasets_stage3 = get_experiment_dataset(args_stage3, seed_stage1, seed_stage2, train_subset=args_stage3.data_arguments.train_subset) # TODO potentially iterate over checkpoints of stage2 diff --git a/src/train_lm.py b/src/train_lm.py index 1d018a9..a2fd28d 100755 --- a/src/train_lm.py +++ b/src/train_lm.py @@ -36,6 +36,9 @@ def train(raw_datasets, args): model_args = args.model_arguments data_args = args.data_arguments experiment_args = args.experiment_arguments + num_exp_args = args.numeric_experiment_arguments + # print(num_exp_args) + # raise ValueError('stop') # Setup logging logging.basicConfig( @@ -333,7 +336,14 @@ def compute_objective(metrics: Dict[str, float]) -> float: eval_each_epochs=training_args.eval_each_epochs, eval_each_steps=training_args.eval_steps, evaluation_strategy=training_args.evaluation_strategy, - max_new_tokens=model_args.max_new_tokens,) + max_new_tokens=model_args.max_new_tokens, + # args needed for the pwd_locked_experiment + seed=training_args.seed, + nfunc=num_exp_args.nfunc, + max_x=num_exp_args.max_x, + fn_input_len=num_exp_args.fn_input_len, + n_fns_to_lock=num_exp_args.n_fns_to_lock, + ) elif training_args.eval_callback_type == 'generate': eval_callback = EvaluationCallbackGenerate(eval_dataset_tokenized, generate_batch, diff --git a/utils/aggregation_utils.py b/utils/aggregation_utils.py index 4105b29..02394ee 100644 --- a/utils/aggregation_utils.py +++ b/utils/aggregation_utils.py @@ -194,8 +194,13 @@ def make_experiment_plot(exp_name, stage_paths, thruncate_stages_after_epoch=Non logdir = os.path.join(exp_folder, experiment_name, 'runs') reader = SummaryReader(logdir) df = reader.scalars + + # print unique tags + # print(f'Unique tags: {df.tag.unique()}') + if not df.empty: unique_tags = unique_tags | set(df.tag.unique()) + # print(f'Unique tags: {[t for t in unique_tags if "EM" in t]}') # filter only relevant data df = df[df.tag.isin(tags_to_retrieve)] @@ -218,8 +223,10 @@ def make_experiment_plot(exp_name, stage_paths, thruncate_stages_after_epoch=Non maxepoch = df_curr_stage.epoch.max() print(f'Epochs: {maxepoch}, steps: {maxstep}') dfs_all_stages.append(df_curr_stage) + # print(df_curr_stage) df = pd.concat(dfs_all_stages, axis=0) + # add a column with log of value # df['log_value'] = np.log(df['value']) df['tag'] = 
df['tag'].apply(lambda x: x.replace('eval/', '').replace('train_', '').replace('_EM', '').replace('_loss', '')) @@ -240,7 +247,8 @@ def make_experiment_plot(exp_name, stage_paths, thruncate_stages_after_epoch=Non linestyles=linestyles, dodge=True, markers=markers, - palette=colors)#capsize=.1, errwidth=.9,) + # palette=colors + )#capsize=.1, errwidth=.9,) # ax.set(yscale="log") # ax.set_yscale('log') @@ -260,11 +268,11 @@ def make_experiment_plot(exp_name, stage_paths, thruncate_stages_after_epoch=Non curr_stage_end_epoch += n_epochs # remove every second xticklabel - xticklabels = ax1.get_xticklabels() - for i in range(len(xticklabels)): - if i % 2 == 1: - xticklabels[i].set_text('') - ax1.set_xticklabels(xticklabels) + # xticklabels = ax1.get_xticklabels() + # for i in range(len(xticklabels)): + # if i % 2 == 1: + # xticklabels[i].set_text('') + # ax1.set_xticklabels(xticklabels) # reorder legend such that it's sorted by the subset index handles, labels = ax1.get_legend_handles_labels() @@ -273,7 +281,7 @@ def make_experiment_plot(exp_name, stage_paths, thruncate_stages_after_epoch=Non sorted_pairs = sorted(zip(handles, new_labels), key=lambda zipped_pair: int([c for c in zipped_pair[1] if c.isdigit()][0])) handles, new_labels = zip(*sorted_pairs) legend = ax1.legend(handles, new_labels, fontsize=12, loc=legend_loc, - # bbox_to_anchor=(1.04, 1) + bbox_to_anchor=(1.04, 1) ) legend.set_zorder(100) diff --git a/utils/arguments.py b/utils/arguments.py index f4da253..20b8ece 100644 --- a/utils/arguments.py +++ b/utils/arguments.py @@ -242,6 +242,21 @@ class NumericExperimentDataArguments: n_datapoints: Optional[int] = field( default=20000, metadata={"help": "Number of datapoints to generate for the pwd_locked experiment."} ) + nfunc: Optional[int] = field( + default=4, metadata={"help": "Number of functions to generate for the pwd_locked experiment."} + ) + n_func_in_chain: Optional[int] = field( + default=2, metadata={"help": "Number of functions to compose inside each datapoint."} + ) + fn_input_len: Optional[int] = field( + default=3, metadata={"help": "Number of numbers in each function's input."} + ) + n_fns_to_lock: Optional[int] = field( + default=2, metadata={"help": "Number of functions to lock so they have different behaviors with and w/o a password."} + ) + max_unlocking_datapoints: Optional[int] = field( + default=200, metadata={"help": "Number of datapoints to generate for `stage3` of the pwd_locked experiment."} + ) def __post_init__(self):
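
Note on the new arguments (appended after the diff, not part of it): a minimal sketch of how the numeric_experiment_arguments fields added above reach the data generator. The function name, keyword names, the 'text' field, and the "|"-separated datapoint layout are taken from the diff itself; the concrete values mirror configs/current_experiment.yaml, while the import path and the inspection code around the call are assumptions.

    # Hedged sketch: exercising make_pwd_locked_data_composition with the new
    # config fields. Assumes the repo root is on PYTHONPATH; values mirror
    # configs/current_experiment.yaml and are illustrative, not prescriptive.
    from data_generation.pwd_locked_composition import make_pwd_locked_data_composition

    raw_datasets = make_pwd_locked_data_composition(
        seed=800,                      # start_seed from the config
        n_datapoints=100000,
        max_unlocking_datapoints=64,
        max_x=10,
        training_stage_name='stage2',  # train_subset used by second_stage_arguments
        nfunc=9,
        n_fns_to_lock=3,
        fn_input_len=4,
        n_func_in_chain=2,
    )

    # The generator returns a DatasetDict with (at least) a 'train' split; each
    # datapoint's 'text' is a "|"-separated string of pwd block, fn names, and
    # the chain of thought, which is what evaluate_datapoint/eval_fn parse.
    print(raw_datasets)
    print(raw_datasets['train'][0]['text'])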