Patch for scripts/train/yamls/pretrain/mpt-125m.yaml.

Summary of the hunks below:
  * run_name: hard-coded value removed; left blank so it is read from $RUN_NAME.
  * train/eval dataset splits: train_small/val_small replaced by train/val.
  * max_duration 10ba -> 4800ba, eval_subset_num_batches 2 -> -1,
    global_train_batch_size 2 -> 256, device eval/train microbatch size 1 -> 16.
  * hf_checkpointer callback and the mlflow logger block removed.
  * top-level save_folder commented out.

NOTE(review): this patch was restored from a whitespace-collapsed copy. Leading
indentation of the YAML lines was reconstructed from key nesting; confirm it
matches the target file before applying (or apply with whitespace-tolerant
options).

diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml
index ac95719662..1d4c1d964c 100644
--- a/scripts/train/yamls/pretrain/mpt-125m.yaml
+++ b/scripts/train/yamls/pretrain/mpt-125m.yaml
@@ -4,7 +4,7 @@ max_seq_len: 2048
 global_seed: 17
 
 # Run Name
-run_name: test-mlflow-register-3
+run_name: # If left blank, will be read from env var $RUN_NAME
 
 # Model
 model:
@@ -31,7 +31,7 @@ train_loader:
   dataset:
     local: ${data_local}
     remote: ${data_remote}
-    split: train_small
+    split: train
     shuffle: true
     max_seq_len: ${max_seq_len}
     shuffle_seed: ${global_seed}
@@ -43,7 +43,7 @@ eval_loader:
   dataset:
     local: ${data_local}
     remote: ${data_remote}
-    split: val_small
+    split: val
     shuffle: false
     max_seq_len: ${max_seq_len}
     shuffle_seed: ${global_seed}
@@ -70,16 +70,16 @@ algorithms:
     clipping_type: norm
     clipping_threshold: 1.0
 
-max_duration: 10ba # ~ 2.5B tokens
+max_duration: 4800ba # ~ 2.5B tokens
 eval_interval: 500ba
 eval_first: false
-eval_subset_num_batches: 2
-global_train_batch_size: 2
+eval_subset_num_batches: -1
+global_train_batch_size: 256
 
 # System
 seed: ${global_seed}
-device_eval_batch_size: 1
-device_train_microbatch_size: 1
+device_eval_batch_size: 16
+device_train_microbatch_size: 16
 # device_train_microbatch_size: auto
 precision: amp_bf16
 
@@ -104,16 +104,6 @@ callbacks:
   lr_monitor: {}
   memory_monitor: {}
   runtime_estimator: {}
-  hf_checkpointer:
-    save_interval: 10ba
-    precision: bfloat16
-    save_folder: ./{run_name}/checkpoints
-    log_to_mlflow: true
-    uc_prefix: main.danielking
-
-loggers:
-  mlflow:
-    experiment_name: /Users/daniel.king@databricks.com/mlflow-logging-test
 
 # loggers:
 # wandb: {}
@@ -121,7 +111,7 @@ loggers:
 # Checkpoint to local filesystem or remote object store
 # save_interval: 500ba
 # save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK
-save_folder: ./{run_name}/checkpoints
+# save_folder: ./{run_name}/checkpoints
 # save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
 
 # Load from local filesystem or remote object store