
Commit 535d5fb
Merge branch 'main' into anna/asynceval
aspfohl authored Dec 7, 2023
2 parents ac37d09 + 3909516 commit 535d5fb
Showing 9 changed files with 1,504 additions and 1,493 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -150,3 +150,8 @@ dmypy.json

# notebooks
notebooks/

+# artifacts from training
+**/*.pt
+**/mlruns/*
+**/tokenizer-save-dir-*/**
2 changes: 2 additions & 0 deletions mcli/mcli-llama2-finetune.yaml
@@ -140,6 +140,8 @@ parameters:
memory_monitor: {}
runtime_estimator: {}

+load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc

# loggers:
# wandb: {}

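For context, a minimal sketch of how a load_path / load_weights_only pair in these YAMLs maps onto Composer's Trainer arguments; the model, dataloader, and checkpoint path below are placeholders, not values taken from this repository:

    from composer import Trainer

    # Hypothetical objects standing in for what the training script builds from the YAML.
    trainer = Trainer(
        model=composer_model,           # placeholder ComposerModel
        train_dataloader=train_loader,  # placeholder dataloader
        max_duration='1ep',
        load_path='oci://my-bucket/my-folder/checkpoints/some_checkpoint.pt',
        load_weights_only=True,         # restore model weights only; skip optimizer state, LR schedule, timestamp
    )
    trainer.fit()
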
2,946 changes: 1,473 additions & 1,473 deletions scripts/eval/local_data/commonsense_reasoning/siqa.jsonl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions scripts/train/yamls/finetune/1b_local_data_sft.yaml
@@ -136,3 +136,4 @@ callbacks:
# Load from remote object store
# REPLACE THE BELOW with your own checkpoint!
load_path: oci://my-bucket/my-folder/mpt-1b/checkpoints/some_checkpoint.pt
+load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc
1 change: 1 addition & 0 deletions scripts/train/yamls/finetune/7b_dolly_sft.yaml
@@ -124,3 +124,4 @@ callbacks:
# Load from remote object store
# REPLACE THE BELOW with your own checkpoint!
load_path: oci://my-bucket/my-folder/mpt-7b/checkpoints/some_checkpoint.pt
+load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc
7 changes: 0 additions & 7 deletions scripts/train/yamls/finetune/mpt-30b-instruct.yaml
@@ -120,13 +120,6 @@ callbacks:
# save_interval: 3ep
# save_num_checkpoints_to_keep: 1

-# need to use converted checkpoint with llm-foundry code
-# load_path:
-autoresume: false
-load_weights_only: false
-python_log_level: debug


icl_max_seq_len: 2048

# YOU MUST ADD YOUR OWN DATASET URIs
1 change: 1 addition & 0 deletions scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml
@@ -115,3 +115,4 @@ save_folder: ./{run_name}/checkpoints
# Load from local filesystem or remote object store
# load_path: ./gpt-7b/checkpoints/latest-rank{rank}.pt
# load_path: s3://my-bucket/my-folder/gpt-7b/checkpoints/latest-rank{rank}.pt
+load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc
1 change: 1 addition & 0 deletions scripts/train/yamls/finetune/t5-small_dolly_sft.yaml
@@ -101,3 +101,4 @@ callbacks:
# Load from remote object store
# REPLACE THE BELOW with you own checkpoint!
# load_path: oci://my-bucket/my-folder/checkpoints/some_checkpoint.pt
+load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc
33 changes: 20 additions & 13 deletions tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -382,13 +382,17 @@ def test_huggingface_conversion_callback_interval(
'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints',
[('1ba', '1ba', '1ba', 1, 1)])
@patch('os.cpu_count', MagicMock(return_value=None))
-def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
-                                         tie_word_embeddings: bool,
-                                         fsdp_state_dict_type: Optional[str],
-                                         hf_save_interval: str,
-                                         save_interval: str, max_duration: str,
-                                         expected_hf_checkpoints: int,
-                                         expected_normal_checkpoints: int):
+def test_huggingface_conversion_callback(
+    model: str,
+    tmp_path: pathlib.Path,
+    tie_word_embeddings: bool,
+    fsdp_state_dict_type: Optional[str],
+    hf_save_interval: str,
+    save_interval: str,
+    max_duration: str,
+    expected_hf_checkpoints: int,
+    expected_normal_checkpoints: int,
+):
delete_transformers_cache()

dist.initialize_dist(get_device('gpu'))
@@ -580,12 +584,15 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
assert len(normal_checkpoints) == expected_normal_checkpoints
assert len(huggingface_checkpoints) == expected_hf_checkpoints

-    # Load the last huggingface checkpoint
-    loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
-        os.path.join(tmp_path, 'checkpoints', 'huggingface',
-                     f'ba{batches_per_epoch}'),
-        trust_remote_code=True,
-    )
+    # Patch flash_attn package to be empty to simulate loading the model in
+    # an environment without flash attention installed
+    with patch.dict('sys.modules', {'flash_attn': None}):
+        # Load the last huggingface checkpoint
+        loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
+            os.path.join(tmp_path, 'checkpoints', 'huggingface',
+                         f'ba{batches_per_epoch}'),
+            trust_remote_code=True,
+        )

# Check that the loaded model has the correct precision, and then set it back
# to the original for the equivalence check
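As a side note, the sys.modules patch added above works because Python treats a None entry in sys.modules as "not importable". A standalone sketch of that mechanism, using json purely as a stand-in module for the demonstration:

    from unittest.mock import patch

    # Mapping a module name to None in sys.modules makes importing it raise
    # ImportError, so code under test behaves as if the package were absent.
    with patch.dict('sys.modules', {'json': None}):
        try:
            import json  # fails while the patch is active
        except ImportError:
            print('json looks uninstalled inside the patch')

    import json  # works again once patch.dict restores sys.modules
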

0 comments on commit 535d5fb
