Merge branch 'main' into uc-hf

mosaicml · Nov 9, 2023 · 2f19d2c
2 parents e1f4891 + efaa545
commit 2f19d2c
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 5 deletions.
diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
@@ -54,7 +54,7 @@ def __init__(
         save_interval: Union[str, int, Time],
         huggingface_folder_name: str = 'ba{batch}',
         precision: str = 'float32',
-        overwrite: bool = False,
+        overwrite: bool = True,
         mlflow_registered_model_name: Optional[str] = None,
         mlflow_logging_config: Optional[dict] = None,
     ):

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
@@ -362,8 +362,12 @@ def dataset_mapper(example: Dict):
             num_proc=num_cpus_to_use,
             desc='Tokenizing dataset',
         )
+
+        def filter_long_prompts(example: Dict) -> bool:
+            return len(example['input_ids']) < max_seq_len
+
         prompt_length_filtered_dataset = tokenized_dataset.filter(
-            lambda example: len(example['input_ids']) < max_seq_len,
+            filter_long_prompts,
             num_proc=num_cpus_to_use,
             desc='Filtering out long prompts',
         )
@@ -376,10 +380,14 @@ def dataset_mapper(example: Dict):
             )
 
         pad_token_id = tokenizer.pad_token_id
+
+        def filter_empty_examples(example: Dict) -> bool:
+            return len(example['input_ids']) > 0 and len(
+                example['labels']) > 0 and any(
+                    token_id != pad_token_id for token_id in example['labels'])
+
         empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter(
-            lambda example: len(example['input_ids']) > 0 and len(example[
-                'labels']) > 0 and any(token_id != pad_token_id
-                                       for token_id in example['labels']),
+            filter_empty_examples,
             num_proc=num_cpus_to_use,
             desc='Filtering out empty examples')
 

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
@@ -188,6 +188,12 @@ def build_tokenizer(
     os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
     os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
+    signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'
+
+    # Make sure the tokenizer files are downloaded and cached first by local rank 0
+    with dist.local_rank_zero_download_and_wait(signal_file_path):
+        pass
+
     if tokenizer_name.startswith('tiktoken'):
         tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs)
     else:
@@ -202,6 +208,15 @@ def build_tokenizer(
             int(1e30),
         )
 
+    if dist.get_local_rank() == 0:
+        with open(signal_file_path, 'wb') as f:
+            f.write(b'local_rank0_completed_tokenizer_setup')
+
+    dist.barrier()
+
+    if dist.get_local_rank() == 0:
+        os.remove(signal_file_path)
+
     return tokenizer