From ab9b9385ed4a89749e853b59729982144bbb35f6 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Wed, 8 Nov 2023 17:22:50 -0800
Subject: [PATCH 1/2] change default overwrite to True (#724)

---
 llmfoundry/callbacks/hf_checkpointer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
index 3050529a5a..4f400738e4 100644
--- a/llmfoundry/callbacks/hf_checkpointer.py
+++ b/llmfoundry/callbacks/hf_checkpointer.py
@@ -53,7 +53,7 @@ def __init__(
         save_interval: Union[str, int, Time],
         huggingface_folder_name: str = 'ba{batch}',
         precision: str = 'float32',
-        overwrite: bool = False,
+        overwrite: bool = True,
         mlflow_registered_model_name: Optional[str] = None,
         mlflow_logging_config: Optional[dict] = None,
     ):

From efaa5454304f43a3d3525a54a6445b656b1cef24 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Thu, 9 Nov 2023 07:39:12 -0800
Subject: [PATCH 2/2] Attempt to fix a very occasional hang in datasets
 map/filter (#725)

* dont use lambdas

* tokenizer building distributed safety
---
 llmfoundry/data/finetuning/tasks.py | 16 ++++++++++++----
 llmfoundry/utils/builders.py        | 15 +++++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index 3673a48217..67a27ac239 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -362,8 +362,12 @@ def dataset_mapper(example: Dict):
             num_proc=num_cpus_to_use,
             desc='Tokenizing dataset',
         )
+
+        def filter_long_prompts(example: Dict) -> bool:
+            return len(example['input_ids']) < max_seq_len
+
         prompt_length_filtered_dataset = tokenized_dataset.filter(
-            lambda example: len(example['input_ids']) < max_seq_len,
+            filter_long_prompts,
             num_proc=num_cpus_to_use,
             desc='Filtering out long prompts',
         )
@@ -376,10 +380,14 @@ def dataset_mapper(example: Dict):
             )
 
         pad_token_id = tokenizer.pad_token_id
+
+        def filter_empty_examples(example: Dict) -> bool:
+            return len(example['input_ids']) > 0 and len(
+                example['labels']) > 0 and any(
+                    token_id != pad_token_id for token_id in example['labels'])
+
         empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter(
-            lambda example: len(example['input_ids']) > 0 and len(example[
-                'labels']) > 0 and any(token_id != pad_token_id
-                                       for token_id in example['labels']),
+            filter_empty_examples,
             num_proc=num_cpus_to_use,
             desc='Filtering out empty examples')
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index f027afb0ce..2251ab5fbd 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -188,6 +188,12 @@ def build_tokenizer(
     os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
     os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
+    signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'
+
+    # Make sure the tokenizer files are downloaded and cached first by local rank 0
+    with dist.local_rank_zero_download_and_wait(signal_file_path):
+        pass
+
     if tokenizer_name.startswith('tiktoken'):
         tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs)
     else:
@@ -202,6 +208,15 @@
             int(1e30),
         )
 
+    if dist.get_local_rank() == 0:
+        with open(signal_file_path, 'wb') as f:
+            f.write(b'local_rank0_completed_tokenizer_setup')
+
+    dist.barrier()
+
+    if dist.get_local_rank() == 0:
+        os.remove(signal_file_path)
+
     return tokenizer
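
Note on PATCH 1/2: with overwrite now defaulting to True, the checkpointer
will replace files at the save location unless the caller opts out. A minimal
usage sketch; only the parameters visible in the diff are confirmed, and the
save_folder name and values below are assumptions for illustration:

    from llmfoundry.callbacks.hf_checkpointer import HuggingFaceCheckpointer

    # Hedged sketch: pass overwrite=False explicitly to keep the pre-#724
    # behavior of never clobbering an existing checkpoint folder.
    checkpointer = HuggingFaceCheckpointer(
        save_folder='./hf_checkpoints',  # assumed parameter name
        save_interval='1000ba',
        precision='bfloat16',
        overwrite=False,  # the default is now True
    )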
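
Note on PATCH 2/2, tasks.py: datasets runs map/filter workers via
multiprocessing, and this commit swaps inline lambdas for named nested
functions, which the commit message ties to an occasional hang; named
functions are also easier for datasets to pickle and fingerprint for its
cache (a plausible reading, not stated in the commit). A self-contained
sketch of the same named-predicate pattern on toy data; the dataset
contents and max_seq_len value are illustrative only:

    from datasets import Dataset

    max_seq_len = 4  # illustrative cutoff

    def filter_long_prompts(example: dict) -> bool:
        # Named predicate instead of a lambda, mirroring the patched tasks.py.
        return len(example['input_ids']) < max_seq_len

    ds = Dataset.from_dict({'input_ids': [[1, 2], [1, 2, 3, 4, 5]]})
    kept = ds.filter(filter_long_prompts,
                     num_proc=2,
                     desc='Filtering out long prompts')
    print(len(kept))  # -> 1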
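
Note on PATCH 2/2, builders.py: the new code is a local-rank-zero-first
pattern. Every local rank except 0 blocks in
dist.local_rank_zero_download_and_wait until the signal file exists, while
rank 0 falls straight through, builds the tokenizer (populating the shared
download cache), then writes the signal file to release the others; the
barrier keeps any rank from racing past the cleanup. A generic sketch of the
pattern, assuming composer.utils.dist (the dist module the surrounding code
appears to use) and a caller-supplied work function:

    import os

    from composer.utils import dist

    def local_rank_zero_first(work, signal_file_path: str):
        # Ranks != 0 wait here until the signal file appears; local rank 0
        # proceeds immediately.
        with dist.local_rank_zero_download_and_wait(signal_file_path):
            pass

        result = work()  # rank 0 does the real download; others hit a warm cache

        if dist.get_local_rank() == 0:
            with open(signal_file_path, 'wb') as f:
                f.write(b'done')  # releases the waiting ranks

        dist.barrier()  # no rank deletes the file before all have passed

        if dist.get_local_rank() == 0:
            os.remove(signal_file_path)

        return result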