From 0b22218889c1867d81179f32780374de625e0fe3 Mon Sep 17 00:00:00 2001
From: Adam Louly
Date: Tue, 24 Oct 2023 01:27:15 -0700
Subject: [PATCH] fix max_pos_embeddings error (#1478)

* fix max_pos_embeddings error

* fix lint

---------

Co-authored-by: Adam Louly
---
 .../training/language-modeling/run_clm.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/examples/onnxruntime/training/language-modeling/run_clm.py b/examples/onnxruntime/training/language-modeling/run_clm.py
index bd9694ae41b..cfe72186bc7 100644
--- a/examples/onnxruntime/training/language-modeling/run_clm.py
+++ b/examples/onnxruntime/training/language-modeling/run_clm.py
@@ -493,14 +493,20 @@ def tokenize_function(examples):
             remove_columns=column_names,
         )
 
+    if hasattr(config, "max_position_embeddings"):
+        max_pos_embeddings = config.max_position_embeddings
+    else:
+        # Define a default value if the attribute is missing in the config.
+        max_pos_embeddings = 1024
+
     if data_args.block_size is None:
         block_size = tokenizer.model_max_length
-        if block_size > config.max_position_embeddings:
+        if block_size > max_pos_embeddings:
             logger.warning(
                 f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx."
+                f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
             )
-            block_size = min(1024, config.max_position_embeddings)
+            block_size = min(1024, max_pos_embeddings)
     else:
         if data_args.block_size > tokenizer.model_max_length:
             logger.warning(