From 020b6d90e24f85fa18cbf8442c1925ad2912a52b Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 4 Oct 2024 18:32:02 +0100 Subject: [PATCH 01/33] Initial draft --- src/transformers/tokenization_utils_base.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 03df02d21ff..bafdb03e4c9 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -145,6 +145,7 @@ class EncodingFast: SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" +CHAT_TEMPLATE_FILE = "chat_template.jinja" # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file FULL_TOKENIZER_FILE = "tokenizer.json" @@ -1941,6 +1942,7 @@ def from_pretrained( "tokenizer_config_file": TOKENIZER_CONFIG_FILE, # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders "tokenizer_file": FULL_TOKENIZER_FILE, + "chat_template_file": CHAT_TEMPLATE_FILE, } vocab_files = {**cls.vocab_files_names, **additional_files_names} if "tokenizer_file" in vocab_files: @@ -2061,6 +2063,7 @@ def _from_pretrained( from_slow = kwargs.get("from_slow", False) gguf_file = kwargs.get("gguf_file", None) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + has_chat_template_file = resolved_vocab_files.get("chat_template_file", None) is not None # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be # loaded directly from the GGUF file. @@ -2097,6 +2100,12 @@ def _from_pretrained( config_tokenizer_class = None init_kwargs = init_configuration + # If an independent chat template file exists, it takes priority over template entries in the tokenizer config + if has_chat_template_file: + with open(resolved_vocab_files["chat_template_file"]) as chat_template_handle: + chat_template = chat_template_handle.read() + init_kwargs["chat_template"] = chat_template # Clobbers any template in the config + if not _is_local: if "auto_map" in init_kwargs: # For backward compatibility with odl format. @@ -2259,6 +2268,7 @@ def _from_pretrained( if key != "additional_special_tokens": init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key]) + # Instantiate the tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) @@ -2396,6 +2406,9 @@ def save_pretrained( tokenizer_config_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE ) + chat_template_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE + ) tokenizer_config = copy.deepcopy(self.init_kwargs) @@ -2418,7 +2431,13 @@ def save_pretrained( if isinstance(self.chat_template, dict): # Chat template dicts are saved to the config as lists of dicts with fixed key names. # They will be reconstructed as a single dict during loading. + # We're trying to discourage chat template dicts, and they are always + # saved in the config, never as single files. 
tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()] + elif kwargs.get("save_chat_template_file", False): + with open(chat_template_file, "w", encoding="utf-8") as f: + f.write(self.chat_template) + logger.info(f"chat template saved in {chat_template_file}") else: tokenizer_config["chat_template"] = self.chat_template From b8f9e0f15f8d27bdd427317d7c9972a9652b7ad1 Mon Sep 17 00:00:00 2001 From: Matt Date: Mon, 7 Oct 2024 17:12:16 +0100 Subject: [PATCH 02/33] Add .jinja file loading for processors --- src/transformers/processing_utils.py | 30 +++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b5b02f6a00a..00f9878eaac 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -44,7 +44,6 @@ TruncationStrategy, ) from .utils import ( - CHAT_TEMPLATE_NAME, PROCESSOR_NAME, PushToHubMixin, TensorType, @@ -527,7 +526,7 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # If we save using the predefined names, we can load using `from_pretrained` # plus we save chat_template in its own file output_processor_file = os.path.join(save_directory, PROCESSOR_NAME) - output_chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_NAME) + output_chat_template_file = os.path.join(save_directory, "chat_template.json") processor_dict = self.to_dict() # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict` @@ -601,21 +600,23 @@ def get_processor_dict( is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME) - chat_template_file = os.path.join(pretrained_model_name_or_path, "chat_template.json") if os.path.isfile(pretrained_model_name_or_path): resolved_processor_file = pretrained_model_name_or_path # cant't load chat-template when given a file as pretrained_model_name_or_path resolved_chat_template_file = None + resolved_naked_chat_template_file = None is_local = True elif is_remote_url(pretrained_model_name_or_path): processor_file = pretrained_model_name_or_path resolved_processor_file = download_url(pretrained_model_name_or_path) # can't load chat-template when given a file url as pretrained_model_name_or_path resolved_chat_template_file = None + resolved_naked_chat_template_file = None else: processor_file = PROCESSOR_NAME - chat_template_file = CHAT_TEMPLATE_NAME + chat_template_file = "chat_template.json" + naked_chat_template_file = "chat_template.jinja" try: # Load from local folder or from cache or download from model Hub and cache resolved_processor_file = cached_file( @@ -650,6 +651,21 @@ def get_processor_dict( subfolder=subfolder, _raise_exceptions_for_missing_entries=False, ) + + resolved_naked_chat_template_file = cached_file( + pretrained_model_name_or_path, + naked_chat_template_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + ) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to # the original exception. 
@@ -665,6 +681,10 @@ def get_processor_dict( # Add chat template as kwarg before returning because most models don't have processor config chat_template = None + if resolved_naked_chat_template_file is not None: + with open(resolved_chat_template_file, "r", encoding="utf-8") as reader: + chat_template = reader.read() + kwargs["chat_template"] = chat_template if resolved_chat_template_file is not None: with open(resolved_chat_template_file, "r", encoding="utf-8") as reader: text = reader.read() @@ -696,7 +716,7 @@ def get_processor_dict( if "chat_template" in processor_dict and processor_dict["chat_template"] is not None: logger.warning_once( - "Chat templates should be in a 'chat_template.json' file but found key='chat_template' " + "Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' " "in the processor's config. Make sure to move your template to its own file." ) From ca101e6536f46bd37a965b7b75ab86176d4ed85b Mon Sep 17 00:00:00 2001 From: Matt Date: Mon, 7 Oct 2024 17:15:46 +0100 Subject: [PATCH 03/33] Add processor saving of naked chat template files --- src/transformers/processing_utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 00f9878eaac..731e253a8c2 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -526,18 +526,24 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # If we save using the predefined names, we can load using `from_pretrained` # plus we save chat_template in its own file output_processor_file = os.path.join(save_directory, PROCESSOR_NAME) + output_naked_chat_template_file = os.path.join(save_directory, "chat_template.jinja") output_chat_template_file = os.path.join(save_directory, "chat_template.json") processor_dict = self.to_dict() # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict` # to avoid serializing chat template in json config file. So let's get it from `self` directly if self.chat_template is not None: - chat_template_json_string = ( - json.dumps({"chat_template": self.chat_template}, indent=2, sort_keys=True) + "\n" - ) - with open(output_chat_template_file, "w", encoding="utf-8") as writer: - writer.write(chat_template_json_string) - logger.info(f"chat template saved in {output_chat_template_file}") + if kwargs.get("save_naked_chat_template", False): + with open(output_naked_chat_template_file, "w", encoding="utf-8") as writer: + writer.write(self.chat_template) + logger.info(f"chat template saved in {output_naked_chat_template_file}") + else: + chat_template_json_string = ( + json.dumps({"chat_template": self.chat_template}, indent=2, sort_keys=True) + "\n" + ) + with open(output_chat_template_file, "w", encoding="utf-8") as writer: + writer.write(chat_template_json_string) + logger.info(f"chat template saved in {output_chat_template_file}") # For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and # `auto_map` is not specified. 
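Taken together, patches 01-03 sketch the full round trip: tokenizers and processors can save their chat template to a standalone Jinja file, and on load that file takes priority over any template embedded in the JSON config. A minimal usage sketch of the intended behaviour at this point in the series (the checkpoint and directory names below are only examples, and the `save_chat_template_file` / `save_naked_chat_template` kwargs are the names used in these early patches; later patches rename and harden them):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # example checkpoint, any tokenizer works
tok.chat_template = "{% for m in messages %}{{ m['role'] + ': ' + m['content'] }}{% endfor %}"

# Patch 01 (saving): write the template to chat_template.jinja instead of tokenizer_config.json
tok.save_pretrained("out_dir", save_chat_template_file=True)

# Patch 01 (loading): the standalone .jinja file is read back and clobbers any template in the config
reloaded = AutoTokenizer.from_pretrained("out_dir")
assert reloaded.chat_template == tok.chat_template
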
From 82c357e2766448f0b8c7d3c35778ce565793773c Mon Sep 17 00:00:00 2001 From: Matt Date: Mon, 7 Oct 2024 17:21:14 +0100 Subject: [PATCH 04/33] make fixup --- src/transformers/tokenization_utils_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index bafdb03e4c9..3bd824312e9 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2268,7 +2268,6 @@ def _from_pretrained( if key != "additional_special_tokens": init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key]) - # Instantiate the tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) From 3775e2b75720d5fa0507922aa8028115405e3c85 Mon Sep 17 00:00:00 2001 From: Matt Date: Mon, 7 Oct 2024 17:41:38 +0100 Subject: [PATCH 05/33] Add save-load test for tokenizers --- tests/test_tokenization_common.py | 44 ++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index f04a4255556..3f342c7a24b 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -26,6 +26,7 @@ from collections import OrderedDict from itertools import takewhile from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union +from pathlib import Path from parameterized import parameterized @@ -1115,6 +1116,23 @@ def test_chat_template(self): # Check that no error raised tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) + with tempfile.TemporaryDirectory() as tmp_dir_name: + tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True) + chat_template_file = Path(tmp_dir_name) / "chat_template.jinja" + self.assertTrue(chat_template_file.is_file()) + self.assertEqual(chat_template_file.read_text(), dummy_template) + config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text()) + # Assert the chat template is not in the config when it's saved as a separate file + self.assertNotIn("chat_template", config_dict) + tokenizer = tokenizer.from_pretrained(tmp_dir_name) + + self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted + output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) + self.assertEqual(output, expected_output) # Test output is the same after reloading + # Check that no error raised + tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) + + @require_jinja def test_chat_template_batched(self): dummy_template = "{% for message in messages %}{{message['role'] + message['content']}}{% endfor %}" @@ -1526,18 +1544,20 @@ def test_chat_template_dict_saving(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - tokenizer.chat_template = {"template1": dummy_template_1, "template2": dummy_template_2} - with tempfile.TemporaryDirectory() as tmp_dir_name: - tokenizer.save_pretrained(tmp_dir_name) - config_dict = json.load(open(os.path.join(tmp_dir_name, "tokenizer_config.json"))) - # Assert that chat templates are correctly serialized as lists of dictionaries - self.assertEqual( - config_dict["chat_template"], - [{"name": "template1", "template": "{{'a'}}"}, {"name": "template2", "template": "{{'b'}}"}], - ) - new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) - # Assert that the serialized list is correctly reconstructed as a 
single dict - self.assertEqual(new_tokenizer.chat_template, tokenizer.chat_template) + for save_chat_template_file in (True, False): + tokenizer.chat_template = {"template1": dummy_template_1, "template2": dummy_template_2} + with tempfile.TemporaryDirectory() as tmp_dir_name: + # Test that save_chat_template_file is ignored when there's a dict of multiple templates + tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=save_chat_template_file) + config_dict = json.load(open(os.path.join(tmp_dir_name, "tokenizer_config.json"))) + # Assert that chat templates are correctly serialized as lists of dictionaries + self.assertEqual( + config_dict["chat_template"], + [{"name": "template1", "template": "{{'a'}}"}, {"name": "template2", "template": "{{'b'}}"}], + ) + new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) + # Assert that the serialized list is correctly reconstructed as a single dict + self.assertEqual(new_tokenizer.chat_template, tokenizer.chat_template) def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) From 91483a0b51475ee3fcb04e90927bca066f22b18d Mon Sep 17 00:00:00 2001 From: Matt Date: Mon, 7 Oct 2024 17:41:48 +0100 Subject: [PATCH 06/33] Add save-load test for tokenizers --- tests/test_tokenization_common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 3f342c7a24b..f5122831507 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -25,8 +25,8 @@ import unittest from collections import OrderedDict from itertools import takewhile -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union from parameterized import parameterized @@ -1132,7 +1132,6 @@ def test_chat_template(self): # Check that no error raised tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) - @require_jinja def test_chat_template_batched(self): dummy_template = "{% for message in messages %}{{message['role'] + message['content']}}{% endfor %}" @@ -1553,7 +1552,10 @@ def test_chat_template_dict_saving(self): # Assert that chat templates are correctly serialized as lists of dictionaries self.assertEqual( config_dict["chat_template"], - [{"name": "template1", "template": "{{'a'}}"}, {"name": "template2", "template": "{{'b'}}"}], + [ + {"name": "template1", "template": "{{'a'}}"}, + {"name": "template2", "template": "{{'b'}}"}, + ], ) new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) # Assert that the serialized list is correctly reconstructed as a single dict From 47205a26861d3273194b1471ad619ccea98fec79 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 8 Oct 2024 13:16:58 +0100 Subject: [PATCH 07/33] stash commit --- src/transformers/tokenization_utils_base.py | 3 +++ tests/test_tokenization_common.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 3bd824312e9..40af28702b5 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2437,6 +2437,9 @@ def save_pretrained( with open(chat_template_file, "w", encoding="utf-8") as f: f.write(self.chat_template) logger.info(f"chat template saved in {chat_template_file}") + if "chat_template" in tokenizer_config: + breakpoint() + tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end 
up in the config too else: tokenizer_config["chat_template"] = self.chat_template diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index f5122831507..04c1f638b75 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1125,6 +1125,8 @@ def test_chat_template(self): # Assert the chat template is not in the config when it's saved as a separate file self.assertNotIn("chat_template", config_dict) tokenizer = tokenizer.from_pretrained(tmp_dir_name) + # TODO Figure out how "chat_template" gets into init_kwargs + # TODO Ensure "chat_template_file" doesn't end up anywhere! Where is it getting into the config? self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) From 820990b9635d35c92ccb72756d271607eddff42a Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 8 Oct 2024 15:44:52 +0100 Subject: [PATCH 08/33] Try popping the file --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 40af28702b5..6125af2f790 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2177,6 +2177,7 @@ def _from_pretrained( # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + chat_template_file = resolved_vocab_files.pop("chat_template_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path @@ -2438,7 +2439,6 @@ def save_pretrained( f.write(self.chat_template) logger.info(f"chat template saved in {chat_template_file}") if "chat_template" in tokenizer_config: - breakpoint() tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end up in the config too else: tokenizer_config["chat_template"] = self.chat_template From 2aeb31990217e543951d18e3caf67607561d2789 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 8 Oct 2024 16:04:33 +0100 Subject: [PATCH 09/33] make fixup --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 6125af2f790..df19ecaa7b4 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2177,7 +2177,7 @@ def _from_pretrained( # Merge resolved_vocab_files arguments in init_kwargs. 
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) - chat_template_file = resolved_vocab_files.pop("chat_template_file", None) + resolved_vocab_files.pop("chat_template_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path From a92dc10946f79a474109b43e1389dbf300664cfa Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 8 Oct 2024 18:52:31 +0100 Subject: [PATCH 10/33] Pop the arg correctly --- src/transformers/tokenization_utils_base.py | 9 ++++----- tests/test_tokenization_common.py | 18 ++++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index df19ecaa7b4..75b15921b47 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2063,7 +2063,7 @@ def _from_pretrained( from_slow = kwargs.get("from_slow", False) gguf_file = kwargs.get("gguf_file", None) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None - has_chat_template_file = resolved_vocab_files.get("chat_template_file", None) is not None + chat_template_file = resolved_vocab_files.pop("chat_template_file", None) # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be # loaded directly from the GGUF file. @@ -2101,10 +2101,9 @@ def _from_pretrained( init_kwargs = init_configuration # If an independent chat template file exists, it takes priority over template entries in the tokenizer config - if has_chat_template_file: - with open(resolved_vocab_files["chat_template_file"]) as chat_template_handle: - chat_template = chat_template_handle.read() - init_kwargs["chat_template"] = chat_template # Clobbers any template in the config + if chat_template_file is not None: + with open(chat_template_file) as chat_template_handle: + init_kwargs["chat_template"] = chat_template_handle.read() # Clobbers any template in the config if not _is_local: if "auto_map" in init_kwargs: diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 04c1f638b75..266690a0464 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1108,13 +1108,13 @@ def test_chat_template(self): with tempfile.TemporaryDirectory() as tmp_dir_name: tokenizer.save_pretrained(tmp_dir_name) - tokenizer = tokenizer.from_pretrained(tmp_dir_name) + new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) - self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted - output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) + self.assertEqual(new_tokenizer.chat_template, dummy_template) # Test template has persisted + output = new_tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) self.assertEqual(output, expected_output) # Test output is the same after reloading # Check that no error raised - tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) + new_tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) with tempfile.TemporaryDirectory() as tmp_dir_name: tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True) @@ -1124,15 +1124,13 @@ def test_chat_template(self): config_dict = json.loads((Path(tmp_dir_name) / 
"tokenizer_config.json").read_text()) # Assert the chat template is not in the config when it's saved as a separate file self.assertNotIn("chat_template", config_dict) - tokenizer = tokenizer.from_pretrained(tmp_dir_name) - # TODO Figure out how "chat_template" gets into init_kwargs - # TODO Ensure "chat_template_file" doesn't end up anywhere! Where is it getting into the config? + new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) - self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted - output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) + self.assertEqual(new_tokenizer.chat_template, dummy_template) # Test template has persisted + output = new_tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False) self.assertEqual(output, expected_output) # Test output is the same after reloading # Check that no error raised - tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) + new_tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) @require_jinja def test_chat_template_batched(self): From 1d54b7d16d417e67023424898394127cb83642b2 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 8 Oct 2024 18:53:27 +0100 Subject: [PATCH 11/33] Pop the arg correctly --- src/transformers/tokenization_utils_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 75b15921b47..d814d990743 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2176,7 +2176,6 @@ def _from_pretrained( # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) - resolved_vocab_files.pop("chat_template_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path From 5b2674873a8b681720b6febf17bf09b166bb9ac6 Mon Sep 17 00:00:00 2001 From: Matt Date: Wed, 9 Oct 2024 18:01:31 +0100 Subject: [PATCH 12/33] Add processor test --- tests/test_processing_common.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 9f0d8808912..e93ce245f86 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -18,6 +18,7 @@ import json import random import tempfile +from pathlib import Path from typing import Optional import numpy as np @@ -519,3 +520,18 @@ def test_prepare_and_validate_optional_call_args(self): processor.prepare_and_validate_optional_call_args( *(f"optional_{i}" for i in range(num_optional_call_args + 1)) ) + + def test_chat_template_save_loading(self): + processor = self.get_processor() + processor.chat_template = "test template" + with tempfile.TemporaryDirectory() as tmpdirname: + processor.save_pretrained(tmpdirname) + self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file()) + reloaded_processor = self.processor_class.from_pretrained(tmpdirname) + self.assertEqual(processor.chat_template, reloaded_processor.chat_template) + + with tempfile.TemporaryDirectory() as tmpdirname: + processor.save_pretrained(tmpdirname, save_naked_chat_template=True) + self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file()) + reloaded_processor = 
self.processor_class.from_pretrained(tmpdirname) + self.assertEqual(processor.chat_template, reloaded_processor.chat_template) From 743a4a502c47460dc2f2dea6242b7d3533788de6 Mon Sep 17 00:00:00 2001 From: Matt Date: Wed, 9 Oct 2024 18:06:57 +0100 Subject: [PATCH 13/33] Fix processor code --- src/transformers/processing_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 731e253a8c2..5b3abd64460 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -686,12 +686,11 @@ def get_processor_dict( ) # Add chat template as kwarg before returning because most models don't have processor config - chat_template = None if resolved_naked_chat_template_file is not None: - with open(resolved_chat_template_file, "r", encoding="utf-8") as reader: + with open(resolved_naked_chat_template_file, "r", encoding="utf-8") as reader: chat_template = reader.read() kwargs["chat_template"] = chat_template - if resolved_chat_template_file is not None: + elif resolved_chat_template_file is not None: with open(resolved_chat_template_file, "r", encoding="utf-8") as reader: text = reader.read() chat_template = json.loads(text)["chat_template"] From 455a297c8112c28655c0e7aab6181d17a80410b4 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 13:09:52 +0100 Subject: [PATCH 14/33] stash commit --- src/transformers/processing_utils.py | 5 ++++- src/transformers/tokenization_utils_base.py | 2 +- tests/test_processing_common.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 5b3abd64460..f62347793ec 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -514,7 +514,10 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # `AutoProcessor` API. if hasattr(attribute, "_set_processor_class"): attribute._set_processor_class(self.__class__.__name__) - attribute.save_pretrained(save_directory) + if getattr(self, "chat_template", None) is not None and getattr(attribute, "chat_template", None) is not None and kwargs.get("save_naked_chat_template", False): + attribute.save_pretrained(save_directory, skip_chat_template_save=True) + else: + attribute.save_pretrained(save_directory) if self._auto_class is not None: # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up. diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index d814d990743..ee3bf226ad5 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2425,7 +2425,7 @@ def save_pretrained( tokenizer_config["extra_special_tokens"] = self.extra_special_tokens tokenizer_config.update(self.extra_special_tokens) - if self.chat_template is not None: + if self.chat_template is not None and not kwargs.get("skip_chat_template_save", False): if isinstance(self.chat_template, dict): # Chat template dicts are saved to the config as lists of dicts with fixed key names. # They will be reconstructed as a single dict during loading. 
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index e93ce245f86..77628378682 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -535,3 +535,16 @@ def test_chat_template_save_loading(self): self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) + + def test_chat_template_tokenizer_interactions(self): + processor = self.get_processor() + if not hasattr(processor, "tokenizer"): + self.skipTest("This processor doesn't have a tokenizer.") + processor.chat_template = "test template" + processor.tokenizer.chat_template = "test_template" + with tempfile.TemporaryDirectory() as tmpdirname: + processor.save_pretrained(tmpdirname, save_naked_chat_template=True) + + reloaded_processor = self.processor_class.from_pretrained(tmpdirname) + self.assertEqual(processor.chat_template, reloaded_processor.chat_template) + self.assertEqual(processor.tokenizer.chat_template, reloaded_processor.tokenizer.chat_template) From 52614f3e698624db15e1823a0cdf1350cab54310 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 14:18:05 +0100 Subject: [PATCH 15/33] Processor clobbers child tokenizer's chat template --- src/transformers/processing_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index f62347793ec..3bbe9156419 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -515,6 +515,7 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): if hasattr(attribute, "_set_processor_class"): attribute._set_processor_class(self.__class__.__name__) if getattr(self, "chat_template", None) is not None and getattr(attribute, "chat_template", None) is not None and kwargs.get("save_naked_chat_template", False): + # If we're saving a chat template file, it clobbers any chat template in the tokenizer attribute.save_pretrained(save_directory, skip_chat_template_save=True) else: attribute.save_pretrained(save_directory) From a200a427b45b6070bd16c264c7a8fe5bdca4795a Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 14:33:47 +0100 Subject: [PATCH 16/33] Processor clobbers child tokenizer's chat template --- tests/test_processing_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 77628378682..2101d0c5625 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -544,7 +544,7 @@ def test_chat_template_tokenizer_interactions(self): processor.tokenizer.chat_template = "test_template" with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname, save_naked_chat_template=True) - reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) - self.assertEqual(processor.tokenizer.chat_template, reloaded_processor.tokenizer.chat_template) + # Check that the processor clobbers the tokenizer's template + self.assertEqual(processor.tokenizer.chat_template, None) From 5c792bc8e88d5b2487e46c2be62f07b2d3959ae0 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 14:33:57 +0100 Subject: [PATCH 17/33] make fixup --- src/transformers/processing_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 3bbe9156419..51a1b346b96 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -514,7 +514,11 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # `AutoProcessor` API. if hasattr(attribute, "_set_processor_class"): attribute._set_processor_class(self.__class__.__name__) - if getattr(self, "chat_template", None) is not None and getattr(attribute, "chat_template", None) is not None and kwargs.get("save_naked_chat_template", False): + if ( + getattr(self, "chat_template", None) is not None + and getattr(attribute, "chat_template", None) is not None + and kwargs.get("save_naked_chat_template", False) + ): # If we're saving a chat template file, it clobbers any chat template in the tokenizer attribute.save_pretrained(save_directory, skip_chat_template_save=True) else: From 5305a87b5b1e8bc67be92202f981f2f91c5eef4c Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 14:42:33 +0100 Subject: [PATCH 18/33] Split processor/tokenizer files to avoid interactions --- src/transformers/processing_utils.py | 14 +++----------- tests/test_processing_common.py | 13 ------------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 51a1b346b96..27032fef82d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -514,15 +514,7 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # `AutoProcessor` API. if hasattr(attribute, "_set_processor_class"): attribute._set_processor_class(self.__class__.__name__) - if ( - getattr(self, "chat_template", None) is not None - and getattr(attribute, "chat_template", None) is not None - and kwargs.get("save_naked_chat_template", False) - ): - # If we're saving a chat template file, it clobbers any chat template in the tokenizer - attribute.save_pretrained(save_directory, skip_chat_template_save=True) - else: - attribute.save_pretrained(save_directory) + attribute.save_pretrained(save_directory) if self._auto_class is not None: # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up. 
@@ -534,7 +526,7 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # If we save using the predefined names, we can load using `from_pretrained` # plus we save chat_template in its own file output_processor_file = os.path.join(save_directory, PROCESSOR_NAME) - output_naked_chat_template_file = os.path.join(save_directory, "chat_template.jinja") + output_naked_chat_template_file = os.path.join(save_directory, "processor_chat_template.jinja") output_chat_template_file = os.path.join(save_directory, "chat_template.json") processor_dict = self.to_dict() @@ -630,7 +622,7 @@ def get_processor_dict( else: processor_file = PROCESSOR_NAME chat_template_file = "chat_template.json" - naked_chat_template_file = "chat_template.jinja" + naked_chat_template_file = "processor_chat_template.jinja" try: # Load from local folder or from cache or download from model Hub and cache resolved_processor_file = cached_file( diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 2101d0c5625..e93ce245f86 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -535,16 +535,3 @@ def test_chat_template_save_loading(self): self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) - - def test_chat_template_tokenizer_interactions(self): - processor = self.get_processor() - if not hasattr(processor, "tokenizer"): - self.skipTest("This processor doesn't have a tokenizer.") - processor.chat_template = "test template" - processor.tokenizer.chat_template = "test_template" - with tempfile.TemporaryDirectory() as tmpdirname: - processor.save_pretrained(tmpdirname, save_naked_chat_template=True) - reloaded_processor = self.processor_class.from_pretrained(tmpdirname) - self.assertEqual(processor.chat_template, reloaded_processor.chat_template) - # Check that the processor clobbers the tokenizer's template - self.assertEqual(processor.tokenizer.chat_template, None) From 36564683db3801917390e3ab6b8df7e1dd791ac2 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 14:47:16 +0100 Subject: [PATCH 19/33] fix test --- tests/test_processing_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index e93ce245f86..54dd124e9b5 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -526,12 +526,12 @@ def test_chat_template_save_loading(self): processor.chat_template = "test template" with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname) - self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file()) + self.assertFalse(Path(tmpdirname, "processor_chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname, save_naked_chat_template=True) - self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file()) + self.assertTrue(Path(tmpdirname, "processor_chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) From e412619bda4b57575789a28ae97099702f159e25 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 
15:23:16 +0100 Subject: [PATCH 20/33] Expand processor tests --- tests/test_processing_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 54dd124e9b5..66bc4add7e4 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -526,6 +526,7 @@ def test_chat_template_save_loading(self): processor.chat_template = "test template" with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname) + self.assertTrue(Path(tmpdirname, "chat_template.json").is_file()) self.assertFalse(Path(tmpdirname, "processor_chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) @@ -533,5 +534,6 @@ def test_chat_template_save_loading(self): with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname, save_naked_chat_template=True) self.assertTrue(Path(tmpdirname, "processor_chat_template.jinja").is_file()) + self.assertFalse(Path(tmpdirname, "chat_template.json").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) From bd8168210872d8ba6274a883c855be2a5b477042 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 15:28:05 +0100 Subject: [PATCH 21/33] Rename arg to "save_raw_chat_template" across all classes --- src/transformers/processing_utils.py | 22 ++++++++++----------- src/transformers/tokenization_utils_base.py | 4 ++-- tests/test_processing_common.py | 2 +- tests/test_tokenization_common.py | 8 ++++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 27032fef82d..ed734333349 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -526,17 +526,17 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # If we save using the predefined names, we can load using `from_pretrained` # plus we save chat_template in its own file output_processor_file = os.path.join(save_directory, PROCESSOR_NAME) - output_naked_chat_template_file = os.path.join(save_directory, "processor_chat_template.jinja") + output_raw_chat_template_file = os.path.join(save_directory, "processor_chat_template.jinja") output_chat_template_file = os.path.join(save_directory, "chat_template.json") processor_dict = self.to_dict() # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict` # to avoid serializing chat template in json config file. 
So let's get it from `self` directly if self.chat_template is not None: - if kwargs.get("save_naked_chat_template", False): - with open(output_naked_chat_template_file, "w", encoding="utf-8") as writer: + if kwargs.get("save_raw_chat_template", False): + with open(output_raw_chat_template_file, "w", encoding="utf-8") as writer: writer.write(self.chat_template) - logger.info(f"chat template saved in {output_naked_chat_template_file}") + logger.info(f"chat template saved in {output_raw_chat_template_file}") else: chat_template_json_string = ( json.dumps({"chat_template": self.chat_template}, indent=2, sort_keys=True) + "\n" @@ -611,18 +611,18 @@ def get_processor_dict( resolved_processor_file = pretrained_model_name_or_path # cant't load chat-template when given a file as pretrained_model_name_or_path resolved_chat_template_file = None - resolved_naked_chat_template_file = None + resolved_raw_chat_template_file = None is_local = True elif is_remote_url(pretrained_model_name_or_path): processor_file = pretrained_model_name_or_path resolved_processor_file = download_url(pretrained_model_name_or_path) # can't load chat-template when given a file url as pretrained_model_name_or_path resolved_chat_template_file = None - resolved_naked_chat_template_file = None + resolved_raw_chat_template_file = None else: processor_file = PROCESSOR_NAME chat_template_file = "chat_template.json" - naked_chat_template_file = "processor_chat_template.jinja" + raw_chat_template_file = "processor_chat_template.jinja" try: # Load from local folder or from cache or download from model Hub and cache resolved_processor_file = cached_file( @@ -658,9 +658,9 @@ def get_processor_dict( _raise_exceptions_for_missing_entries=False, ) - resolved_naked_chat_template_file = cached_file( + resolved_raw_chat_template_file = cached_file( pretrained_model_name_or_path, - naked_chat_template_file, + raw_chat_template_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, @@ -686,8 +686,8 @@ def get_processor_dict( ) # Add chat template as kwarg before returning because most models don't have processor config - if resolved_naked_chat_template_file is not None: - with open(resolved_naked_chat_template_file, "r", encoding="utf-8") as reader: + if resolved_raw_chat_template_file is not None: + with open(resolved_raw_chat_template_file, "r", encoding="utf-8") as reader: chat_template = reader.read() kwargs["chat_template"] = chat_template elif resolved_chat_template_file is not None: diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index ee3bf226ad5..881acbe5600 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2425,14 +2425,14 @@ def save_pretrained( tokenizer_config["extra_special_tokens"] = self.extra_special_tokens tokenizer_config.update(self.extra_special_tokens) - if self.chat_template is not None and not kwargs.get("skip_chat_template_save", False): + if self.chat_template is not None: if isinstance(self.chat_template, dict): # Chat template dicts are saved to the config as lists of dicts with fixed key names. # They will be reconstructed as a single dict during loading. # We're trying to discourage chat template dicts, and they are always # saved in the config, never as single files. 
tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()] - elif kwargs.get("save_chat_template_file", False): + elif kwargs.get("save_raw_chat_template", False): with open(chat_template_file, "w", encoding="utf-8") as f: f.write(self.chat_template) logger.info(f"chat template saved in {chat_template_file}") diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 66bc4add7e4..8d8fc52a08a 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -532,7 +532,7 @@ def test_chat_template_save_loading(self): self.assertEqual(processor.chat_template, reloaded_processor.chat_template) with tempfile.TemporaryDirectory() as tmpdirname: - processor.save_pretrained(tmpdirname, save_naked_chat_template=True) + processor.save_pretrained(tmpdirname, save_raw_chat_template=True) self.assertTrue(Path(tmpdirname, "processor_chat_template.jinja").is_file()) self.assertFalse(Path(tmpdirname, "chat_template.json").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 266690a0464..34c924e4771 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1117,7 +1117,7 @@ def test_chat_template(self): new_tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) with tempfile.TemporaryDirectory() as tmp_dir_name: - tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True) + tokenizer.save_pretrained(tmp_dir_name, save_raw_chat_template=True) chat_template_file = Path(tmp_dir_name) / "chat_template.jinja" self.assertTrue(chat_template_file.is_file()) self.assertEqual(chat_template_file.read_text(), dummy_template) @@ -1543,11 +1543,11 @@ def test_chat_template_dict_saving(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - for save_chat_template_file in (True, False): + for save_raw_chat_template in (True, False): tokenizer.chat_template = {"template1": dummy_template_1, "template2": dummy_template_2} with tempfile.TemporaryDirectory() as tmp_dir_name: - # Test that save_chat_template_file is ignored when there's a dict of multiple templates - tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=save_chat_template_file) + # Test that save_raw_chat_template is ignored when there's a dict of multiple templates + tokenizer.save_pretrained(tmp_dir_name, save_raw_chat_template=save_raw_chat_template) config_dict = json.load(open(os.path.join(tmp_dir_name, "tokenizer_config.json"))) # Assert that chat templates are correctly serialized as lists of dictionaries self.assertEqual( From 50d374a131ff23ac05ddbb4a06bcbb58e3269e97 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 10 Oct 2024 15:43:15 +0100 Subject: [PATCH 22/33] Update processor warning --- src/transformers/processing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index ed734333349..5b4464d8906 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -721,7 +721,7 @@ def get_processor_dict( if "chat_template" in processor_dict and processor_dict["chat_template"] is not None: logger.warning_once( - "Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' " + "Chat templates should be in a 'processor_chat_template.jinja' file 
but found key='chat_template' " "in the processor's config. Make sure to move your template to its own file." ) From a81a7e1543588b8fa22bcb2dabfbaf912bb4f804 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 31 Oct 2024 16:56:27 +0000 Subject: [PATCH 23/33] Move templates to single file --- src/transformers/processing_utils.py | 4 ++-- tests/test_processing_common.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 5b4464d8906..bc8658064b1 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -526,7 +526,7 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): # If we save using the predefined names, we can load using `from_pretrained` # plus we save chat_template in its own file output_processor_file = os.path.join(save_directory, PROCESSOR_NAME) - output_raw_chat_template_file = os.path.join(save_directory, "processor_chat_template.jinja") + output_raw_chat_template_file = os.path.join(save_directory, "chat_template.jinja") output_chat_template_file = os.path.join(save_directory, "chat_template.json") processor_dict = self.to_dict() @@ -622,7 +622,7 @@ def get_processor_dict( else: processor_file = PROCESSOR_NAME chat_template_file = "chat_template.json" - raw_chat_template_file = "processor_chat_template.jinja" + raw_chat_template_file = "chat_template.jinja" try: # Load from local folder or from cache or download from model Hub and cache resolved_processor_file = cached_file( diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 8d8fc52a08a..a8338343fdd 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -527,13 +527,13 @@ def test_chat_template_save_loading(self): with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname) self.assertTrue(Path(tmpdirname, "chat_template.json").is_file()) - self.assertFalse(Path(tmpdirname, "processor_chat_template.jinja").is_file()) + self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname, save_raw_chat_template=True) - self.assertTrue(Path(tmpdirname, "processor_chat_template.jinja").is_file()) + self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file()) self.assertFalse(Path(tmpdirname, "chat_template.json").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) From f9127c17fbb55d5d0db96d51444912ee0c94e4d0 Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 1 Nov 2024 17:22:54 +0000 Subject: [PATCH 24/33] Move templates to single file --- tests/test_processing_common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index a8338343fdd..ab19345df58 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -530,6 +530,10 @@ def test_chat_template_save_loading(self): self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) + if hasattr(processor, "tokenizer"): + # When we don't use 
single-file chat template saving, processor and tokenizer chat templates + # should remain separate + self.assertIsNone(getattr(reloaded_processor.tokenizer, "chat_template", None)) with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname, save_raw_chat_template=True) @@ -537,3 +541,7 @@ def test_chat_template_save_loading(self): self.assertFalse(Path(tmpdirname, "chat_template.json").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, reloaded_processor.chat_template) + # When we save as single files, tokenizers and processors share a chat template, which means + # the reloaded tokenizer should get the chat template as well + if hasattr(processor, "tokenizer"): + self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template) From 3878828df23f715b16cc5b899384389332a8cd4a Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 1 Nov 2024 17:32:31 +0000 Subject: [PATCH 25/33] Improve testing for processor/tokenizer clashes --- tests/test_processing_common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index ab19345df58..7b384b25f1b 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -526,6 +526,7 @@ def test_chat_template_save_loading(self): processor.chat_template = "test template" with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname) + existing_tokenizer_template = getattr(processor.tokenizer, "chat_template", None) self.assertTrue(Path(tmpdirname, "chat_template.json").is_file()) self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) @@ -533,7 +534,7 @@ def test_chat_template_save_loading(self): if hasattr(processor, "tokenizer"): # When we don't use single-file chat template saving, processor and tokenizer chat templates # should remain separate - self.assertIsNone(getattr(reloaded_processor.tokenizer, "chat_template", None)) + self.assertEqual(getattr(reloaded_processor.tokenizer, "chat_template", None), existing_tokenizer_template) with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname, save_raw_chat_template=True) From 74cd295a011683d56dbd5ada788e87328cd8cbc8 Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 1 Nov 2024 17:37:36 +0000 Subject: [PATCH 26/33] Improve testing for processor/tokenizer clashes --- tests/test_processing_common.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 7b384b25f1b..4c4f6fac498 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -523,18 +523,17 @@ def test_prepare_and_validate_optional_call_args(self): def test_chat_template_save_loading(self): processor = self.get_processor() + existing_tokenizer_template = getattr(processor.tokenizer, "chat_template", None) processor.chat_template = "test template" with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname) - existing_tokenizer_template = getattr(processor.tokenizer, "chat_template", None) self.assertTrue(Path(tmpdirname, "chat_template.json").is_file()) self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file()) reloaded_processor = self.processor_class.from_pretrained(tmpdirname) self.assertEqual(processor.chat_template, 
reloaded_processor.chat_template) - if hasattr(processor, "tokenizer"): - # When we don't use single-file chat template saving, processor and tokenizer chat templates - # should remain separate - self.assertEqual(getattr(reloaded_processor.tokenizer, "chat_template", None), existing_tokenizer_template) + # When we don't use single-file chat template saving, processor and tokenizer chat templates + # should remain separate + self.assertEqual(getattr(reloaded_processor.tokenizer, "chat_template", None), existing_tokenizer_template) with tempfile.TemporaryDirectory() as tmpdirname: processor.save_pretrained(tmpdirname, save_raw_chat_template=True) @@ -544,5 +543,4 @@ def test_chat_template_save_loading(self): self.assertEqual(processor.chat_template, reloaded_processor.chat_template) # When we save as single files, tokenizers and processors share a chat template, which means # the reloaded tokenizer should get the chat template as well - if hasattr(processor, "tokenizer"): - self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template) + self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template) From cf577e1aa8be79f19f94b4f624fe86ecced6bac1 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 5 Nov 2024 16:25:55 +0000 Subject: [PATCH 27/33] Extend saving test --- tests/test_tokenization_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 34c924e4771..a34abe4284f 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1557,6 +1557,7 @@ def test_chat_template_dict_saving(self): {"name": "template2", "template": "{{'b'}}"}, ], ) + self.assertFalse(os.path.exists(os.path.join(tmp_dir_name, "chat_template.jinja"))) new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) # Assert that the serialized list is correctly reconstructed as a single dict self.assertEqual(new_tokenizer.chat_template, tokenizer.chat_template) From b97e76de5d57d81d94939aebc2278ac986a967c0 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 5 Nov 2024 16:36:06 +0000 Subject: [PATCH 28/33] Test file priority correctly --- tests/test_tokenization_common.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index a34abe4284f..ee22b900870 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1132,6 +1132,7 @@ def test_chat_template(self): # Check that no error raised new_tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) + @require_jinja def test_chat_template_batched(self): dummy_template = "{% for message in messages %}{{message['role'] + message['content']}}{% endfor %}" @@ -1562,6 +1563,22 @@ def test_chat_template_dict_saving(self): # Assert that the serialized list is correctly reconstructed as a single dict self.assertEqual(new_tokenizer.chat_template, tokenizer.chat_template) + @require_jinja + def test_chat_template_file_priority(self): + dummy_template1 = "a" + dummy_template2 = "b" + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + with tempfile.TemporaryDirectory() as tmp_dir_name: + tokenizer.chat_template = dummy_template1 + tokenizer.save_pretrained(tmp_dir_name, save_raw_chat_template=False) + with open(os.path.join(tmp_dir_name, "chat_template.jinja"), "w") as f: + f.write(dummy_template2) + new_tokenizer = 
tokenizer.from_pretrained(tmp_dir_name) + # Assert the file template clobbers any template in the config + self.assertEqual(new_tokenizer.chat_template, dummy_template2) + def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: From e5b8b76a35fc342de18a14138a8cdda12fefb081 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 5 Nov 2024 17:10:20 +0000 Subject: [PATCH 29/33] make fixup --- tests/test_tokenization_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index ee22b900870..ed09d800ad6 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1132,7 +1132,6 @@ def test_chat_template(self): # Check that no error raised new_tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False) - @require_jinja def test_chat_template_batched(self): dummy_template = "{% for message in messages %}{{message['role'] + message['content']}}{% endfor %}" @@ -1573,7 +1572,7 @@ def test_chat_template_file_priority(self): with tempfile.TemporaryDirectory() as tmp_dir_name: tokenizer.chat_template = dummy_template1 tokenizer.save_pretrained(tmp_dir_name, save_raw_chat_template=False) - with open(os.path.join(tmp_dir_name, "chat_template.jinja"), "w") as f: + with Path(tmp_dir_name, "chat_template.jinja").open("w") as f: f.write(dummy_template2) new_tokenizer = tokenizer.from_pretrained(tmp_dir_name) # Assert the file template clobbers any template in the config From 5b40f040389d779b6493ac31a135d3c9926a6677 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 5 Nov 2024 18:13:34 +0000 Subject: [PATCH 30/33] Don't pop the chat template file before the slow tokenizer gets a look --- src/transformers/tokenization_utils_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 881acbe5600..d22aeacb4e4 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2063,7 +2063,7 @@ def _from_pretrained( from_slow = kwargs.get("from_slow", False) gguf_file = kwargs.get("gguf_file", None) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None - chat_template_file = resolved_vocab_files.pop("chat_template_file", None) + # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be # loaded directly from the GGUF file. @@ -2101,6 +2101,7 @@ def _from_pretrained( init_kwargs = init_configuration # If an independent chat template file exists, it takes priority over template entries in the tokenizer config + chat_template_file = resolved_vocab_files.pop("chat_template_file", None) if chat_template_file is not None: with open(chat_template_file) as chat_template_handle: init_kwargs["chat_template"] = chat_template_handle.read() # Clobbers any template in the config @@ -2301,6 +2302,7 @@ def _from_pretrained( "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" " fine-tuned or trained." 
) + breakpoint() return tokenizer @staticmethod From 2159cdc7f70024c1c773ecab49cc7b9c0a436024 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 5 Nov 2024 18:13:56 +0000 Subject: [PATCH 31/33] Remove breakpoint --- src/transformers/tokenization_utils_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index d22aeacb4e4..35d9344276e 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2302,7 +2302,6 @@ def _from_pretrained( "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" " fine-tuned or trained." ) - breakpoint() return tokenizer @staticmethod From 5b88768b7400e25b24e06b047df7c0cdc24089f0 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 5 Nov 2024 18:14:11 +0000 Subject: [PATCH 32/33] make fixup --- src/transformers/tokenization_utils_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 35d9344276e..2346f3380c4 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2064,7 +2064,6 @@ def _from_pretrained( gguf_file = kwargs.get("gguf_file", None) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None - # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be # loaded directly from the GGUF file. if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None and not gguf_file: From 1187047d3f8be8fef9a0d37a6a4b05aded64c5d4 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 5 Nov 2024 18:31:45 +0000 Subject: [PATCH 33/33] Fix error --- src/transformers/processing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index bc8658064b1..c2646300367 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -721,7 +721,7 @@ def get_processor_dict( if "chat_template" in processor_dict and processor_dict["chat_template"] is not None: logger.warning_once( - "Chat templates should be in a 'processor_chat_template.jinja' file but found key='chat_template' " + "Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' " "in the processor's config. Make sure to move your template to its own file." )
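
Taken together, the series saves a tokenizer or processor chat template to a standalone chat_template.jinja and gives that file priority over any chat_template entry in the config at load time. Below is a minimal sketch of the resulting workflow, assuming the save_raw_chat_template kwarg exercised by the tests in these patches and this branch's loading priority; the flag name and file layout may differ in the final merged API.

    # Sketch only: assumes this PR branch. `save_raw_chat_template` is the kwarg
    # used by the tests in these patches, not a guaranteed public API.
    import os
    import tempfile

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}"

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Write the template to its own chat_template.jinja instead of tokenizer_config.json
        tokenizer.save_pretrained(tmp_dir, save_raw_chat_template=True)
        assert os.path.isfile(os.path.join(tmp_dir, "chat_template.jinja"))

        # On this branch, an on-disk chat_template.jinja clobbers any template left
        # in the config, so editing the file changes what from_pretrained() returns.
        with open(os.path.join(tmp_dir, "chat_template.jinja"), "w", encoding="utf-8") as f:
            f.write("{{ 'edited' }}")
        reloaded = AutoTokenizer.from_pretrained(tmp_dir)
        assert reloaded.chat_template == "{{ 'edited' }}"

This mirrors test_chat_template_file_priority above: the raw .jinja file wins over the config entry, which is the clobbering behaviour _from_pretrained applies when a chat_template_file is resolved.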