From d11ba8209bcbd9d1afefa9a468caecdca979c137 Mon Sep 17 00:00:00 2001
From: Irene Dea
Date: Mon, 13 Nov 2023 10:40:11 -0800
Subject: [PATCH] Make TiktokenTokenizerWrapper compatible with
 convert_composer_to_hf.py (#730)

---
 .../utils/checkpoint_conversion_helpers.py  | 25 ++++++++++++++++---
 .../inference/convert_composer_mpt_to_ft.py | 11 ++++++--
 scripts/inference/convert_composer_to_hf.py | 14 +++++++++--
 tests/test_hf_conversion_script.py          |  3 +++
 4 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py
index 0627cec4cd..35e77eab6c 100644
--- a/llmfoundry/utils/checkpoint_conversion_helpers.py
+++ b/llmfoundry/utils/checkpoint_conversion_helpers.py
@@ -19,7 +19,8 @@
 
 import numpy as np
 import sentencepiece as spm
-from transformers import AutoTokenizer, PreTrainedTokenizer
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
 
 log = logging.getLogger(__name__)
 
@@ -35,8 +36,9 @@ def _get_weight_data_type(data_type: str):
 
 # TODO: move this functionality to composer once the bug fixes are upstreamed
 def get_hf_tokenizer_from_composer_state_dict(
-        state_dict: Dict[str, Any],
-        tokenizer_save_dir: Optional[str] = None
+    state_dict: Dict[str, Any],
+    trust_remote_code: bool,
+    tokenizer_save_dir: Optional[str] = None,
 ) -> Optional[PreTrainedTokenizer]:
     if 'state' not in state_dict:
         raise RuntimeError(
@@ -85,7 +87,8 @@ def get_hf_tokenizer_from_composer_state_dict(
             with open(tokenizer_file_path, 'wb') as _tmp_file:
                 _tmp_file.write(s.serialized_model_proto())
 
-    hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_dir)
+    hf_tokenizer = load_tokenizer(tokenizer_save_dir,
+                                  trust_remote_code=trust_remote_code)
 
     # remove 'name_or_path'
     hf_tokenizer.name_or_path = ''
@@ -94,6 +97,20 @@ def get_hf_tokenizer_from_composer_state_dict(
     return hf_tokenizer
 
 
+def load_tokenizer(
+    tokenizer_save_dir: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    try:
+        return AutoTokenizer.from_pretrained(
+            tokenizer_save_dir, trust_remote_code=trust_remote_code)
+    except ValueError as e:
+        raise ValueError(
+            f'Got error while loading tokenizer with trust_remote_code={trust_remote_code}: {e}. '
+            +
+            'If accessing a tokenizer defined outside of the transformers module,'
+            + ' please use --trust_remote_code.')
+
+
 def _write_zero_bias(weight_name: str, weight_file_path: str,
                      bias_shape: Union[Tuple[int, ...], int]) -> None:
     """Write zeros for bias when converting MPT to FasterTransformer weights.
diff --git a/scripts/inference/convert_composer_mpt_to_ft.py b/scripts/inference/convert_composer_mpt_to_ft.py
index 79275030b3..f59eb6005a 100644
--- a/scripts/inference/convert_composer_mpt_to_ft.py
+++ b/scripts/inference/convert_composer_mpt_to_ft.py
@@ -67,6 +67,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
         checkpoint_path: Union[Path, str],
         infer_gpu_num: int,
         save_dir: str,
+        trust_remote_code: bool,
         output_precision: str = 'fp32',
         local_checkpoint_save_location: Optional[Union[Path,
                                                        str]] = None) -> None:
@@ -79,6 +80,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
         checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path,
            or a remote path beginning with ``s3://``, or another backend supported by Composer.
        infer_gpu_num (int): The number of gpus you are planning to use for inference.
+       trust_remote_code (bool): Whether or not to use code outside of the transformers module.
        save_dir (str): Path of the directory to save the checkpoint in FT format.
        output_precision (str, optional): The precision of the output weights saved to the FasterTransformer model. Can be either ``fp32`` or ``fp16``.
        local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally.
@@ -125,7 +127,7 @@ def write_ft_checkpoint_from_composer_checkpoint(
     print('#' * 30)
     print('Extracting HF Tokenizer...')
     hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
-        composer_state_dict)
+        composer_state_dict, trust_remote_code)
 
     if hf_tokenizer is None:
         print('Warning! No HF Tokenizer found!')
@@ -206,6 +208,10 @@ def parse_args() -> Namespace:
         'Data type of weights in the FasterTransformer output model. Input checkpoint weights will be converted to this dtype.',
         choices=['fp32', 'fp16'],
         default='fp32')
+    parser.add_argument(
+        '--trust_remote_code',
+        action='store_true',
+        help='Whether or not to use code outside of transformers module.')
 
     return parser.parse_args()
 
@@ -229,4 +235,5 @@ def parse_args() -> Namespace:
         infer_gpu_num=args.infer_gpu_num,
         save_dir=save_dir,
         output_precision=args.output_precision,
-        local_checkpoint_save_location=args.local_checkpoint_save_location)
+        local_checkpoint_save_location=args.local_checkpoint_save_location,
+        trust_remote_code=args.trust_remote_code)
diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py
index 5625a3b046..1b43762473 100644
--- a/scripts/inference/convert_composer_to_hf.py
+++ b/scripts/inference/convert_composer_to_hf.py
@@ -16,6 +16,7 @@
 
 from llmfoundry import MPTConfig, MPTForCausalLM
 from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict
+from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer
 from llmfoundry.utils.huggingface_hub_utils import \
     edit_files_for_hf_compatibility
 
@@ -23,6 +24,7 @@
 def write_huggingface_pretrained_from_composer_checkpoint(
     checkpoint_path: Union[Path, str],
     output_path: Union[Path, str],
+    trust_remote_code: bool,
     output_precision: str = 'fp32',
     local_checkpoint_save_location: Optional[Union[Path, str]] = None
 ) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]:
@@ -63,6 +65,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
         checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path,
            or a remote path beginning with ``s3://``, or another backend supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
        output_path (Union[Path, str]): Path to the folder to write the output to.
+       trust_remote_code (bool): Whether or not to use code outside of the transformers module.
        output_precision (str, optional): The precision of the output weights saved to `pytorch_model.bin`. Can be one of ``fp32``, ``fp16``, or ``bf16``.
        local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally. If the input
            ``checkpoint_path`` is already a local path, this will be a symlink.
@@ -110,7 +113,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
     print('#' * 30)
     print('Saving HF Tokenizer...')
     hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
-        composer_state_dict)
+        composer_state_dict, trust_remote_code)
     if hf_tokenizer is not None:
         hf_tokenizer.save_pretrained(output_path)
         print(hf_tokenizer)
@@ -157,6 +160,10 @@ def parse_args() -> Namespace:
                         default='fp32')
     parser.add_argument('--hf_repo_for_upload', type=str, default=None)
     parser.add_argument('--test_uploaded_model', action='store_true')
+    parser.add_argument(
+        '--trust_remote_code',
+        action='store_true',
+        help='Whether or not to use code outside of transformers module.')
 
     return parser.parse_args()
 
@@ -179,6 +186,7 @@ def convert_composer_to_hf(args: Namespace) -> None:
     config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
         checkpoint_path=args.composer_path,
         output_path=local_folder_path,
+        trust_remote_code=args.trust_remote_code,
         output_precision=args.output_precision,
         local_checkpoint_save_location=args.local_checkpoint_save_location)
 
@@ -206,7 +214,9 @@ def convert_composer_to_hf(args: Namespace) -> None:
     loaded_hf_model.save_pretrained(local_folder_path)
 
     print(f'Loading tokenizer from {local_folder_path}')
-    tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)
+
+    tokenizer = load_tokenizer(local_folder_path,
+                               trust_remote_code=args.trust_remote_code)
     tokenizer.save_pretrained(local_folder_path)
 
     # Only need to edit files for MPT because it has custom code
diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py
index d2c2a9e1c9..6d5a282993 100644
--- a/tests/test_hf_conversion_script.py
+++ b/tests/test_hf_conversion_script.py
@@ -530,6 +530,7 @@ def test_convert_and_generate(model: str, tmp_path: pathlib.Path):
                      output_precision='fp32',
                      local_checkpoint_save_location=None,
                      hf_repo_for_upload=None,
+                     trust_remote_code=False,
                      test_uploaded_model=False)
     convert_composer_to_hf(args)
 
@@ -577,6 +578,7 @@ def test_convert_and_generate_triton(tmp_path: pathlib.Path):
                      output_precision='fp32',
                      local_checkpoint_save_location=None,
                      hf_repo_for_upload=None,
+                     trust_remote_code=False,
                      test_uploaded_model=False)
     convert_composer_to_hf(args)
 
@@ -631,6 +633,7 @@ def test_convert_and_generate_meta(tmp_path: pathlib.Path):
                      output_precision='fp32',
                      local_checkpoint_save_location=None,
                      hf_repo_for_upload=None,
+                     trust_remote_code=False,
                      test_uploaded_model=False)
     convert_composer_to_hf(args)
 
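
Usage sketch: a minimal example of the new load_tokenizer helper, assuming a
hypothetical local directory ./hf_output_folder that already contains the
tokenizer files written by one of the conversion scripts above (the path is a
placeholder, not part of this change).

    from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer

    # Tokenizers defined outside the transformers module (e.g. the Tiktoken
    # wrapper this patch targets) need trust_remote_code=True; standard
    # Hugging Face tokenizers load with the default of False.
    # NOTE: './hf_output_folder' is a hypothetical placeholder directory.
    tokenizer = load_tokenizer('./hf_output_folder', trust_remote_code=True)
    tokenizer.save_pretrained('./hf_output_folder')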