Skip to content

Commit

Permalink
Catch error and give a nicer message
Browse files Browse the repository at this point in the history
  • Loading branch information
irenedea committed Nov 10, 2023
1 parent e164b9e commit 7f23c06
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 4 deletions.
21 changes: 18 additions & 3 deletions llmfoundry/utils/checkpoint_conversion_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@

import numpy as np
import sentencepiece as spm
from transformers import AutoTokenizer, PreTrainedTokenizer
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -86,8 +87,8 @@ def get_hf_tokenizer_from_composer_state_dict(
with open(tokenizer_file_path, 'wb') as _tmp_file:
_tmp_file.write(s.serialized_model_proto())

hf_tokenizer = AutoTokenizer.from_pretrained(
tokenizer_save_dir, trust_remote_code=trust_remote_code)
hf_tokenizer = load_tokenizer(tokenizer_save_dir,
trust_remote_code=trust_remote_code)

# remove 'name_or_path'
hf_tokenizer.name_or_path = ''
Expand All @@ -96,6 +97,20 @@ def get_hf_tokenizer_from_composer_state_dict(
return hf_tokenizer


def load_tokenizer(
    tokenizer_save_dir: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Load a Hugging Face tokenizer from a local directory.

    Wraps ``AutoTokenizer.from_pretrained`` to surface a friendlier error
    message when loading fails for a tokenizer that requires custom code.

    Args:
        tokenizer_save_dir: Directory containing the saved tokenizer files.
        trust_remote_code: Whether to allow executing custom tokenizer code
            defined outside the ``transformers`` package.

    Returns:
        The loaded tokenizer (slow or fast variant, whichever is available).

    Raises:
        ValueError: If loading fails; the message includes the original error
            and a hint to pass ``--trust_remote_code``.
    """
    try:
        return AutoTokenizer.from_pretrained(
            tokenizer_save_dir, trust_remote_code=trust_remote_code)
    except ValueError as e:
        # Chain the original exception (`from e`) so the root-cause traceback
        # is preserved instead of being swallowed by the re-raise.
        raise ValueError(
            f'Got error while loading tokenizer with trust_remote_code={trust_remote_code}: {e}. '
            +
            'If accessing a tokenizer defined outside of the transformers module,'
            + ' please use --trust_remote_code.') from e


def _write_zero_bias(weight_name: str, weight_file_path: str,
bias_shape: Union[Tuple[int, ...], int]) -> None:
"""Write zeros for bias when converting MPT to FasterTransformer weights.
Expand Down
5 changes: 4 additions & 1 deletion scripts/inference/convert_composer_to_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from llmfoundry import MPTConfig, MPTForCausalLM
from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict
from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer
from llmfoundry.utils.huggingface_hub_utils import \
edit_files_for_hf_compatibility

Expand Down Expand Up @@ -213,7 +214,9 @@ def convert_composer_to_hf(args: Namespace) -> None:
loaded_hf_model.save_pretrained(local_folder_path)

print(f'Loading tokenizer from {local_folder_path}')
tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)

tokenizer = load_tokenizer(local_folder_path,
trust_remote_code=args.trust_remote_code)
tokenizer.save_pretrained(local_folder_path)

# Only need to edit files for MPT because it has custom code
Expand Down

0 comments on commit 7f23c06

Please sign in to comment.