From 936d5e566d0db411f7f5a93e3e2bffce4194b109 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 16:14:33 -0800 Subject: [PATCH 1/9] Add flag to only download tokenizers from HF or oras --- llmfoundry/utils/model_download_utils.py | 26 ++++++++++++++++++------ scripts/misc/download_model.py | 8 ++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index 5d8a413d91..b51856d5fc 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -30,6 +30,12 @@ ] PYTORCH_WEIGHTS_PATTERN = 'pytorch_model*.bin*' SAFE_WEIGHTS_PATTERN = 'model*.safetensors*' +TOKENIZER_FILES = [ + 'special_tokens_map.json', + 'tokenizer.json', + 'tokenizer.model', + 'tokenizer_config.json', +] ORAS_PASSWD_PLACEHOLDER = '' ORAS_CLI = 'oras' @@ -45,6 +51,7 @@ def download_from_hf_hub( model: str, save_dir: str, prefer_safetensors: bool = True, + tokenizers_only: bool = False, token: Optional[str] = None, ): """Downloads model files from a Hugging Face Hub model repo. @@ -57,6 +64,7 @@ def download_from_hf_hub( save_dir (str, optional): The local path to the directory where the model files will be downloaded. prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are available. Defaults to True. + tokenizers_only (bool): If true, only download tokenzier files. token (str, optional): The HuggingFace API token. If not provided, the token will be read from the `HUGGING_FACE_HUB_TOKEN` environment variable. @@ -95,10 +103,13 @@ def download_from_hf_hub( ' Please make sure the repo contains either safetensors or pytorch weights.' 
) + allow_patterns = TOKENIZER_FILES if tokenizers_only else None + download_start = time.time() hf_hub.snapshot_download(model, local_dir=save_dir, ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, token=token) download_duration = time.time() - download_start log.info( @@ -221,16 +232,18 @@ def download_from_oras(model: str, config_file: str, credentials_dir: str, save_dir: str, + tokenizer_only: bool, concurrency: int = 10): """Download from an OCI-compliant registry using oras. Args: - model: The name of the model to download. - config_file: Path to a YAML config file that maps model names to registry paths. - credentials_dir: Path to a directory containing credentials for the registry. It is expected to contain three + model (str): The name of the model to download. + config_file (str): Path to a YAML config file that maps model and tokenizer names to registry paths. + credentials_dir (str): Path to a directory containing credentials for the registry. It is expected to contain three files: `username`, `password`, and `registry`, each of which contains the corresponding credential. - save_dir: Path to the directory where files will be downloaded. - concurrency: The number of concurrent downloads to run. + save_dir (str): Path to the directory where files will be downloaded. + tokenizer_only (bool): If true, only download the tokenizer files. + concurrency (int): The number of concurrent downloads to run. 
""" if shutil.which(ORAS_CLI) is None: raise Exception( @@ -253,7 +266,8 @@ def _read_secrets_file(secret_file_path: str,): with open(config_file, 'r', encoding='utf-8') as f: configs = yaml.safe_load(f.read()) - path = configs['models'][model] + config_type = 'tokenizers' if tokenizer_only else 'models' + path = configs[config_type][model] registry = secrets['registry'] def get_oras_cmd(username: Optional[str] = None, diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 1913267e20..90faff64e8 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -56,6 +56,11 @@ def parse_args() -> argparse.Namespace: base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('--save-dir', type=str, required=True) + base_parser.add_argument('--tokenizer-only', + type=bool, + required=False, + default=False, + action='store_true') # Add subparser for downloading from Hugging Face Hub. hf_parser = subparsers.add_parser('hf', parents=[base_parser]) @@ -85,6 +90,9 @@ def parse_args() -> argparse.Namespace: download_from = args.download_from if download_from == 'http': + if args.tokenizer_only == True: + raise ValueError( + 'tokenizer-only is not currently supported for http.') try: download_from_http_fileserver(args.url, args.save_dir, args.ignore_cert) From f7f5846620b8cd1b06e789b1f7bc9eec38b33589 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 16:44:07 -0800 Subject: [PATCH 2/9] fix --- scripts/misc/download_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 90faff64e8..79f7576b5c 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -57,8 +57,6 @@ def parse_args() -> argparse.Namespace: base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('--save-dir', type=str, required=True) base_parser.add_argument('--tokenizer-only', - type=bool, - required=False, 
default=False, action='store_true') From 800ccbf22a9940f46e31f4d70fa6d23194a43100 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 16:50:15 -0800 Subject: [PATCH 3/9] fix --- scripts/misc/download_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 79f7576b5c..a81096084a 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -115,7 +115,8 @@ def parse_args() -> argparse.Namespace: download_from_hf_hub(args.model, save_dir=args.save_dir, token=args.token, + tokenizers_only=args.tokenizer_only, prefer_safetensors=args.prefer_safetensors) elif download_from == 'oras': download_from_oras(args.model, args.config_file, args.credentials_dir, - args.save_dir, args.concurrency) + args.save_dir, args.tokenizer_only, args.concurrency) From 08529f710286a242b465015e4367100ab5930c25 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 17:10:29 -0800 Subject: [PATCH 4/9] update doc string for oras --- scripts/misc/download_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index a81096084a..2ce04bef65 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -7,7 +7,8 @@ python download_model.py hf --model mosaicml/mpt-7b --save-dir --token Download from ORAS registry: - python download_model.py oras --registry --path mosaicml/mpt-7b --save-dir + python download_model.py oras --model mosaicml/mpt-7b --config-file \ + --credentials-dir --save-dir Download from an HTTP file server: python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir From 17e261f9bc55fec071b94a0c94355c5479b7b7f3 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 17:23:59 -0800 Subject: [PATCH 5/9] update doc string for http --- scripts/misc/download_model.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 2ce04bef65..40dfc4775d 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -11,7 +11,7 @@ --credentials-dir --save-dir Download from an HTTP file server: - python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir + python download_model.py http --url https://server.com/path --save-dir Download from an HTTP file server with fallback to Hugging Face Hub: python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir \ From da27b1fcbe0dc5cfdfae91584f61545d772b19d3 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 18:03:49 -0800 Subject: [PATCH 6/9] fix tests --- llmfoundry/utils/model_download_utils.py | 2 +- tests/utils/test_model_download_utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index b51856d5fc..d95935d875 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -102,7 +102,7 @@ def download_from_hf_hub( f'No supported model weights found in repo {model}.' + ' Please make sure the repo contains either safetensors or pytorch weights.' 
) - + allow_patterns = TOKENIZER_FILES if tokenizers_only else None download_start = time.time() diff --git a/tests/utils/test_model_download_utils.py b/tests/utils/test_model_download_utils.py index 471a39dcdb..14749bdcd9 100644 --- a/tests/utils/test_model_download_utils.py +++ b/tests/utils/test_model_download_utils.py @@ -110,6 +110,7 @@ def test_download_from_hf_hub_weights_pref(mock_list_repo_files: MagicMock, mock_snapshot_download.assert_called_once_with( test_repo_id, local_dir=save_dir, + allow_patterns=None, ignore_patterns=expected_ignore_patterns, token=None) From 7d259366cce4807daa09b27a635377a5f9af06d6 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 18:24:19 -0800 Subject: [PATCH 7/9] Apply suggestions from code review Co-authored-by: Jerry Chen --- llmfoundry/utils/model_download_utils.py | 6 +++--- scripts/misc/download_model.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index d95935d875..e076bd5b8f 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -64,7 +64,7 @@ def download_from_hf_hub( save_dir (str, optional): The local path to the directory where the model files will be downloaded. prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are available. Defaults to True. - tokenizers_only (bool): If true, only download tokenzier files. + tokenizer_only (bool): If true, only download tokenizer files. token (str, optional): The HuggingFace API token. If not provided, the token will be read from the `HUGGING_FACE_HUB_TOKEN` environment variable. @@ -103,7 +103,7 @@ def download_from_hf_hub( ' Please make sure the repo contains either safetensors or pytorch weights.' 
) - allow_patterns = TOKENIZER_FILES if tokenizers_only else None + allow_patterns = TOKENIZER_FILES if tokenizer_only else None download_start = time.time() hf_hub.snapshot_download(model, @@ -232,7 +232,7 @@ def download_from_oras(model: str, config_file: str, credentials_dir: str, save_dir: str, - tokenizer_only: bool, + tokenizer_only: bool = False, concurrency: int = 10): """Download from an OCI-compliant registry using oras. diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 40dfc4775d..cdb1da7506 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -11,7 +11,7 @@ --credentials-dir --save-dir Download from an HTTP file server: - python download_model.py http --url https://server.com/path --save-dir + python download_model.py http --url https://server.com/models/mosaicml/mpt-7b/ --save-dir Download from an HTTP file server with fallback to Hugging Face Hub: python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir \ @@ -116,8 +116,8 @@ def parse_args() -> argparse.Namespace: download_from_hf_hub(args.model, save_dir=args.save_dir, token=args.token, - tokenizers_only=args.tokenizer_only, + tokenizer_only=args.tokenizer_only, prefer_safetensors=args.prefer_safetensors) elif download_from == 'oras': download_from_oras(args.model, args.config_file, args.credentials_dir, - args.save_dir, args.tokenizer_only, args.concurrency) + args.save_dir, tokenizers_only=args.tokenizer_only, args.concurrency) From ee5f2bf27c345e1e6ebdbbd0fa758475d57eb9e8 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 18:28:20 -0800 Subject: [PATCH 8/9] Apply suggestions from code review Co-authored-by: Jerry Chen --- scripts/misc/download_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index cdb1da7506..912d2ea5d4 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py 
@@ -89,7 +89,7 @@ def parse_args() -> argparse.Namespace: download_from = args.download_from if download_from == 'http': - if args.tokenizer_only == True: + if args.tokenizer_only: raise ValueError( 'tokenizer-only is not currently supported for http.') try: From 854c31cf9752cfd2d600c1e89347d0ff1c24d6e4 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 22 Jan 2024 18:40:02 -0800 Subject: [PATCH 9/9] fix code quality --- llmfoundry/utils/model_download_utils.py | 4 ++-- scripts/misc/download_model.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index e076bd5b8f..07c84a85c8 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -51,7 +51,7 @@ def download_from_hf_hub( model: str, save_dir: str, prefer_safetensors: bool = True, - tokenizers_only: bool = False, + tokenizer_only: bool = False, token: Optional[str] = None, ): """Downloads model files from a Hugging Face Hub model repo. @@ -102,7 +102,7 @@ def download_from_hf_hub( f'No supported model weights found in repo {model}.' + ' Please make sure the repo contains either safetensors or pytorch weights.' ) - + allow_patterns = TOKENIZER_FILES if tokenizer_only else None download_start = time.time() diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 912d2ea5d4..13a63ce55e 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -119,5 +119,9 @@ def parse_args() -> argparse.Namespace: tokenizer_only=args.tokenizer_only, prefer_safetensors=args.prefer_safetensors) elif download_from == 'oras': - download_from_oras(args.model, args.config_file, args.credentials_dir, - args.save_dir, tokenizers_only=args.tokenizer_only, args.concurrency) + download_from_oras(args.model, + args.config_file, + args.credentials_dir, + args.save_dir, + tokenizer_only=args.tokenizer_only, + concurrency=args.concurrency)