@pytest.mark.skipif('HUGGING_FACE_HUB_TOKEN' not in os.environ,
                    reason='CI does not have access to llama2')
def test_rope_scaling_override():
    """Check that a dict-valued ``rope_scaling`` override reaches the config.

    Builds an ``hf_causal_lm`` model from the Llama-2 7B config with a
    nested ``rope_scaling`` entry under ``config_overrides``, then verifies
    that metadata can be produced and that the resulting config attribute
    compares equal to a plain dictionary.

    Skipped when ``HUGGING_FACE_HUB_TOKEN`` is absent from the environment.
    """
    # NOTE(review): Hugging Face documents the rope_scaling factor as a
    # float > 1; 0.5 is kept as-is here -- confirm it is intentional.
    expected_rope_scaling = {'type': 'dynamic', 'factor': 0.5}

    # Tiny layer/width values -- presumably to keep construction cheap on CPU.
    overrides = {
        'num_hidden_layers': 2,
        'hidden_size': 32,
        'intermediate_size': 64,
        'rope_scaling': expected_rope_scaling,
    }
    raw_cfg = {
        'name': 'hf_causal_lm',
        'pretrained_model_name_or_path': 'meta-llama/Llama-2-7b-hf',
        'config_overrides': overrides,
        'use_auth_token': True,
        'pretrained': False,
        'init_device': 'cpu',
    }
    model_cfg = om.create(raw_cfg)

    model_cls = COMPOSER_MODEL_REGISTRY[model_cfg.name]
    model = model_cls(model_cfg, tokenizer=None)

    # Producing metadata would raise if the override had not been stored as
    # a proper dictionary on the config.
    model.get_metadata()

    actual_rope_scaling = model.config.rope_scaling
    assert actual_rope_scaling == expected_rope_scaling