diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml new file mode 100644 index 0000000000..0bf3968753 --- /dev/null +++ b/.github/workflows/smoketest.yaml @@ -0,0 +1,41 @@ +name: Smoketest +on: + push: + branches: + - main + - release/* + pull_request: + branches: + - main + - release/* + workflow_dispatch: +# Cancel old runs when a new commit is pushed to the same branch if not on main or dev +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' && github.ref != 'refs/heads/dev' }} +defaults: + run: + working-directory: . +jobs: + smoketest: + runs-on: ubuntu-20.04 + timeout-minutes: 10 + strategy: + matrix: + python_version: + - "3.9" + - "3.10" + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + - name: Setup + run: | + set -ex + python -m pip install --upgrade 'pip<23' wheel + python -m pip install --upgrade . + python -m pip install pytest==7.2.1 pytest_codeblocks==0.16.1 + - name: Run checks + run: | + pytest tests/test_smoketest.py diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index 85f96aadb9..87504d26b3 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -4,6 +4,26 @@ import torch try: + import warnings + + # bitsandbytes is a very noisy library. A lot of it is print statements that we can't easily suppress, + # but we can at least suppress a bunch of spurious warnings. 
+ warnings.filterwarnings('ignore', + category=UserWarning, + module='bitsandbytes') + + import logging + + from llmfoundry.utils.logging_utils import SpecificWarningFilter + + # Filter out Hugging Face warning for not using a pinned revision of the model + hf_dynamic_modules_logger = logging.getLogger( + 'transformers.dynamic_module_utils') + new_files_warning_filter = SpecificWarningFilter( + 'A new version of the following files was downloaded from') + + hf_dynamic_modules_logger.addFilter(new_files_warning_filter) + # Before importing any transformers models, we need to disable transformers flash attention if # we are in an environment with flash attention version <2. Transformers hard errors on a not properly # gated import otherwise. diff --git a/llmfoundry/data/denoising.py b/llmfoundry/data/denoising.py index 8ccf7f25e9..9c14f21751 100644 --- a/llmfoundry/data/denoising.py +++ b/llmfoundry/data/denoising.py @@ -527,7 +527,6 @@ def build_text_denoising_dataloader( ) token_counting_func = get_tokens_per_batch_func( - pad_token_id=tokenizer.pad_token_id, decoder_only=cfg.mixture_of_denoisers.decoder_only_format) return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index b19cab841f..7a29d1dfed 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -216,8 +216,7 @@ def build_finetuning_dataloader(cfg: DictConfig, timeout=cfg.get('timeout', 0), ) - token_counting_func = get_tokens_per_batch_func( - pad_token_id=tokenizer.pad_token_id) + token_counting_func = get_tokens_per_batch_func() return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 4b80ffef54..21c3558b2d 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -43,6 +43,8 @@ def 
preprocessing_fn(example: Dict) -> Dict[str, str]: from streaming import StreamingDataset from transformers import PreTrainedTokenizerBase +from llmfoundry.utils.logging_utils import SpecificWarningFilter + log = logging.getLogger(__name__) __all__ = ['dataset_constructor'] @@ -236,7 +238,7 @@ def wrapper(func: Callable) -> Callable: def print_registered_tasks(self) -> None: tasks = sorted(self._task_preprocessing_registry.keys()) - print('\n'.join(tasks)) + log.info('\n'.join(tasks)) def get_preprocessing_fn_from_dict( self, mapping: Union[Dict, DictConfig] @@ -365,6 +367,15 @@ def build_from_hf( with dist.local_rank_zero_download_and_wait(signal_file_path): pass + hf_tokenization_logger = logging.getLogger( + 'transformers.tokenization_utils_base') + sequence_length_warning_filter = SpecificWarningFilter( + 'Token indices sequence length is longer than the specified maximum sequence length' + ) + + # We will trim examples later in the collate_fn, so we want to silence this warning from Hugging Face + hf_tokenization_logger.addFilter(sequence_length_warning_filter) + error: Optional[Exception] = None filtered_dataset = None try: @@ -433,6 +444,9 @@ def filter_long_or_empty_examples(example: Dict) -> bool: log.error('Error during data prep') raise error log.debug('All ranks finished data prep') + + hf_tokenization_logger.removeFilter(sequence_length_warning_filter) + assert filtered_dataset is not None return filtered_dataset diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 51fd6b38dc..083cd48069 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -306,15 +306,13 @@ def build_text_dataloader( # and if tokenizing on the fly, we require that the tokenizer has a pad token. 
token_counting_func = None if tokenizer.pad_token_id is not None: - token_counting_func = get_tokens_per_batch_func( - pad_token_id=tokenizer.pad_token_id) + token_counting_func = get_tokens_per_batch_func() return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func) -def get_tokens_per_batch_func(pad_token_id: int, - decoder_only: bool = True - ) -> Callable[[Batch], int]: +def get_tokens_per_batch_func( + decoder_only: bool = True) -> Callable[[Batch], int]: """Returns a callable that counts the number of tokens in a batch. Args: @@ -327,25 +325,24 @@ def get_tokens_per_batch_func(pad_token_id: int, """ def get_num_samples_in_batch(batch: Batch) -> int: - if not isinstance(batch, Mapping) or 'input_ids' not in batch: + if not isinstance(batch, Mapping) or 'attention_mask' not in batch: raise ValueError( - 'get_tokens_per_batch_func() requires a batch with an input_ids key' + 'get_tokens_per_batch_func() requires a batch with an attention_mask key' ) - if not decoder_only and 'decoder_input_ids' not in batch: + if not decoder_only and 'decoder_attention_mask' not in batch: raise ValueError( - 'get_tokens_per_batch_func() for encoder decoder requires a batch with a decoder_input_ids key' + 'get_tokens_per_batch_func() for encoder decoder requires a batch with a decoder_attention_mask key' ) # Count number of non padding tokens in batch - input_ids_tokens = int( - torch.sum(batch['input_ids'] != pad_token_id).item()) + input_ids_tokens = int(torch.sum(batch['attention_mask']).item()) # For encoder decoder models only decoder_input_ids_tokens = 0 if not decoder_only: decoder_input_ids_tokens = int( - torch.sum(batch['decoder_input_ids'] != pad_token_id).item()) + torch.sum(batch['decoder_attention_mask']).item()) return input_ids_tokens + decoder_input_ids_tokens diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index d52633a09b..fcac57d817 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ 
b/llmfoundry/models/hf/hf_causal_lm.py @@ -20,7 +20,7 @@ from composer.utils import dist from omegaconf import DictConfig from torch import nn -from transformers import (AutoConfig, AutoModelForCausalLM, +from transformers import (AutoConfig, AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizerBase) from llmfoundry.models.hf.hf_fsdp import hf_get_init_device @@ -102,20 +102,27 @@ def __init__(self, om_model_config: Union[DictConfig, 'use_flash_attention_2 is set to True, but flash-attention 2 is not installed. ' + 'Please install flash_attn==2.3.2`.') + requested_attention_implementation = 'flash_attention_2' if use_flash_attention_2 else 'eager' config = AutoConfig.from_pretrained( om_model_config.pretrained_model_name_or_path, trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, + attn_implementation=requested_attention_implementation, + use_cache= + False, # Necessary due to https://github.com/huggingface/transformers/issues/28056 ) - # This is not how you are supposed to set this, but transformers currently only - # supports enabling flash attention 2 when using the from_pretrained API. - # We need to support it for both from_pretrained and from_config, so we have to - # set the private attribute here. This will just skip all of transformers' - # validation logic that it is ok to use flash attention 2, so we check - # whether it is installed above, and whether the chosen config supports it here. - # https://github.com/huggingface/transformers/issues/26878 - config._flash_attn_2_enabled = use_flash_attention_2 + # This is not ideal, however Hugging Face's _autoset_attn_implementation function + # forces you to load the model in fp16/bf16 if you want to use flash attention. Rather than loading + # the model and then casting it back to fp32, we are monkeypatching their check. 
+ # https://github.com/huggingface/transformers/issues/28052 + def _autoset_attn_implementation_monkeypatch( + cls, config, *args, **kwargs): # type: ignore + config._attn_implementation = requested_attention_implementation + return config + + PreTrainedModel._autoset_attn_implementation = classmethod( + _autoset_attn_implementation_monkeypatch) # set config overrides for k, v in om_model_config.get('config_overrides', {}).items(): @@ -184,7 +191,8 @@ def __init__(self, om_model_config: Union[DictConfig, trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, load_in_8bit=load_in_8bit, - config=config) + config=config, + ) else: model = AutoModelForCausalLM.from_config( config, diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index 2d84599772..9d0ce7deb3 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -39,8 +39,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): def get_metrics(self, is_train: bool = False): if is_train: - raise NotImplementedError( - 'You cannot use inference wrappers for training') + metrics = None else: metrics = self.eval_metrics @@ -55,6 +54,7 @@ def rebatch(self, batch: Batch): return batch def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): + padding_tok = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id # If the batch mode is generate, we will generate a requested number of tokens using the underlying # model's generate function. Extra generation kwargs can be passed in via the batch. 
Strings will # be returned from eval_forward @@ -80,8 +80,7 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): [output_logits, next_logit_tensor.reshape(1, -1)]) padding = torch.nn.functional.one_hot( - torch.full((seqlen - output_logits.shape[0],), - self.tokenizer.pad_token_id), + torch.full((seqlen - output_logits.shape[0],), padding_tok), num_classes=self.tokenizer.vocab_size) output_logits = torch.cat([output_logits, padding]) output_logits_batch.append(output_logits) diff --git a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py index 609112b944..39de2ba59c 100644 --- a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py +++ b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py @@ -5,8 +5,9 @@ import logging import os +import random from time import sleep -from typing import Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import torch from composer.core.types import Batch @@ -23,6 +24,11 @@ 'OpenAIChatAPIEvalWrapper', ] +if TYPE_CHECKING: + from openai.types.chat.chat_completion import ChatCompletion + from openai.types.completion import Completion + from openai.types.completion_choice import Logprobs + MAX_RETRIES = 10 @@ -30,6 +36,9 @@ class OpenAIEvalInterface(InferenceAPIEvalWrapper): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) + assert os.getenv( + 'OPENAI_API_KEY' + ) is not None, 'No OpenAI API Key found. Ensure it is saved as an environmental variable called OPENAI_API_KEY.' 
try: import openai except ImportError as e: @@ -37,13 +46,13 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: extra_deps_group='openai', conda_package='openai', conda_channel='conda-forge') from e - openai.api_key = os.getenv('OPENAI_API_KEY') + self.client = openai.OpenAI() self.model_name = model_cfg['version'] def generate_completion(self, prompt: str, num_tokens: int): raise NotImplementedError() - def process_result(self, completion: Optional[dict]): + def process_result(self, completion): # pyright: ignore raise NotImplementedError() def get_next_token_logit_tensor(self, prompt: str, num_tokens: int = 1): @@ -52,7 +61,7 @@ def get_next_token_logit_tensor(self, prompt: str, num_tokens: int = 1): def try_generate_completion(self, prompt: str, num_tokens: int): try: - from openai.error import RateLimitError + from openai import APITimeoutError, RateLimitError except ImportError as e: raise MissingConditionalImportError( extra_deps_group='openai', @@ -60,19 +69,24 @@ def try_generate_completion(self, prompt: str, num_tokens: int): conda_channel='conda-forge') from e tries = 0 completion = None + delay = 1 while tries < MAX_RETRIES: tries += 1 try: - completion = self.generate_completion(prompt, num_tokens) break except RateLimitError as e: - if 'You exceeded your current quota' in str(e._message): + if 'You exceeded your current quota' in str( + e._message): # pyright: ignore raise e - sleep(60) + delay *= 2 * (1 + random.random()) + sleep(delay) continue - except Exception: + except APITimeoutError as e: + delay *= 2 * (1 + random.random()) + sleep(delay) continue + return completion @@ -80,17 +94,16 @@ class OpenAIChatAPIEvalWrapper(OpenAIEvalInterface): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) - try: - import openai - except ImportError as e: - raise MissingConditionalImportError( - extra_deps_group='openai', - conda_package='openai', - conda_channel='conda-forge') 
from e - self.generate_completion = lambda prompt, num_tokens: openai.ChatCompletion.create( - self.model_name, + self.generate_completion = lambda prompt, num_tokens: self.client.chat.completions.create( + model=self.model_name, messages=[{ + 'role': + 'system', + 'content': + model_cfg.get('system_role_prompt', + 'Please complete the following text: ') + }, { 'role': 'user', 'content': prompt }], @@ -162,6 +175,7 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): # than what the continuation would expect. # Get around this issue by retokenizing the batch to remove spacing from the continuation as well as # decoding the whole continuation at once. + padding_tok = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id output_logits_batch = [] batch = self.rebatch(batch) for tokens, cont_idxs in zip(batch['input_ids'], @@ -182,20 +196,21 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): if next_logit_tensor is not None: output_logits = torch.cat([output_logits, next_logit_tensor]) padding = torch.nn.functional.one_hot( - torch.full((seqlen - output_logits.shape[0],), - self.tokenizer.pad_token_id), + torch.full((seqlen - output_logits.shape[0],), padding_tok), num_classes=self.tokenizer.vocab_size) output_logits = torch.cat([output_logits, padding]) output_logits_batch.append(output_logits) return torch.stack(output_logits_batch).to(batch['input_ids'].device) - def process_result(self, completion: Optional[dict]): - assert isinstance(completion, dict) - if len(completion['choices']) > 0: + def process_result(self, completion: Optional['ChatCompletion']): + if completion is None: + raise ValueError("Couldn't generate model output") + + if len(completion.choices) > 0: tensors = [] - for t in self.tokenizer(completion['choices'][0]['message'] - ['content'])['input_ids']: + for t in self.tokenizer( + completion.choices[0].message.content)['input_ids']: tensors.append( 
self.tokenizer.construct_logit_tensor( {self.tokenizer.decode([t]): 0.0})) @@ -213,29 +228,26 @@ class OpenAICausalLMEvalWrapper(OpenAIEvalInterface): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) - try: - import openai - except ImportError as e: - raise MissingConditionalImportError( - extra_deps_group='openai', - conda_package='openai', - conda_channel='conda-forge') from e - - self.generate_completion = lambda prompt, num_tokens: openai.Completion.create( - engine=self.model_name, + # TODO: this will be deprecated + self.generate_completion = lambda prompt, num_tokens: self.client.completions.create( + model=self.model_name, prompt=prompt, - max_tokens=1, + max_tokens=num_tokens, logprobs=5, temperature=0.0) - def process_result(self, completion: Optional[dict]): + def process_result(self, completion: Optional['Completion']): if completion is None: raise ValueError("Couldn't generate model output") - assert isinstance(completion, dict) - if len(completion['choices'][0]['logprobs']['top_logprobs']) > 0: + if TYPE_CHECKING: + assert isinstance(completion, Completion) + assert isinstance(completion.choices[0].logprobs, Logprobs) + assert isinstance(completion.choices[0].logprobs.top_logprobs, list) + + if len(completion.choices[0].logprobs.top_logprobs[0]) > 0: tensor = self.tokenizer.construct_logit_tensor( - dict(completion['choices'][0]['logprobs']['top_logprobs'][0])) + dict(completion.choices[0].logprobs.top_logprobs[0])) return tensor else: # the model sometimes stops early even though we are still requesting tokens! 
diff --git a/llmfoundry/models/layers/ffn.py b/llmfoundry/models/layers/ffn.py index e18e611ca6..560e8c31fc 100644 --- a/llmfoundry/models/layers/ffn.py +++ b/llmfoundry/models/layers/ffn.py @@ -4,7 +4,9 @@ """MPT Blocks used for the MPT Model.""" import logging -from typing import Any, Optional, Union +from copy import deepcopy +from functools import partial +from typing import Any, Callable, Optional, Union import torch import torch.nn as nn @@ -18,6 +20,36 @@ log = logging.getLogger(__name__) +_FFN_ACT_FN_DEFAULT = { + 'name': 'gelu', + 'approximate': 'none', +} + + +def resolve_ffn_act_fn( + config: Optional[dict] = None,) -> Callable[[torch.Tensor], torch.Tensor]: + """Resolve the activation function for the feed-forward network. + + Args: + config (Optional[dict]): The configuration dictionary for the activation function. + The dict config must specify the 'name' of a torch.nn.functional activation + function. All other key-value pairs are bound to the function as keyword arguments via functools.partial. + + Returns: + Callable[[torch.Tensor], torch.Tensor]: The activation function. 
+ """ + if config is None: + config = _FFN_ACT_FN_DEFAULT + config = deepcopy(config) + name = config.pop('name') + if not hasattr(torch.nn.functional, name): + raise ValueError(f'Unrecognised activation function name ({name}).') + act = getattr(torch.nn.functional, name) + return partial(act, **config) + + +_DEFAULT_ACT_FN = resolve_ffn_act_fn(_FFN_ACT_FN_DEFAULT) + def resolve_ffn_hidden_size( d_model: int, @@ -55,6 +87,7 @@ def __init__( expansion_ratio: Union[int, float], fc_type: str = 'torch', ffn_hidden_size: Optional[int] = None, + act_fn: Callable[[torch.Tensor], torch.Tensor] = _DEFAULT_ACT_FN, device: Optional[str] = None, bias: bool = True, ): @@ -72,7 +105,7 @@ def __init__( ffn_hidden_size, **self.fc_kwargs, ) - self.act = nn.GELU(approximate='none') + self.act = act_fn self.down_proj = FC_CLASS_REGISTRY[fc_type]( ffn_hidden_size, d_model, @@ -92,6 +125,7 @@ def __init__( expansion_ratio: Union[int, float], fc_type: str = 'torch', ffn_hidden_size: Optional[int] = None, + act_fn: Callable[[torch.Tensor], torch.Tensor] = _DEFAULT_ACT_FN, device: Optional[str] = None, bias: bool = True, ): @@ -100,6 +134,7 @@ def __init__( expansion_ratio=expansion_ratio, fc_type=fc_type, ffn_hidden_size=ffn_hidden_size, + act_fn=act_fn, device=device, bias=bias, ) @@ -128,6 +163,7 @@ def build_ffn( expansion_ratio: Union[int, float], fc_type: str = 'torch', ffn_hidden_size: Optional[int] = None, + ffn_act_fn: Optional[dict] = None, device: Optional[str] = None, bias: bool = True, **kwargs: Any, @@ -142,6 +178,7 @@ def build_ffn( d_model=d_model, expansion_ratio=expansion_ratio, fc_type=fc_type, + act_fn=resolve_ffn_act_fn(ffn_act_fn), ffn_hidden_size=ffn_hidden_size, device=device, bias=bias, @@ -150,6 +187,10 @@ def build_ffn( assert te is not None ffn_hidden_size = resolve_ffn_hidden_size(d_model, expansion_ratio, ffn_hidden_size) + if ffn_act_fn is not None: + raise ValueError( + f'Transformer Engine block does not support custom activation functions.' 
+ ) return te.LayerNormMLP( hidden_size=d_model, ffn_hidden_size=ffn_hidden_size, diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 2ecc726aa3..913c39d44f 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -295,6 +295,10 @@ def _validate_config(self) -> None: self.ffn_config['fc_type'] = self.fc_type elif self.ffn_config['ffn_type'] == 'te_ln_mlp': self.ffn_config['bias'] = not self.no_bias + if 'ffn_act_fn' in self.ffn_config.keys(): + raise ValueError( + f'Transformer Engine block does not support custom activation functions.' + ) if not self.use_pad_tok_in_ffn: try: from flash_attn.bert_padding import unpad_input, pad_input # type: ignore # yapf: disable # isort: skip diff --git a/llmfoundry/utils/logging_utils.py b/llmfoundry/utils/logging_utils.py new file mode 100644 index 0000000000..081a06fefb --- /dev/null +++ b/llmfoundry/utils/logging_utils.py @@ -0,0 +1,21 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging + + +class SpecificWarningFilter(logging.Filter): + + def __init__(self, message_to_suppress: str): + """Filter out a specific warning message based on its content. + + This can be useful for filtering out specific warning messages from third party packages. + + Args: + message_to_suppress (str): The warning message to suppress. 
+ """ + super().__init__() + self.message_to_suppress = message_to_suppress + + def filter(self, record: logging.LogRecord) -> bool: + return self.message_to_suppress not in record.getMessage() diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index 179b078fb6..dbccee83ba 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -12,8 +12,8 @@ command: | # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME run_name: openai-eval -# gpu_num: # -# gpu_type: # +gpu_num: # +gpu_type: # cluster: # replace with your cluster here! image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest @@ -25,41 +25,22 @@ parameters: device_eval_batch_size: 4 models: - - model_name: openai/davinci - model: - name: openai_causal_lm - version: davinci - tokenizer: - name: openai - kwargs: - name: davinci - - - model_name: openai/ada - model: - name: openai_causal_lm - version: ada - tokenizer: - name: openai - kwargs: - name: ada - - - model_name: openai/gpt-4 + model_name: openai/gpt-3.5-turbo model: name: openai_chat - version: gpt-4 + version: gpt-3.5-turbo tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-4 + model_name: gpt-3.5-turbo - - model_name: openai/gpt-3.5-turbo + model_name: openai/davinci model: - name: openai_chat - version: gpt-3.5-turbo + name: openai_causal_lm + version: davinci tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-3.5-turbo + model_name: davinci - icl_tasks: 'eval/yamls/lm_tasks.yaml' - eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml' + icl_tasks: 'eval/yamls/lm_tasks_v0.2.yaml' diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 214fb49abc..383d3571a1 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -28,6 +28,8 @@ from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device) +log = logging.getLogger(__name__) + def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, num_retries: int) -> 
ComposerModel: @@ -67,7 +69,7 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, if retries >= num_retries: raise e else: - print( + log.info( f'Got exception {str(e)} while loading model {model_cfg.name}. {num_retries-retries} retries remaining' ) @@ -91,7 +93,7 @@ def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, if retries >= num_retries: raise e else: - print( + log.info( f'Got exception {str(e)} while loading model {model_cfg.name}. {num_retries-retries} retries remaining' ) @@ -119,7 +121,7 @@ def evaluate_model( metadata: Optional[Dict[str, str]], ): - print(f'Evaluating model: {model_cfg.model_name}', flush=True) + log.info(f'Evaluating model: {model_cfg.model_name}') # Build tokenizer and model tokenizer_cfg: Dict[str, Any] = om.to_container(model_cfg.tokenizer, @@ -194,7 +196,7 @@ def evaluate_model( assert composer_model is not None - print(f'Building trainer for {model_cfg.model_name}...') + log.info(f'Building trainer for {model_cfg.model_name}...') trainer = Trainer( run_name=run_name, seed=seed, @@ -211,10 +213,10 @@ def evaluate_model( python_log_level=python_log_level, ) - print('Logging config') + log.info('Logging config') log_config(loggers_cfg) - print(f'Starting eval for {model_cfg.model_name}...') + log.info(f'Starting eval for {model_cfg.model_name}...') if torch.cuda.is_available(): torch.cuda.synchronize() a = time.time() @@ -223,7 +225,7 @@ def evaluate_model( torch.cuda.synchronize() b = time.time() - print(f'Ran {model_cfg.model_name} eval in: {b-a} seconds') + log.info(f'Ran {model_cfg.model_name} eval in: {b-a} seconds') return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) @@ -238,7 +240,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: must_exist=False, default_value=None) if eval_gauntlet_config: - print( + warnings.warn( 'Use of the key `model_gauntlet` is deprecated, please use the key `eval_gauntlet`' ) diff --git 
a/scripts/eval/yamls/lm_tasks.yaml b/scripts/eval/yamls/lm_tasks_v0.2.yaml similarity index 69% rename from scripts/eval/yamls/lm_tasks.yaml rename to scripts/eval/yamls/lm_tasks_v0.2.yaml index a8b00ba75c..32d4c9f718 100644 --- a/scripts/eval/yamls/lm_tasks.yaml +++ b/scripts/eval/yamls/lm_tasks_v0.2.yaml @@ -1,31 +1,26 @@ icl_tasks: - label: jeopardy - dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [10] + dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + num_fewshot: [3] icl_task_type: language_modeling continuation_delimiter: "\nAnswer: " # this separates questions from answers has_categories: true - label: bigbench_qa_wikidata - dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [10] + dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl + num_fewshot: [3] icl_task_type: language_modeling - - label: lambada_openai - dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl - num_fewshot: [0] + label: bigbench_dyck_languages + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl + num_fewshot: [5] icl_task_type: language_modeling - - label: bigbench_conlang_translation - dataset_uri: eval/local_data/language_understanding/bigbench_conlang_translation.jsonl + label: lambada_openai + dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl num_fewshot: [0] icl_task_type: language_modeling -- - label: bigbench_dyck_languages - dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl - num_fewshot: [10] - icl_task_type: language_modeling - label: bigbench_cs_algorithms dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_cs_algorithms.jsonl @@ -34,35 +29,30 @@ icl_tasks: - label: bigbench_operators dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_operators.jsonl - num_fewshot: [10] - icl_task_type: 
language_modeling -- - label: bigbench_repeat_copy_logic - dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_repeat_copy_logic.jsonl - num_fewshot: [10] + num_fewshot: [3] icl_task_type: language_modeling - label: simple_arithmetic_nospaces dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_nospaces.jsonl - num_fewshot: [10] + num_fewshot: [5] icl_task_type: language_modeling - label: simple_arithmetic_withspaces dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_withspaces.jsonl - num_fewshot: [10] + num_fewshot: [5] icl_task_type: language_modeling - label: pubmed_qa_labeled - dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl # ADD YOUR OWN DATASET URI + dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl num_fewshot: [10] icl_task_type: language_modeling - label: squad - dataset_uri: eval/local_data/reading_comprehension/squad.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [10] + dataset_uri: eval/local_data/reading_comprehension/squad.jsonl + num_fewshot: [3] icl_task_type: language_modeling - label: coqa - dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl # ADD YOUR OWN DATASET URI + dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl num_fewshot: [0] icl_task_type: language_modeling diff --git a/scripts/eval/yamls/openai_eval.yaml b/scripts/eval/yamls/openai_eval.yaml index e1afe78015..9f4da1435f 100644 --- a/scripts/eval/yamls/openai_eval.yaml +++ b/scripts/eval/yamls/openai_eval.yaml @@ -3,32 +3,22 @@ max_seq_len: 1024 device_eval_batch_size: 4 models: - - model_name: openai/davinci - model: - name: openai_causal_lm - version: davinci - tokenizer: - name: openai - kwargs: - name: davinci -- - model_name: openai/gpt-4 + model_name: openai/gpt-3.5-turbo model: name: openai_chat - version: gpt-4 + version: gpt-3.5-turbo tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-4 + model_name: gpt-3.5-turbo - - model_name: 
openai/gpt-3.5-turbo + model_name: openai/davinci model: - name: openai_chat - version: gpt-3.5-turbo + name: openai_causal_lm + version: davinci tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-3.5-turbo + model_name: davinci -icl_tasks: 'eval/yamls/lm_tasks.yaml' -eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml' +icl_tasks: 'eval/yamls/lm_tasks_v0.2.yaml' diff --git a/scripts/train/train.py b/scripts/train/train.py index db66821fe3..ef7a3b91db 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -35,6 +35,8 @@ process_init_device, update_batch_size_info) +log = logging.getLogger(__name__) + def validate_config(cfg: DictConfig): """Validates compatible model and dataloader selection.""" @@ -139,17 +141,17 @@ def build_composer_peft_model( + f'Error encountered: {e}') # 1) loads a hf model, 2) adds peft modules, 3) wraps it in a ComposerHFCausalLM. - print('Building Lora config...') + log.info('Building Lora config...') lora_cfg = LoraConfig(**lora_args) - print('Building model from HuggingFace checkpoint...') + log.info('Building model from HuggingFace checkpoint...') model = MPTForCausalLM.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) - print('Model built!') + log.info('Model built!') - print('Adding Lora modules...') + log.info('Adding Lora modules...') model = get_peft_model(model, lora_cfg) - print('Lora modules added!') + log.info('Lora modules added!') model = ComposerHFCausalLM(model, tokenizer) @@ -164,7 +166,7 @@ def print_trainable_parameters(model: torch.nn.Module) -> None: all_param += param.numel() if param.requires_grad: trainable_params += param.numel() - print( + log.info( f'trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}' ) @@ -261,9 +263,9 @@ def main(cfg: DictConfig) -> Trainer: must_exist=False, default_value=None) if eval_gauntlet_config is not None: - print( - 'Use of the key `model_gauntlet` is deprecated, please use the 
key `eval_gauntlet`' - ) + warnings.warn( + 'Use of the key `model_gauntlet` is deprecated, please use the key `eval_gauntlet`', + DeprecationWarning) icl_subset_num_batches: Optional[int] = pop_config(cfg, 'icl_subset_num_batches', must_exist=False, @@ -399,7 +401,7 @@ def main(cfg: DictConfig) -> Trainer: autoresume_default = True if cfg.get('autoresume') is None and autoresume_default: - print('As run_name, save_folder, and save_latest_filename are set, \ + log.info('As run_name, save_folder, and save_latest_filename are set, \ changing autoresume default to True...') autoresume: bool = pop_config(cfg, @@ -518,7 +520,7 @@ def main(cfg: DictConfig) -> Trainer: ] if algorithm_configs else None # Dataloaders - print('Building train loader...') + log.info('Building train loader...') train_loader = build_dataloader( train_loader_config, tokenizer, @@ -529,7 +531,7 @@ def main(cfg: DictConfig) -> Trainer: mosaicml_logger.log_metrics({'data_validated': time.time()}) ## Evaluation - print('Building eval loader...') + log.info('Building eval loader...') eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len # TODO: evaluators should not be built at all if use_async_eval is True # This will be fixed when eval_loader support is fully added to AsyncEval @@ -547,7 +549,7 @@ def main(cfg: DictConfig) -> Trainer: callbacks.append(eval_gauntlet_callback) # Build Model - print('Initializing model...') + log.info('Initializing model...') with init_context: if lora_config is not None: # frozen model + trainable lora modules model: ComposerHFCausalLM = build_composer_peft_model( @@ -576,7 +578,7 @@ def main(cfg: DictConfig) -> Trainer: evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) # Build the Trainer - print('Building trainer...') + log.info('Building trainer...') trainer = Trainer( run_name=run_name, seed=seed, @@ -615,7 +617,7 @@ def main(cfg: DictConfig) -> Trainer: compile_config=compile_config, ) - print('Logging config') + log.info('Logging 
config') log_config(logged_cfg) torch.cuda.empty_cache() gc.collect() @@ -624,10 +626,10 @@ def main(cfg: DictConfig) -> Trainer: if eval_first and trainer.state.timestamp.batch.value == 0: trainer.eval() - print('Starting training...') + log.info('Starting training...') trainer.fit() - print('Done.') + log.info('Done.') return trainer diff --git a/setup.py b/setup.py index cbc521c6c9..923705699c 100644 --- a/setup.py +++ b/setup.py @@ -48,11 +48,11 @@ install_requires = [ 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.1,<0.18', - 'accelerate>=0.20,<0.21', # for HF inference `device_map` - 'transformers>=4.34.1,<4.35', + 'accelerate>=0.25,<0.26', # for HF inference `device_map` + 'transformers>=4.36,<4.37', 'mosaicml-streaming>=0.7.1,<0.8', 'torch>=2.1,<2.1.1', - 'datasets>=2.14.5,<2.15', + 'datasets==2.15.0', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.1.97', 'einops==0.5.0', @@ -115,7 +115,7 @@ ] extra_deps['openai'] = [ - 'openai==0.27.8', + 'openai==1.3.8', 'tiktoken==0.4.0', ] extra_deps['all-cpu'] = set( diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 94a2d66c6e..28fb9219f8 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -258,7 +258,7 @@ def test_callback_inits(): @pytest.mark.parametrize( 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', [('3ba', '2ba', '4ba', 2, 2), ('1dur', '2ba', '1ep', 1, 2)]) -@patch('os.cpu_count', MagicMock(return_value=None)) +@patch('os.cpu_count', MagicMock(return_value=1)) def test_huggingface_conversion_callback_interval( tmp_path: pathlib.Path, log_to_mlflow: bool, hf_save_interval: str, save_interval: str, max_duration: str, expected_hf_checkpoints: int, @@ -381,7 +381,7 @@ def test_huggingface_conversion_callback_interval( @pytest.mark.parametrize( 
'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', [('1ba', '1ba', '1ba', 1, 1)]) -@patch('os.cpu_count', MagicMock(return_value=None)) +@patch('os.cpu_count', MagicMock(return_value=1)) def test_huggingface_conversion_callback( model: str, tmp_path: pathlib.Path, diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 728376229b..2cf4c51a72 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -630,12 +630,12 @@ def test_token_counting_func(pad_token_id: int, batch_size: int, decoder_batch_strings.append(' '.join(['hello'] * sample_length)) decoder_expected_token_count += sample_length expected_token_count += sample_length - batch_tokenized['decoder_input_ids'] = gptt( + batch_tokenized['decoder_attention_mask'] = gptt( decoder_batch_strings, padding=True, - return_tensors='pt')['input_ids'] + return_tensors='pt')['attention_mask'] token_counting_func = get_tokens_per_batch_func( - pad_token_id, decoder_only=not add_decoder_input_ids) + decoder_only=not add_decoder_input_ids) actual_token_count = token_counting_func(batch_tokenized) @@ -654,7 +654,7 @@ def test_token_counting_func_dataloader_setting( model_max_length: int, padding_side: str, monkeypatch: pytest.MonkeyPatch): gptt = transformers.AutoTokenizer.from_pretrained('gpt2') - gptt.pad_token_id = pad_token_id + gptt.pad_token_id = pad_token_id if pad_token_id is not None else gptt.eos_token_id gptt.model_max_length = model_max_length gptt.padding_side = padding_side @@ -662,19 +662,25 @@ def test_token_counting_func_dataloader_setting( expected_token_count = 0 for _ in range(batch_size): sample_length = random.randint( - 1, - model_max_length) if pad_token_id is not None else model_max_length + 1, model_max_length // + 4) if pad_token_id is not None else model_max_length // 4 batch_strings.append(' '.join(['hello'] * sample_length)) expected_token_count += sample_length - batch_tokenized = 
gptt(batch_strings, - padding=True if pad_token_id is not None else False, - return_tensors='pt') + batch_tokenized = [ + gptt(b, padding=True if pad_token_id is not None else False) + for b in batch_strings + ] if dataloader_type == 'denoising': - batch_tokenized['decoder_input_ids'] = batch_tokenized[ - 'input_ids'].clone() + expected_token_count += 2 * batch_size # for the two eos tokens + expected_token_count += 5 * batch_size # for the corruption prefix tokens + + if dataloader_type in {'finetuning-hf', 'finetuning-streaming'}: + for b in batch_tokenized: + b['labels'] = b['input_ids'].copy() expected_token_count *= 2 + expected_token_count += 1 * batch_size # for the eos token common_args = { 'drop_last': False, @@ -735,8 +741,10 @@ def test_token_counting_func_dataloader_setting( }, **common_args }) + ds_mock = MagicMock() + ds_mock.tokenizer = gptt monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', - lambda *args, **kwargs: MagicMock()) + lambda *args, **kwargs: ds_mock) dl = build_text_dataloader(cfg, gptt, batch_size) elif dataloader_type == 'denoising': cfg = DictConfig({ @@ -754,7 +762,7 @@ def test_token_counting_func_dataloader_setting( }, 'mixture_of_denoisers': { 'decoder_only_format': False, - 'span_mean_lengths_and_ratios': [[3, .15], [8, .5]], + 'span_mean_lengths_and_ratios': None, 'sequence_mask_ratios': 0.25, }, **common_args @@ -767,7 +775,8 @@ def test_token_counting_func_dataloader_setting( cfg = om.create(cfg) - actual_token_count = dl.get_num_tokens_in_batch(batch_tokenized) + batch_collated = dl.dataloader.collate_fn(batch_tokenized) # type: ignore + actual_token_count = dl.get_num_tokens_in_batch(batch_collated) assert actual_token_count == expected_token_count diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py index 16dd01347d..9ba053ffe8 100644 --- a/tests/fixtures/data.py +++ b/tests/fixtures/data.py @@ -26,7 +26,7 @@ def tiny_ft_dataset_path(tmp_path: Path, dataset_size: int = 4) -> Path: @fixture 
-@patch('os.cpu_count', MagicMock(return_value=None)) +@patch('os.cpu_count', MagicMock(return_value=1)) def tiny_ft_dataloader(tiny_ft_dataset_path: Path, mpt_tokenizer: PreTrainedTokenizerBase, max_seq_len: int = 128, diff --git a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py index 6e5f91de00..a125203e19 100644 --- a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py +++ b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py @@ -1,6 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import os from typing import Dict from unittest.mock import patch @@ -13,6 +14,12 @@ from llmfoundry.utils.builders import build_icl_evaluators +@pytest.fixture(scope='module') +def openai_api_key_env_var() -> str: + os.environ['OPENAI_API_KEY'] = 'dummy' + return os.environ['OPENAI_API_KEY'] + + def load_icl_config(): return DictConfig({ 'icl_tasks': @@ -34,60 +41,69 @@ def load_icl_config(): }) +class MockTopLogProb: + + def __init__(self, expected_token: str) -> None: + self.top_logprobs = [{expected_token: 0}] + + +class MockLogprob: + + def __init__(self, expected_token: str) -> None: + self.logprobs = MockTopLogProb(expected_token) + + +class MockCompletion: + + def __init__(self, expected_token: str) -> None: + self.choices = [MockLogprob(expected_token)] + + +class MockContent: + + def __init__(self, expected_token: str) -> None: + setattr(self, 'content', expected_token) + + +class MockMessage: + + def __init__(self, expected_token: str) -> None: + setattr(self, 'message', MockContent(expected_token)) + + +class MockChatCompletion: + + def __init__(self, expected_token: str) -> None: + setattr(self, 'choices', [MockMessage(expected_token)]) + + def mock_create(**kwargs: Dict[str, str]): prompt = kwargs['prompt'] if prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was 
interrupted with this one word\nAnswer:': # pyright: ignore[reportUnnecessaryComparison] - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - ' Tre': 0, - }], - }, - }], - } + return MockCompletion(' Tre') + elif prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was interrupted with this one word\nAnswer: Tre': # pyright: ignore[reportUnnecessaryComparison] - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - 'ason': 0, - }], - }, - }], - } + return MockCompletion('ason') + elif prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was interrupted with this one word\nAnswer: Treason': # pyright: ignore[reportUnnecessaryComparison] - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - '!': 0, - }], - }, - }], - } + return MockCompletion('!') + else: # dummy token to make sure the model is incorrect on any other prompt - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - ' ': 0, - }], - }, - }], - } - - -def test_openai_api_eval_wrapper(tmp_path: str): + return MockCompletion(' ') + + +def test_openai_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): _ = pytest.importorskip('openai') - with patch('openai.Completion') as mock: - mock.create = mock_create - model_name = 'davinci' - tokenizer = TiktokenTokenizerWrapper(model_name=model_name, - pad_token='<|endoftext|>') - model = OpenAICausalLMEvalWrapper(model_cfg={'version': model_name}, - tokenizer=tokenizer) + + model_name = 'davinci' + tokenizer = TiktokenTokenizerWrapper(model_name=model_name, + pad_token='<|endoftext|>') + model = OpenAICausalLMEvalWrapper(model_cfg={'version': model_name}, + tokenizer=tokenizer) + with patch.object(model, 'client') as mock: + mock.completions.create = mock_create + task_cfg = load_icl_config() evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, tokenizer, @@ -107,22 +123,18 @@ def test_openai_api_eval_wrapper(tmp_path: str): assert acc == 0.5 -def 
test_chat_api_eval_wrapper(tmp_path: str): +def test_chat_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): _ = pytest.importorskip('openai') - with patch('openai.ChatCompletion') as mock: - mock.create.return_value = { - 'choices': [{ - 'message': { - 'role': 'assistant', - 'content': 'Treason!' - }, - }], - } - model_name = 'gpt-3.5-turbo' - tokenizer = TiktokenTokenizerWrapper(model_name=model_name, - pad_token='<|endoftext|>') - chatmodel = OpenAIChatAPIEvalWrapper(model_cfg={'version': model_name}, - tokenizer=tokenizer) + + model_name = 'gpt-3.5-turbo' + tokenizer = TiktokenTokenizerWrapper(model_name=model_name, + pad_token='<|endoftext|>') + chatmodel = OpenAIChatAPIEvalWrapper(model_cfg={'version': model_name}, + tokenizer=tokenizer) + with patch.object(chatmodel, 'client') as mock: + mock.chat.completions.create.return_value = MockChatCompletion( + 'Treason!') + task_cfg = load_icl_config() evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, tokenizer, diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py index 70c08c4eb1..411aab77a2 100644 --- a/tests/models/layers/test_huggingface_flash.py +++ b/tests/models/layers/test_huggingface_flash.py @@ -159,9 +159,11 @@ def test_attn_patch_integration(patch: str): @pytest.mark.gpu +@pytest.mark.world_size(2) @pytest.mark.parametrize('model_name', ['llama2', 'mistral']) @pytest.mark.parametrize('use_flash_attention_2', [True, False]) -def test_flash2(model_name: str, use_flash_attention_2: bool): +@pytest.mark.parametrize('init_device', ['cpu', 'mixed', 'meta']) +def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str): if model_name == 'llama2': if 'HUGGING_FACE_HUB_TOKEN' not in os.environ: pytest.skip( @@ -177,7 +179,7 @@ def test_flash2(model_name: str, use_flash_attention_2: bool): }, 'use_auth_token': True, 'pretrained': False, - 'init_device': 'cpu', + 'init_device': init_device, } tokenizer_name = 
'meta-llama/Llama-2-7b-hf' @@ -228,21 +230,27 @@ def test_flash2(model_name: str, use_flash_attention_2: bool): model = COMPOSER_MODEL_REGISTRY[model_cfg['name']](model_cfg, tokenizer) # check that it actually used flash attention 2 - assert model.model.config._flash_attn_2_enabled if use_flash_attention_2 else not model.model.config._flash_attn_2_enabled + assert model.model.config._attn_implementation == ( + 'flash_attention_2' if use_flash_attention_2 else 'eager') attention_layer = rgetattr( rgetattr(model, attention_layers_attr)[0], attention_attr) assert isinstance(attention_layer, flash_attn_class) - tokenized_input = tokenizer(['Hello world blah blah', 'Goodbye world'], - return_tensors='pt', - padding=True) - tokenized_input['labels'] = tokenized_input['input_ids'].clone() - - tokenized_input = {k: v.cuda() for k, v in tokenized_input.items()} - model.to('cuda') - - with get_precision_context('amp_bf16'): - # We're just testing that flash attention 2 runs okay - outputs = model(tokenized_input) - loss = outputs.loss - loss.backward() + # Skip attempting to run forward/backward when some devices have meta params + # because we are not instantiating a full Trainer here, which contains the logic + # to move params off of meta device. 
+ if init_device == 'cpu': + tokenized_input = tokenizer( + ['Hello world blah blah', 'Goodbye world'], + return_tensors='pt', + padding=True) + tokenized_input['labels'] = tokenized_input['input_ids'].clone() + + tokenized_input = {k: v.cuda() for k, v in tokenized_input.items()} + model.to('cuda') + + with get_precision_context('amp_bf16'): + # We're just testing that flash attention 2 runs okay + outputs = model(tokenized_input) + loss = outputs.loss + loss.backward() diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 6d48d115fd..3b2fc22ee3 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -351,7 +351,25 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): pytest.param('flash', torch.float16, marks=pytest.mark.gpu), pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)]) @pytest.mark.parametrize('ffn_type', ['mptmlp', 'mptgeglu']) -def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str): +@pytest.mark.parametrize('ffn_act_fn', [ + None, + { + 'name': 'gelu', + 'approximate': 'tanh', + }, + { + 'name': 'silu', + }, + { + 'name': 'relu', + 'inplace': True, + }, + pytest.param({'name': 'relu5'}, + marks=pytest.mark.xfail(reason='invalid choice.', + strict=True)), +]) +def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str, + ffn_act_fn: dict): conf_path = 'scripts/train/yamls/pretrain/testing.yaml' with open(conf_path) as f: test_cfg = om.load(f) @@ -363,6 +381,7 @@ def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str): test_cfg.model.ffn_config['ffn_type'] = ffn_type else: test_cfg.model.setdefault('ffn_config', {'ffn_type': ffn_type}) + test_cfg.model.ffn_config['ffn_act_fn'] = ffn_act_fn test_cfg.model.init_device = 'cuda:0' test_cfg.device = 'cuda:0' @@ -516,12 +535,34 @@ def test_opt_wrapping(): @pytest.mark.parametrize('tie_word_embeddings', [True, False]) @pytest.mark.parametrize('expansion_ratio,ffn_hidden_size', [ (2, 
None), - (1.231, None), + pytest.param(1.231, + None, + marks=pytest.mark.xfail( + reason='d_model * expansion_ratio must be an integer.', + strict=True)), (2, 128), (2, 256), ]) +@pytest.mark.parametrize('ffn_act_fn', [ + None, + { + 'name': 'gelu', + 'approximate': 'tanh', + }, + { + 'name': 'silu', + }, + { + 'name': 'relu', + 'inplace': True, + }, + pytest.param({'name': 'relu5'}, + marks=pytest.mark.xfail(reason='invalid choice.', + strict=True)), +]) def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool, - expansion_ratio: Union[int, float], ffn_hidden_size: int): + expansion_ratio: Union[int, float], ffn_hidden_size: int, + ffn_act_fn: dict): # Test that the config constructs the model as expected. hf_config = MPTConfig( init_device='cpu', @@ -541,11 +582,9 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool, ffn_config={ 'ffn_type': 'mptmlp', 'ffn_hidden_size': ffn_hidden_size, + 'ffn_act_fn': ffn_act_fn, }, ) - if hf_config.d_model * hf_config.expansion_ratio != int( - hf_config.d_model * hf_config.expansion_ratio): - pytest.xfail('d_model * expansion_ratio must be an integer.') mpt = MPTForCausalLM(hf_config) @@ -1901,7 +1940,7 @@ def test_hf_init(tmp_path: pathlib.Path, precision = Precision('amp_bf16') hf_config = MPTConfig( - init_device=init_device, + init_device='cpu', d_model=32, n_heads=4, n_layers=1, diff --git a/tests/test_smoketest.py b/tests/test_smoketest.py new file mode 100644 index 0000000000..a43925e506 --- /dev/null +++ b/tests/test_smoketest.py @@ -0,0 +1,16 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry import callbacks, data, models, optim, tokenizers, utils + + +# This very simple test is just to use the above imports, which check and make sure we can import all the top-level +# modules from foundry. This is mainly useful for checking that we have correctly conditionally imported all optional +# dependencies. 
+def test_smoketest(): + assert callbacks + assert data + assert models + assert optim + assert tokenizers + assert utils