diff --git a/.ci/FILE_HEADER b/.ci/FILE_HEADER index 22198520fd..e6d99f5d6f 100644 --- a/.ci/FILE_HEADER +++ b/.ci/FILE_HEADER @@ -1,2 +1,2 @@ -Copyright 2022 MosaicML LLM Foundry authors +Copyright 2024 MosaicML LLM Foundry authors SPDX-License-Identifier: Apache-2.0 diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index bb538dbe9b..bbc5e422d3 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -7,7 +7,7 @@ on: branches: - main paths: - - ./Dockerfile + - Dockerfile - .github/workflows/docker.yaml workflow_dispatch: {} jobs: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4c8cc699c..4a5bf1f6bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,7 +57,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.3.1 + rev: v1.5.4 hooks: - id: insert-license args: @@ -65,6 +65,7 @@ repos: - .ci/FILE_HEADER - --comment-style - '#' + - --allow-past-years types: [python] - repo: https://github.com/PyCQA/docformatter rev: v1.5.0 diff --git a/llmfoundry/callbacks/async_eval_callback.py b/llmfoundry/callbacks/async_eval_callback.py index 4227448d87..6cd57d440c 100644 --- a/llmfoundry/callbacks/async_eval_callback.py +++ b/llmfoundry/callbacks/async_eval_callback.py @@ -19,7 +19,7 @@ from composer.utils import dist from composer.utils.misc import create_interval_scheduler -from mcli import ComputeConfig, Run, RunConfig, create_run, get_run +from mcli import Run, RunConfig, create_run, get_run log = logging.getLogger(__name__) @@ -33,7 +33,9 @@ OPTIONAL_PARAMS_FOR_EVAL = { 'dist_timeout', 'eval_gauntlet', + 'eval_loader', 'fsdp_config', + 'eval_subset_num_batches', 'icl_subset_num_batches', 'loggers', 'precision', @@ -175,50 +177,84 @@ def validate_interval(interval: Union[str, int, Time], return async_interval +def validate_eval_run_config( + eval_run_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: + + if not eval_run_config: + return {} + + run_config = eval_run_config.copy() + + supported_keys = {'image', 'command', 'compute', 'scheduling'} + found_unsupported = set() + for key in run_config: + if key not in supported_keys: + found_unsupported.add(key) + + if found_unsupported: + raise ValueError( + f'Unsupported eval run config keys found: {", ".join(found_unsupported)}' + + f'. Supported keys: {supported_keys}') + + return run_config + + class AsyncEval(Callback): """Run the eval loop asynchronously as part of a MosaicML platform run. This callback is currently experimental. The API may change in the future. Args: - training_config: Dict[str, Any]: The config from the training run + training_params: Dict[str, Any]: The parameter config from the training run interval: Union[str, int, Time]: The interval describing how often eval runs should be launched. If an integer, it will be assumed to be in :attr:`.TimeUnit.EPOCH`. Otherwise, the unit must be either :attr:`.TimeUnit.EPOCH`, :attr:`.TimeUnit.BATCH`, :attr:`.TimeUnit.TOKEN`, or :attr:`.TimeUnit.SAMPLE`. - compute: Optional[Union[ComputeConfig, Dict[str, Any]]]: The compute configuration to - use for the eval run. If not provided, the same cluster as the current run and a - single, full GPU node will be used. + eval_run_config: Optional[Dict[str, Any]]: A subset of mcli run config values to use + for the eval run. If not specified, any fields from run config will be created + dynamically from the training run config and the interval. The following fields + are supported: + - ``image``: Image of the eval run. Default: same as training run + - ``command``: Command to run for the eval run. Default: calls + `composer scripts/eval/eval.py $PARAMETERS`. If custom setup is needed, + the command should include calling the eval script with $PARAMETERS + - ``compute``: Compute to use for the eval run. Default: same cluster as + the training run and a single node (8 GPUs) + - ``scheduling``: Scheduling to use for the eval run. Default: same as training run + + All fields are optional, but if specified, must be valid for a mcli run config. We + provide this optional config to give you the most flexibility in customizing the eval + run, but it is recommended to use the default values unless you have a specific use case """ def __init__( self, - training_config: Dict[str, Any], + training_params: Dict[str, Any], interval: Union[str, int, Time], - compute: Optional[Union[ComputeConfig, Dict[str, Any]]] = None, + eval_run_config: Optional[Dict[str, Any]] = None, ): for required in ('save_interval', 'save_folder'): - if required not in training_config: + if required not in training_params: raise ValueError(f'{required} required for async eval') - self.checkpoint_save_folder = training_config['save_folder'] - self.training_config = training_config + self.checkpoint_save_folder = training_params['save_folder'] + self.training_params = training_params + self.eval_run_config = validate_eval_run_config(eval_run_config) self.interval = validate_interval(interval, - self.training_config['save_interval']) + self.training_params['save_interval']) self.check_interval = create_interval_scheduler( interval, # There is a custom close to ensure that the final checkpoint # (which is the most important) is evaled after it is written include_end_of_training=False, ) - self.compute = compute self.last_checkpoint: Optional[str] = None # Run these during init to fail fast in any of the error cases self.current_run = self._get_current_run() get_eval_parameters( - parameters=training_config, + parameters=training_params, checkpoint='test', training_run_name=self.current_run.name, ) @@ -259,7 +295,7 @@ def close(self, state: State, logger: Logger) -> None: if dist.get_global_rank() != 0: return - save_latest_filename = self.training_config.get('save_latest_filename', + save_latest_filename = self.training_params.get('save_latest_filename', None) if not save_latest_filename: @@ -297,7 +333,7 @@ def launch_run(self, checkpoint: str, current_interval: Time) -> Run: run_name = get_run_name(self.current_run.name, str(current_interval)) params = get_eval_parameters( - parameters=self.training_config, + parameters=self.training_params, checkpoint=checkpoint, training_run_name=self.current_run.name, ) @@ -347,12 +383,16 @@ def launch_run(self, checkpoint: str, current_interval: Time) -> Run: # TODO: This just runs an eval run, but we also want to attach the # deployment, which would require a hf conversion and parametrizing the # dependent_deployment in the run config - command = f'cd {installation_path}/scripts \n composer eval/eval.py $PARAMETERS' + default_command = f'cd {installation_path}/scripts \n composer eval/eval.py $PARAMETERS' run_config = RunConfig( name=run_name, - image=self.current_run.image, - compute=self.compute or default_compute, - command=command, + image=self.eval_run_config.get('image', self.current_run.image), + command=self.eval_run_config.get('command', default_command), + compute=self.eval_run_config.get('compute', default_compute), + scheduling=self.eval_run_config.get( + 'scheduling', + self.current_run.submitted_config.scheduling, + ), integrations=integrations, env_variables=cfg.env_variables, metadata=cfg.metadata, diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index b50d81d09e..904f2e208f 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -6,6 +6,7 @@ import logging import math import os +import re import tempfile from pathlib import Path from typing import Optional, Sequence, Union @@ -27,6 +28,23 @@ log = logging.getLogger(__name__) +_LICENSE_FILE_PATTERN = re.compile(r'license(\.[a-z]+|$)', re.IGNORECASE) + + +def _maybe_get_license_filename(local_dir: str) -> Optional[str]: + """Returns the name of the license file if it exists in the local_dir. + + Note: This is intended to be consistent with the code in MLflow. + https://github.com/mlflow/mlflow/blob/5d13d6ec620a02de9a5e31201bf1becdb9722ea5/mlflow/transformers/__init__.py#L1152 + + If the license file does not exist, returns None. + """ + try: + return next(file for file in os.listdir(local_dir) + if _LICENSE_FILE_PATTERN.search(file)) + except StopIteration: + return None + class HuggingFaceCheckpointer(Callback): """Save a huggingface formatted checkpoint during training. @@ -279,6 +297,15 @@ def _save_checkpoint(self, state: State, logger: Logger): path=local_save_path, **self.mlflow_logging_config, ) + + license_filename = _maybe_get_license_filename( + local_save_path) + if license_filename is not None: + mlflow_logger._mlflow_client.log_artifact( + mlflow_logger._run_id, + os.path.join(local_save_path, license_filename), + ) + mlflow_logger.register_model( model_uri=local_save_path, name=self.mlflow_registered_model_name, diff --git a/llmfoundry/callbacks/scheduled_gc_callback.py b/llmfoundry/callbacks/scheduled_gc_callback.py index 6bd085e68f..216fa2adb4 100644 --- a/llmfoundry/callbacks/scheduled_gc_callback.py +++ b/llmfoundry/callbacks/scheduled_gc_callback.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import gc +from typing import Optional import torch from composer.core import Callback, State @@ -19,16 +20,19 @@ class ScheduledGarbageCollector(Callback): """Disable automatic garbage collection and collect garbage at interval. Args: - batch_interval (int): Number of batches between checkpoints call to gc.collect() + batch_interval (int): Number of batches between calls to gc.collect() + gen_1_batch_interval(int, optional): Number of batches between calls to gc.collect(1) eval_keep_disabled (bool): keep gc disabled during eval (default: False) """ def __init__( self, batch_interval: int, + gen_1_batch_interval: Optional[int] = None, eval_keep_disabled: bool = False, ): self.batch_interval = batch_interval + self.gen_1_batch_interval = gen_1_batch_interval self.eval_keep_disabled = eval_keep_disabled self.gc_init_state = None @@ -56,6 +60,9 @@ def fit_end(self, state: State, logger: Logger) -> None: def before_dataloader(self, state: State, logger: Logger) -> None: del logger # unused + if self.gen_1_batch_interval is not None and state.timestamp.batch.value % self.gen_1_batch_interval == 0: + gc.collect(1) + if state.timestamp.batch.value % self.batch_interval == 0: gc_cuda() diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e61d138c41..4846e35840 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -36,7 +36,8 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: import os import warnings from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union +from typing import (Any, Callable, Dict, List, Literal, Optional, Tuple, Union, + cast) import datasets as hf_datasets import huggingface_hub as hf_hub @@ -57,6 +58,35 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: '.downloaded_finetuning')) SUPPORTED_EXTENSIONS = ['.csv', '.jsonl', '.parquet'] +PromptResponseDict = Dict[str, str] +ChatFormattedDict = Dict[str, List[Dict[str, str]]] +Example = Union[PromptResponseDict, ChatFormattedDict] +ExampleType = Literal['prompt_response', 'chat'] +TokenizedExample = Dict[str, List[int]] + + +def _get_example_type(example: Example) -> ExampleType: + """Determines the type of the input example. + + Args: + example (Example): The input example, which can be a multi-way chat formatted conversation or an instruction-response pair. + + Returns: + ExampleType: The type of the input example, which can be either 'chat' for multi-way chat formatted conversation or 'prompt_response' for instruction-response pair. + + Raises: + KeyError: If the example type is unknown. + """ + if 'messages' in example: + return 'chat' + elif any([ + pr in example + for pr in _ALLOWED_PROMPT_KEYS.union(_ALLOWED_RESPONSE_KEYS) + ]): + return 'prompt_response' + else: + raise KeyError(f'Unknown conversation type {example=}') + def _is_empty_or_nonexistent(dirpath: str) -> bool: """Check if a directory is empty or non-existent. @@ -70,9 +100,70 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool: return not os.path.isdir(dirpath) or len(os.listdir(dirpath)) == 0 -def _tokenize_formatted_example( - example: Dict[str, Any], - tokenizer: PreTrainedTokenizerBase) -> Dict[str, List[int]]: +def _slice_chat_formatted_example( + example: ChatFormattedDict, + tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]: + """Slices the chat example into a formatted prompt and response. + + Args: + example (ChatFormattedDict): The chat example containing the messages. + tokenizer (PreTrainedTokenizerBase): The tokenizer to apply the chat template. + + Returns: + Tuple[str, str]: The prompt and response as separate strings. + + Raises: + ValueError: If the chat example has less than two messages or if the last message is not from the assistant. + KeyError: If a message does not have a role or content. + """ + messages = example['messages'] + + if len(messages) < 2: + raise ValueError( + f'chat example must have at least two messages. {messages=}') + last_message = messages[-1] + if last_message['role'] != 'assistant': + raise ValueError( + f'last message must be from assistant. {last_message=}') + for message in messages: + if 'role' not in message or 'content' not in message: + raise KeyError(f'message must have role and content. {message=}') + + full_conversation = tokenizer.apply_chat_template(messages, tokenize=False) + prompt = tokenizer.apply_chat_template(messages[:-1], + tokenize=False, + add_generation_prompt=True) + if prompt != full_conversation[:len(prompt)]: + raise ValueError( + f'prompt must be the first part of the full conversation. {prompt=}, {full_conversation=}' + ) + response = full_conversation[len(prompt):] + if len(response) == 0: + raise ValueError( + f'chat example must have at least one assistant message. {messages=}' + ) + return prompt, response + + +def _tokenize_chat_formatted_example( + example: ChatFormattedDict, + tokenizer: PreTrainedTokenizerBase) -> TokenizedExample: + """Tokenizes a chat-formatted example using the provided tokenizer. + + Args: + example (ChatFormattedDict): The chat-formatted example to tokenize. + tokenizer (PreTrainedTokenizerBase): The tokenizer to use for tokenization. + + Returns: + TokenizedExample: The tokenized example. + """ + prompt, response = _slice_chat_formatted_example(example, tokenizer) + return tokenizer(text=prompt, text_target=response) + + +def _tokenize_prompt_response_formatted_example( + example: PromptResponseDict, + tokenizer: PreTrainedTokenizerBase) -> TokenizedExample: """Tokenize a formatted example and validate expected keys.""" example_keys = set(example.keys()) prompt_keys = example_keys.intersection(_ALLOWED_PROMPT_KEYS) @@ -108,6 +199,35 @@ def _tokenize_formatted_example( return tokenizer(text=prompt, text_target=response) +def _tokenize_formatted_example( + example: Example, + tokenizer: PreTrainedTokenizerBase) -> TokenizedExample: + """Tokenizes a formatted example using the provided tokenizer. + + Args: + example (Example): The input example to be tokenized. + tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenization. + + Returns: + TokenizedExample: The tokenized example. + + Raises: + ValueError: If the example format is unknown. + """ + example_format = _get_example_type(example) + + if example_format == 'chat': + chat_example = cast(ChatFormattedDict, example) + return _tokenize_chat_formatted_example(chat_example, tokenizer) + elif example_format == 'prompt_response': + prompt_response_example: PromptResponseDict = cast( + PromptResponseDict, example) + return _tokenize_prompt_response_formatted_example( + prompt_response_example, tokenizer) + else: + raise ValueError(f'Unknown conversation type {example_format=}') + + class StreamingFinetuningDataset(StreamingDataset): """Finetuning dataset with flexible tokenization using StreamingDataset. diff --git a/llmfoundry/models/inference_api_wrapper/__init__.py b/llmfoundry/models/inference_api_wrapper/__init__.py index 496abf2aa6..9bb2ece2b2 100644 --- a/llmfoundry/models/inference_api_wrapper/__init__.py +++ b/llmfoundry/models/inference_api_wrapper/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from llmfoundry.models.inference_api_wrapper.fmapi import ( + FMAPICasualLMEvalWrapper, FMAPIChatAPIEvalWrapper) from llmfoundry.models.inference_api_wrapper.interface import \ InferenceAPIEvalWrapper from llmfoundry.models.inference_api_wrapper.openai_causal_lm import ( @@ -10,4 +12,6 @@ 'OpenAICausalLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', 'InferenceAPIEvalWrapper', + 'FMAPICasualLMEvalWrapper', + 'FMAPIChatAPIEvalWrapper', ] diff --git a/llmfoundry/models/inference_api_wrapper/fmapi.py b/llmfoundry/models/inference_api_wrapper/fmapi.py new file mode 100644 index 0000000000..867b3c272e --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/fmapi.py @@ -0,0 +1,72 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import time +from typing import Dict + +import requests +from transformers import AutoTokenizer + +from llmfoundry.models.inference_api_wrapper.openai_causal_lm import ( + OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper, OpenAIEvalInterface) + +__all__ = [ + 'FMAPICasualLMEvalWrapper', + 'FMAPIChatAPIEvalWrapper', +] + +log = logging.getLogger(__name__) + + +def block_until_ready(base_url: str): + """Block until the endpoint is ready.""" + sleep_s = 5 + timout_s = 5 * 60 # At max, wait 5 minutes + + ping_url = f'{base_url}/ping' + + waited_s = 0 + while True: + try: + requests.get(ping_url) + log.info(f'Endpoint {ping_url} is ready') + break + except requests.exceptions.ConnectionError: + log.debug( + f'Endpoint {ping_url} not ready yet. Sleeping {sleep_s} seconds' + ) + time.sleep(sleep_s) + waited_s += sleep_s + + if waited_s >= timout_s: + raise TimeoutError( + f'Endpoint {ping_url} did not become read after {waited_s:,} seconds, exiting' + ) + + +class FMAPIEvalInterface(OpenAIEvalInterface): + + def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): + is_local = model_cfg.pop('local', False) + if is_local: + base_url = os.environ.get('MOSAICML_MODEL_ENDPOINT', + 'http://0.0.0.0:8080/v2') + model_cfg['base_url'] = base_url + block_until_ready(base_url) + + if 'base_url' not in model_cfg: + raise ValueError( + 'Must specify base_url or use local=True in model_cfg for FMAPIsEvalWrapper' + ) + + super().__init__(model_cfg, tokenizer) + + +class FMAPICasualLMEvalWrapper(FMAPIEvalInterface, OpenAICausalLMEvalWrapper): + """Databricks Foundational Model API wrapper for causal LM models.""" + + +class FMAPIChatAPIEvalWrapper(FMAPIEvalInterface, OpenAIChatAPIEvalWrapper): + """Databricks Foundational Model API wrapper for chat models.""" diff --git a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py index 39de2ba59c..587dd179bd 100644 --- a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py +++ b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py @@ -36,9 +36,6 @@ class OpenAIEvalInterface(InferenceAPIEvalWrapper): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) - assert os.getenv( - 'OPENAI_API_KEY' - ) is not None, 'No OpenAI API Key found. Ensure it is saved as an environmental variable called OPENAI_API_KEY.' try: import openai except ImportError as e: @@ -46,8 +43,28 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: extra_deps_group='openai', conda_package='openai', conda_channel='conda-forge') from e - self.client = openai.OpenAI() - self.model_name = model_cfg['version'] + + api_key = os.environ.get('OPENAI_API_KEY') + base_url = model_cfg.get('base_url') + if base_url is None: + # Using OpenAI default, where the API key is required + if api_key is None: + raise ValueError( + 'No OpenAI API Key found. Ensure it is saved as an environmental variable called OPENAI_API_KEY.' + ) + + else: + # Using a custom base URL, where the API key may not be required + log.info( + f'Making request to custom base URL: {base_url}{"" if api_key is not None else " (no API key set)"}' + ) + api_key = 'placeholder' # This cannot be None + + self.client = openai.OpenAI(base_url=base_url, api_key=api_key) + if 'version' in model_cfg: + self.model_name = model_cfg['version'] + else: + self.model_name = model_cfg['name'] def generate_completion(self, prompt: str, num_tokens: int): raise NotImplementedError() diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index fecd79553f..89f861c3f0 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -516,6 +516,7 @@ def __init__( attn_impl: str = 'triton', clip_qkv: Optional[float] = None, qk_ln: bool = False, + qk_gn: bool = False, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -529,6 +530,7 @@ def __init__( self.attn_impl = attn_impl self.clip_qkv = clip_qkv self.qk_ln = qk_ln + self.qk_gn = qk_gn self.d_model = d_model self.n_heads = n_heads @@ -549,6 +551,8 @@ def __init__( raise ValueError( 'Each Q head should get the same number of KV heads, so n_heads must be divisible by kv_n_heads.' ) + if qk_ln and qk_gn: + raise ValueError('Only one of qk_ln and qk_gn can be set to True.') self.softmax_scale = softmax_scale if self.softmax_scale is None: @@ -572,11 +576,13 @@ def __init__( ] self.Wqkv._fused = (0, fuse_splits) - if self.qk_ln: + if self.qk_ln or self.qk_gn: norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] - self.q_ln = norm_class(self.d_model, device=device) - self.k_ln = norm_class(self.kv_n_heads * self.head_dim, - device=device) + norm_size = self.head_dim if qk_gn else d_model + self.q_ln = norm_class(norm_size, device=device) + if qk_ln: + norm_size = self.head_dim * kv_n_heads + self.k_ln = norm_class(norm_size, device=device) if self.attn_impl == 'flash': self.attn_fn = flash_attn_fn @@ -623,11 +629,16 @@ def forward( key_padding_mask = attention_mask - if self.qk_ln: + if self.qk_ln or self.qk_gn: # Applying layernorm to qk + q_shape, k_shape = query.shape, key.shape + if self.qk_gn: + b, s = query.shape[:2] + query = query.view(b, s, self.n_heads, -1) + key = key.view(b, s, self.kv_n_heads, -1) dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) + query = self.q_ln(query).to(dtype).view(q_shape) + key = self.k_ln(key).to(dtype).view(k_shape) if rotary_emb_w_meta_info is not None: rotary_emb = rotary_emb_w_meta_info['rotary_emb'] @@ -712,6 +723,7 @@ def __init__( attn_impl: str = 'triton', clip_qkv: Optional[float] = None, qk_ln: bool = False, + qk_gn: bool = False, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -727,6 +739,7 @@ def __init__( attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, + qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, @@ -751,6 +764,7 @@ def __init__( attn_impl: str = 'triton', clip_qkv: Optional[float] = None, qk_ln: bool = False, + qk_gn: bool = False, softmax_scale: Optional[float] = None, attn_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', @@ -766,6 +780,7 @@ def __init__( attn_impl=attn_impl, clip_qkv=clip_qkv, qk_ln=qk_ln, + qk_gn=qk_gn, softmax_scale=softmax_scale, attn_pdrop=attn_pdrop, norm_type=norm_type, diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index 036a4e7cd2..4ac43a8bac 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -22,6 +22,7 @@ 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, + 'qk_gn': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index be09a69835..ff9942f5f6 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -3,7 +3,9 @@ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM, ComposerHFT5) -from llmfoundry.models.inference_api_wrapper import (OpenAICausalLMEvalWrapper, +from llmfoundry.models.inference_api_wrapper import (FMAPICasualLMEvalWrapper, + FMAPIChatAPIEvalWrapper, + OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper) from llmfoundry.models.mpt import ComposerMPTCausalLM @@ -13,5 +15,7 @@ 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, 'openai_causal_lm': OpenAICausalLMEvalWrapper, - 'openai_chat': OpenAIChatAPIEvalWrapper + 'fmapi_causal_lm': FMAPICasualLMEvalWrapper, + 'openai_chat': OpenAIChatAPIEvalWrapper, + 'fmapi_chat': FMAPIChatAPIEvalWrapper, } diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 5474529277..7911728397 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -82,6 +82,7 @@ def __init__( attn_pdrop (float): The dropout probability for the attention layers. attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. + qk_gn (bool): Whether to apply group normalization to the queries and keys in the attention layer. clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to this value. softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 2632985533..eaaf0da316 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -198,7 +198,7 @@ def default_chat_template(self): '{% else %}' "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}" '{% endif %}' - '{% if (add_generation_prompt == true) %}' + '{% if (add_generation_prompt == true and loop.last) %}' "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}" "{% elif (message['role'] == 'assistant') %}" '{{ eos_token }}' diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 75438b895e..42f817b386 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -213,27 +213,22 @@ def build_callback( raise ValueError( 'Parameters config is required for async eval callback') - return AsyncEval(**kwargs, training_config=config) + return AsyncEval(**kwargs, training_params=config) else: raise ValueError(f'Not sure how to build callback: {name}') def build_logger(name: str, kwargs: Dict[str, Any]) -> LoggerDestination: - kwargs_dict = { - k: v if isinstance(v, str) else om.to_container(v, resolve=True) - for k, v in kwargs.items() - } - if name == 'wandb': - return WandBLogger(**kwargs_dict) + return WandBLogger(**kwargs) elif name == 'tensorboard': - return TensorboardLogger(**kwargs_dict) + return TensorboardLogger(**kwargs) elif name == 'in_memory_logger': - return InMemoryLogger(**kwargs_dict) + return InMemoryLogger(**kwargs) elif name == 'mlflow': - return MLFlowLogger(**kwargs_dict) + return MLFlowLogger(**kwargs) elif name == 'inmemory': - return InMemoryLogger(**kwargs_dict) + return InMemoryLogger(**kwargs) else: raise ValueError(f'Not sure how to build logger: {name}') diff --git a/llmfoundry/utils/model_download_utils.py b/llmfoundry/utils/model_download_utils.py index 5d8a413d91..bade3e1d7d 100644 --- a/llmfoundry/utils/model_download_utils.py +++ b/llmfoundry/utils/model_download_utils.py @@ -30,6 +30,12 @@ ] PYTORCH_WEIGHTS_PATTERN = 'pytorch_model*.bin*' SAFE_WEIGHTS_PATTERN = 'model*.safetensors*' +TOKENIZER_FILES = [ + 'special_tokens_map.json', + 'tokenizer.json', + 'tokenizer.model', + 'tokenizer_config.json', +] ORAS_PASSWD_PLACEHOLDER = '' ORAS_CLI = 'oras' @@ -45,6 +51,7 @@ def download_from_hf_hub( model: str, save_dir: str, prefer_safetensors: bool = True, + tokenizer_only: bool = False, token: Optional[str] = None, ): """Downloads model files from a Hugging Face Hub model repo. @@ -57,6 +64,7 @@ def download_from_hf_hub( save_dir (str, optional): The local path to the directory where the model files will be downloaded. prefer_safetensors (bool): Whether to prefer Safetensors weights over PyTorch weights if both are available. Defaults to True. + tokenizer_only (bool): If true, only download tokenizer files. token (str, optional): The HuggingFace API token. If not provided, the token will be read from the `HUGGING_FACE_HUB_TOKEN` environment variable. @@ -95,10 +103,14 @@ def download_from_hf_hub( ' Please make sure the repo contains either safetensors or pytorch weights.' ) + allow_patterns = TOKENIZER_FILES if tokenizer_only else None + download_start = time.time() hf_hub.snapshot_download(model, local_dir=save_dir, + local_dir_use_symlinks=False, ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, token=token) download_duration = time.time() - download_start log.info( @@ -221,16 +233,18 @@ def download_from_oras(model: str, config_file: str, credentials_dir: str, save_dir: str, + tokenizer_only: bool = False, concurrency: int = 10): """Download from an OCI-compliant registry using oras. Args: - model: The name of the model to download. - config_file: Path to a YAML config file that maps model names to registry paths. - credentials_dir: Path to a directory containing credentials for the registry. It is expected to contain three + model (str): The name of the model to download. + config_file (str): Path to a YAML config file that maps model and tokenizer names to registry paths. + credentials_dir (str): Path to a directory containing credentials for the registry. It is expected to contain three files: `username`, `password`, and `registry`, each of which contains the corresponding credential. - save_dir: Path to the directory where files will be downloaded. - concurrency: The number of concurrent downloads to run. + save_dir (str): Path to the directory where files will be downloaded. + tokenizer_only (bool): If true, only download the tokenzier files. + concurrency (int): The number of concurrent downloads to run. """ if shutil.which(ORAS_CLI) is None: raise Exception( @@ -253,7 +267,8 @@ def _read_secrets_file(secret_file_path: str,): with open(config_file, 'r', encoding='utf-8') as f: configs = yaml.safe_load(f.read()) - path = configs['models'][model] + config_type = 'tokenizers' if tokenizer_only else 'models' + path = configs[config_type][model] registry = secrets['registry'] def get_oras_cmd(username: Optional[str] = None, diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 2218e575b2..bfd60b8ee1 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -385,6 +385,10 @@ def convert_text_to_mds( local_output_folder = tempfile.TemporaryDirectory( ).name if is_remote_output else output_folder + if os.path.isdir(output_folder) and len(os.listdir(output_folder)) > 0: + raise FileExistsError( + f'{output_folder=} is not empty. Please remove or empty it.') + if processes > 1: # Download and convert the text files in parallel args = get_task_args(object_names, local_output_folder, input_folder, diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 8dbe91e6d2..d4ba39acfa 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -118,9 +118,11 @@ def evaluate_model( python_log_level: Optional[str], precision: str, eval_gauntlet_df: Optional[pd.DataFrame], + eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], metadata: Optional[Dict[str, str]], logged_config: DictConfig, + should_log_config: bool = True, ): log.info(f'Evaluating model: {model_cfg.model_name}') @@ -215,14 +217,16 @@ def evaluate_model( python_log_level=python_log_level, ) - log.info('Evaluation config:') - log_config(logged_config) + if should_log_config: + log.info('Evaluation config:') + log_config(logged_config) log.info(f'Starting eval for {model_cfg.model_name}...') if torch.cuda.is_available(): torch.cuda.synchronize() a = time.time() - trainer.eval(eval_dataloader=evaluators) + trainer.eval(eval_dataloader=evaluators, + subset_num_batches=eval_subset_num_batches) if torch.cuda.is_available(): torch.cuda.synchronize() b = time.time() @@ -297,15 +301,23 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: 'loggers', must_exist=False, default_value={}) - icl_subset_num_batches: int = pop_config(cfg, - 'icl_subset_num_batches', - must_exist=False, - default_value=None) + eval_subset_num_batches: int = pop_config(cfg, + 'eval_subset_num_batches', + must_exist=False, + default_value=-1) + icl_subset_num_batches: Optional[int] = pop_config(cfg, + 'icl_subset_num_batches', + must_exist=False, + default_value=None) metadata: Optional[Dict[str, str]] = pop_config(cfg, 'metadata', must_exist=False, default_value=None, convert=True) + should_log_config: bool = pop_config(cfg, + 'log_config', + must_exist=False, + default_value=True) # Pop out interpolation variables. pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None) @@ -350,9 +362,11 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: python_log_level=python_log_level, precision=precision, eval_gauntlet_df=eval_gauntlet_df, + eval_subset_num_batches=eval_subset_num_batches, icl_subset_num_batches=icl_subset_num_batches, metadata=metadata, - logged_config=logged_cfg) + logged_config=logged_cfg, + should_log_config=should_log_config) trainers.append(trainer) if eval_gauntlet_callback is not None: diff --git a/scripts/misc/download_model.py b/scripts/misc/download_model.py index 1913267e20..13a63ce55e 100644 --- a/scripts/misc/download_model.py +++ b/scripts/misc/download_model.py @@ -7,10 +7,11 @@ python download_model.py hf --model mosaicml/mpt-7b --save-dir --token Download from ORAS registry: - python download_model.py oras --registry --path mosaicml/mpt-7b --save-dir + python download_model.py oras --model mosaicml/mpt-7b --config-file \ + --credentials-dir --save-dir Download from an HTTP file server: - python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir + python download_model.py http --url https://server.com/models/mosaicml/mpt-7b/ --save-dir Download from an HTTP file server with fallback to Hugging Face Hub: python download_model.py http --host https://server.com --path mosaicml/mpt-7b --save-dir \ @@ -56,6 +57,9 @@ def parse_args() -> argparse.Namespace: base_parser = argparse.ArgumentParser(add_help=False) base_parser.add_argument('--save-dir', type=str, required=True) + base_parser.add_argument('--tokenizer-only', + default=False, + action='store_true') # Add subparser for downloading from Hugging Face Hub. hf_parser = subparsers.add_parser('hf', parents=[base_parser]) @@ -85,6 +89,9 @@ def parse_args() -> argparse.Namespace: download_from = args.download_from if download_from == 'http': + if args.tokenizer_only: + raise ValueError( + 'tokenizer-only is not currently supported for http.') try: download_from_http_fileserver(args.url, args.save_dir, args.ignore_cert) @@ -109,7 +116,12 @@ def parse_args() -> argparse.Namespace: download_from_hf_hub(args.model, save_dir=args.save_dir, token=args.token, + tokenizer_only=args.tokenizer_only, prefer_safetensors=args.prefer_safetensors) elif download_from == 'oras': - download_from_oras(args.model, args.config_file, args.credentials_dir, - args.save_dir, args.concurrency) + download_from_oras(args.model, + args.config_file, + args.credentials_dir, + args.save_dir, + tokenizer_only=args.tokenizer_only, + concurrency=args.concurrency) diff --git a/scripts/train/train.py b/scripts/train/train.py index c3da1f1d3c..7bb5e71394 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -278,11 +278,13 @@ def main(cfg: DictConfig) -> Trainer: logger_configs: Optional[DictConfig] = pop_config(cfg, 'loggers', must_exist=False, - default_value=None) + default_value=None, + convert=True) callback_configs: Optional[DictConfig] = pop_config(cfg, 'callbacks', must_exist=False, - default_value=None) + default_value=None, + convert=True) algorithm_configs: Optional[DictConfig] = pop_config(cfg, 'algorithms', must_exist=False, @@ -391,6 +393,10 @@ def main(cfg: DictConfig) -> Trainer: must_exist=False, default_value=None, convert=True) + should_log_config: bool = pop_config(cfg, + 'log_config', + must_exist=False, + default_value=True) # Enable autoresume from model checkpoints if possible autoresume_default: bool = False @@ -514,8 +520,7 @@ def main(cfg: DictConfig) -> Trainer: for name, callback_cfg in callback_configs.items() ] if callback_configs else [] - use_async_eval = any( - isinstance(callback, AsyncEval) for callback in callbacks) + use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) # Algorithms algorithms = [ @@ -535,22 +540,28 @@ def main(cfg: DictConfig) -> Trainer: mosaicml_logger.log_metrics({'data_validated': time.time()}) ## Evaluation - log.info('Building eval loader...') - eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len - # TODO: evaluators should not be built at all if use_async_eval is True - # This will be fixed when eval_loader support is fully added to AsyncEval - evaluators, _, eval_gauntlet_callback = build_evaluators( - eval_loader_config, - icl_tasks_config if not use_async_eval else None, - eval_gauntlet_config if not use_async_eval else None, - tokenizer=tokenizer, - device_eval_batch_size=device_eval_batch_size, - icl_seq_len=eval_icl_seq_len, - icl_subset_num_batches=icl_subset_num_batches, - ) - - if eval_gauntlet_callback is not None and not use_async_eval: - callbacks.append(eval_gauntlet_callback) + if use_async_eval: + evaluators = [] + if eval_first: + warnings.warn( + 'AsyncEval callback does not support eval_first=True. Ignoring.' + ) + eval_first = False + + else: + log.info('Building eval loader...') + eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len + evaluators, _, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks_config, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=eval_icl_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) # Build Model log.info('Initializing model...') @@ -577,7 +588,7 @@ def main(cfg: DictConfig) -> Trainer: optimizer = build_optimizer(model, optimizer_name, optimizer_config) # Now add the eval metrics - if eval_loader_config is not None: + if eval_loader_config is not None and not use_async_eval: train_metrics = model.get_metrics(is_train=True) evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) @@ -621,8 +632,9 @@ def main(cfg: DictConfig) -> Trainer: compile_config=compile_config, ) - log.info('Logging config') - log_config(logged_cfg) + if should_log_config: + log.info('Logging config') + log_config(logged_cfg) torch.cuda.empty_cache() gc.collect() diff --git a/setup.py b/setup.py index 2c4a05f396..e5bc7e81d2 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,6 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + """MosaicML LLM Foundry package setup.""" import os @@ -47,12 +50,13 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.2,<0.18', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.17.2,<0.18', + 'mlflow>=2.10,<3', 'accelerate>=0.25,<0.26', # for HF inference `device_map` - 'transformers>=4.36,<4.37', + 'transformers>=4.37,<4.38', 'mosaicml-streaming>=0.7.2,<0.8', 'torch>=2.1,<2.1.1', - 'datasets==2.15.0', + 'datasets>=2.16,<2.17', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.1.97', 'einops==0.7.0', diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index cc293a2cdd..3a00a8889f 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -3,6 +3,7 @@ import os import pathlib +import shutil from concurrent.futures import ProcessPoolExecutor from glob import glob from typing import Callable, Iterable, List @@ -55,23 +56,6 @@ def upload_object(self, object_name: str, filename: str): remote_file.write(local_file.read()) -def _call_convert_text_to_mds(processes: int, tokenizer_name: str, - concat_tokens: int) -> None: - convert_text_to_mds( - tokenizer_name=tokenizer_name, - output_folder=f's3://fake-test-output-path', - input_folder=f's3://fake-test-input-path', - concat_tokens=concat_tokens, - eos_text='', - bos_text='', - no_wrap=False, - compression='zstd', - processes=processes, - args_str='Namespace()', - reprocess=False, - ) - - # Mock starmap with no multiprocessing def _mock_map(func: Callable, args: Iterable) -> Iterable: for arg in args: @@ -107,9 +91,22 @@ def test_single_and_multi_process(merge_shard_groups: Mock, maybe_create_object_store_from_uri.return_value = mock_object_store parse_uri.return_value = ('s3', 'fake-test-bucket', str(remote_folder)) - _call_convert_text_to_mds(processes=processes, - tokenizer_name=tokenizer_name, - concat_tokens=concat_tokens) + def call_convert_text_to_mds() -> None: + convert_text_to_mds( + tokenizer_name=tokenizer_name, + output_folder=f's3://fake-test-output-path', + input_folder=f's3://fake-test-input-path', + concat_tokens=concat_tokens, + eos_text='', + bos_text='', + no_wrap=False, + compression='zstd', + processes=processes, + args_str='Namespace()', + reprocess=False, + ) + + call_convert_text_to_mds() # Check call counts assert download_and_convert.call_count == processes # called once per process @@ -131,9 +128,7 @@ def test_single_and_multi_process(merge_shard_groups: Mock, _assert_files_exist(prefix=remote_folder, files=['index.json', DONE_FILENAME] + shards) - _call_convert_text_to_mds(processes=processes, - tokenizer_name=tokenizer_name, - concat_tokens=concat_tokens) + call_convert_text_to_mds() # Check call counts assert download_and_convert.call_count == processes # No changes because we shoudn't reprocess @@ -146,9 +141,7 @@ def test_single_and_multi_process(merge_shard_groups: Mock, mock_object_store = Mock(wraps=object_store) maybe_create_object_store_from_uri.return_value = mock_object_store - _call_convert_text_to_mds(processes=processes, - tokenizer_name=tokenizer_name, - concat_tokens=concat_tokens) + call_convert_text_to_mds() # Check call counts assert download_and_convert.call_count == processes * 2 # called once per process @@ -187,31 +180,42 @@ def test_local_path(tmp_path: pathlib.Path): input_folder = tmp_path / 'input' output_folder = tmp_path / 'output' + def call_convert_text_to_mds(reprocess: bool): + convert_text_to_mds( + tokenizer_name='mosaicml/mpt-7b', + output_folder=str(output_folder), + input_folder=str(input_folder), + concat_tokens=1, + eos_text='', + bos_text='', + no_wrap=False, + compression='zstd', + processes=1, + args_str='Namespace()', + reprocess=reprocess, + ) + # Create input text data os.makedirs(input_folder, exist_ok=True) with open(input_folder / 'test.txt', 'w') as f: f.write('test') # Convert text data to mds - convert_text_to_mds( - tokenizer_name='mosaicml/mpt-7b', - output_folder=str(output_folder), - input_folder=str(input_folder), - concat_tokens=1, - eos_text='', - bos_text='', - no_wrap=False, - compression='zstd', - processes=1, - args_str='Namespace()', - reprocess=False, - ) + call_convert_text_to_mds(reprocess=False) # Make sure all the files exist as expected. assert os.path.exists(output_folder / '.text_to_mds_conversion_done') assert os.path.exists(output_folder / 'index.json') assert os.path.exists(output_folder / 'shard.00000.mds.zstd') + # Test reprocessing. + with pytest.raises(FileExistsError): + call_convert_text_to_mds(reprocess=True) + + shutil.rmtree(output_folder) + + call_convert_text_to_mds(reprocess=True) + def test_is_already_processed(tmp_path: pathlib.Path): tmp_path_str = str(tmp_path) diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index deed181475..e85cdb213c 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -6,7 +6,7 @@ import pathlib import shutil from argparse import Namespace -from typing import Callable, Optional, cast +from typing import Any, Callable, Optional, cast from unittest.mock import ANY, MagicMock, patch import pytest @@ -22,6 +22,7 @@ from llmfoundry import COMPOSER_MODEL_REGISTRY from llmfoundry.callbacks import HuggingFaceCheckpointer +from llmfoundry.callbacks.hf_checkpointer import _maybe_get_license_filename from llmfoundry.data.finetuning import build_finetuning_dataloader from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_optimizer, build_tokenizer @@ -29,6 +30,10 @@ from tests.data_utils import make_tiny_ft_dataset +def _save_model_mock(*args: Any, path: str, **kwargs: Any): + os.makedirs(path, exist_ok=True) + + def check_hf_tokenizer_equivalence(tokenizer1: PreTrainedTokenizerBase, tokenizer2: PreTrainedTokenizerBase): """WARNING: Parameters are updated within the check so don't call check_hf_tokenizer_equivalence on the same @@ -297,7 +302,7 @@ def test_huggingface_conversion_callback_interval( mlflow_logger_mock = MagicMock(spec=MLFlowLogger) mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} - mlflow_logger_mock.save_model = MagicMock() + mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) mlflow_logger_mock.register_model = MagicMock() mlflow_logger_mock.model_registry_prefix = '' mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' @@ -533,7 +538,7 @@ def test_huggingface_conversion_callback( mlflow_logger_mock = MagicMock(spec=MLFlowLogger) mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} - mlflow_logger_mock.save_model = MagicMock() + mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) mlflow_logger_mock.register_model = MagicMock() mlflow_logger_mock.model_registry_prefix = '' mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' @@ -817,3 +822,17 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, assert torch.allclose(p1, p2) delete_transformers_cache() + + +@pytest.mark.parametrize( + 'license_file_name', + ['LICENSE', 'LICENSE.txt', 'license', 'license.md', None]) +def test_license_file_finder(tmp_path: pathlib.Path, + license_file_name: Optional[str]): + if license_file_name is not None: + with open(os.path.join(tmp_path, license_file_name), 'w') as f: + f.write('test') + + found_path = _maybe_get_license_filename(str(tmp_path)) + assert (found_path == license_file_name + ) if license_file_name is not None else (found_path is None) diff --git a/tests/callbacks/test_async_eval_callback.py b/tests/callbacks/test_async_eval_callback.py index b3a1e98f79..92cb738d9c 100644 --- a/tests/callbacks/test_async_eval_callback.py +++ b/tests/callbacks/test_async_eval_callback.py @@ -11,10 +11,10 @@ from llmfoundry.callbacks.async_eval_callback import (AsyncEval, get_eval_parameters, get_run_name, + validate_eval_run_config, validate_interval) from mcli import Run, RunConfig, RunStatus -# here RUN_NAME = 'foo_bar-1234' BASIC_PARAMS = { 'save_interval': '1ba', @@ -191,6 +191,29 @@ def test_validate_interval(): assert validate_interval('2ep', two_epochs) == two_epochs +def test_validate_eval_run_config(): + assert validate_eval_run_config(None) == {} + assert validate_eval_run_config({}) == {} + + with pytest.raises(ValueError): + validate_eval_run_config({'foo': 'bar'}) + + valid_config = { + 'image': 'example_image', + 'command': 'example_command', + 'compute': { + 'gpus': 1, + 'cluster': 'example_cluster', + }, + 'scheduling': { + 'priority': 'high', + 'preemptible': True, + }, + } + res = validate_eval_run_config(valid_config) + assert res == valid_config + + FAKE_RUN = Run( run_uid='123', name=RUN_NAME, @@ -223,12 +246,16 @@ def test_validate_interval(): return_value=FAKE_RUN) def test_async_eval_callback_minimal(mock_create_run: MagicMock, mock_get_run: MagicMock): - callback = AsyncEval(BASIC_PARAMS, - interval='2ba', - compute={ - 'cluster': 'c2z3', - 'nodes': 2, - }) + callback = AsyncEval( + BASIC_PARAMS, + interval='2ba', + eval_run_config={ + 'compute': { + 'cluster': 'c2z3', + 'nodes': 2, + }, + }, + ) assert callback.current_run.name == RUN_NAME assert mock_get_run.call_count == 1 assert mock_get_run.call_args[0][0] == RUN_NAME @@ -310,12 +337,13 @@ def test_async_eval_callback_minimal(mock_create_run: MagicMock, return_value=FAKE_RUN_WITH_INTEGRATIONS) def test_async_eval_callback_integrations(mock_create_run: MagicMock, mock_get_run: MagicMock): - callback = AsyncEval(BASIC_PARAMS, - interval='2ba', - compute={ - 'cluster': 'c2z3', - 'nodes': 2, - }) + callback = AsyncEval( + BASIC_PARAMS, + interval='2ba', + eval_run_config={'compute': { + 'cluster': 'c2z3', + 'nodes': 2, + }}) assert mock_get_run.call_count == 1 callback.launch_run('checkpoint/path', Time(1, TimeUnit.BATCH)) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 44d0442a87..73e8427505 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -23,11 +23,8 @@ from llmfoundry import (build_finetuning_dataloader, build_text_denoising_dataloader) from llmfoundry.data import build_dataloader -from llmfoundry.data.finetuning.tasks import (_ALLOWED_PROMPT_KEYS, - _ALLOWED_RESPONSE_KEYS, - DOWNLOADED_FT_DATASETS_DIRPATH, - SUPPORTED_EXTENSIONS, - _tokenize_formatted_example) +from llmfoundry.data.finetuning.tasks import (DOWNLOADED_FT_DATASETS_DIRPATH, + SUPPORTED_EXTENSIONS) from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper, build_text_dataloader, get_tokens_per_batch_func) @@ -249,10 +246,12 @@ def test_denoising_dataloader(decoder_only_format: bool, pretokenize: bool, break +@pytest.mark.parametrize('use_chat_formatting', [True, False]) @pytest.mark.parametrize('decoder_only_format', [True, False]) @pytest.mark.parametrize('allow_pad_trimming', [True, False]) @pytest.mark.parametrize('packing_ratio', [10.0, None, 'auto']) -def test_finetuning_dataloader(decoder_only_format: bool, +def test_finetuning_dataloader(use_chat_formatting: bool, + decoder_only_format: bool, allow_pad_trimming: bool, packing_ratio: Optional[Union[float, Literal['auto']]]): @@ -265,13 +264,21 @@ def test_finetuning_dataloader(decoder_only_format: bool, cfg = { 'name': 'finetuning', 'dataset': { - 'hf_name': 'HuggingFaceH4/databricks_dolly_15k', - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': decoder_only_format, - 'allow_pad_trimming': allow_pad_trimming, - 'packing_ratio': packing_ratio, - 'shuffle': True, + 'hf_name': + 'iamroot/chat_formatted_examples' if use_chat_formatting else + 'HuggingFaceH4/databricks_dolly_15k', + 'split': + 'train', + 'max_seq_len': + max_seq_len, + 'decoder_only_format': + decoder_only_format, + 'allow_pad_trimming': + allow_pad_trimming, + 'packing_ratio': + packing_ratio, + 'shuffle': + True, }, 'drop_last': False, 'num_workers': 0, @@ -417,39 +424,6 @@ def test_finetuning_dataloader_small_data(dataset_size: int, shutil.rmtree(tiny_dataset_folder_path) -def test_tokenize_example_malformed(): - no_keys = {} - no_prompt_key = {'response': 'response'} - no_response_key = {'prompt': 'prompt'} - extra_keys_with_prompt = {'prompt': 'prompt', 'extra': 'extra'} - extra_keys_with_response = {'response': 'response', 'extra': 'extra'} - multiple_allowed_response_keys = { - 'prompt': 'prompt', - 'response': 'response', - 'completion': 'completion' - } - - malformed_examples = [ - no_keys, no_prompt_key, no_response_key, extra_keys_with_prompt, - extra_keys_with_response, multiple_allowed_response_keys - ] - - for example in malformed_examples: - with pytest.raises(KeyError): - _tokenize_formatted_example(example, MagicMock()) - - -def test_tokenize_example_well_formed(): - tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2') - - for prompt_key in _ALLOWED_PROMPT_KEYS: - for response_key in _ALLOWED_RESPONSE_KEYS: - example = {prompt_key: 'prompt', response_key: 'response'} - tokenized_example = _tokenize_formatted_example(example, tokenizer) - assert 'input_ids' in tokenized_example - assert 'labels' in tokenized_example - - @pytest.mark.parametrize('split', ['train', 'custom', 'data']) def test_finetuning_dataloader_custom_split(tmp_path: pathlib.Path, split: str): tokenizer_name = 'gpt2' diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py new file mode 100644 index 0000000000..258e491b32 --- /dev/null +++ b/tests/data/test_template_tokenization.py @@ -0,0 +1,163 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock + +import pytest +import transformers + +from llmfoundry.data.finetuning.tasks import (_ALLOWED_PROMPT_KEYS, + _ALLOWED_RESPONSE_KEYS, + _slice_chat_formatted_example, + _tokenize_formatted_example) +from llmfoundry.utils.builders import build_tokenizer + + +def test_tokenize_chat_example_malformed(): + no_content = {'messages': [{'role': 'user'}]} + too_few_messages = { + 'messages': [{ + 'role': 'assistant', + 'content': 'Hi, User!' + }] + } + ends_with_user_role = { + 'messages': [{ + 'role': 'user', + 'content': 'Hello GPT!' + }, { + 'role': 'assistant', + 'content': 'Hi, User!' + }, { + 'role': 'user', + 'content': 'user message not followed by an assistant label' + }] + } + no_assistant_message = { + 'messages': [{ + 'role': 'user', + 'content': 'Hello GPT!' + }, { + 'role': 'user', + 'content': 'user message not followed by an assistant label' + }] + } + malformed_chat_examples = [ + too_few_messages, no_content, ends_with_user_role, no_assistant_message + ] + my_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) + for example in malformed_chat_examples: + with pytest.raises(Exception): + _tokenize_formatted_example( + example, my_tokenizer + ) # type: ignore (the typing here is supposed to be malformed) + + +def test_tokenize_chat_example_well_formed(): + chat_examples = [ + { + 'messages': [{ + 'role': 'user', + 'content': 'Hello, GPT' + }, { + 'role': 'assistant', + 'content': 'this is my response' + }] + }, # prompt/response but in chat format + { + 'messages': [ + { + 'role': 'user', + 'content': 'Hello, GPT' + }, + { + 'role': 'assistant', + 'content': 'this is my response' + }, + { + 'role': 'user', + 'content': 'Nice to hear that.' + }, + { + 'role': 'assistant', + 'content': 'multi-way chat works too!' + }, + ] + }, # multi-way chat + ] + + expected = [ + { + 'prompt': + '''<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers. +<|im_start|>user +Hello, GPT<|im_end|> +<|im_start|>assistant +''', + 'response': + 'this is my response<|im_end|>' + }, + { + 'prompt': + '''<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers. +<|im_start|>user +Hello, GPT<|im_end|> +<|im_start|>assistant +this is my response<|im_end|> +<|im_start|>user +Nice to hear that.<|im_end|> +<|im_start|>assistant +''', + 'response': + 'multi-way chat works too!<|im_end|>' + }, + ] + + chat_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) + assert len(expected) == len( + chat_examples) # if we add a new example, zip shouldn't fail silently + for chat_example, expected_stringification in zip(chat_examples, expected): + prompt, response = _slice_chat_formatted_example( + chat_example, chat_tokenizer) + tokenized_example = _tokenize_formatted_example(chat_example, + chat_tokenizer) + assert prompt == expected_stringification['prompt'] + assert response == expected_stringification['response'] + assert 'input_ids' in tokenized_example + assert 'labels' in tokenized_example + + +def test_tokenize_instruct_example_malformed(): + no_keys = {} + no_prompt_key = {'response': 'response'} + no_response_key = {'prompt': 'prompt'} + extra_keys_with_prompt = {'prompt': 'prompt', 'extra': 'extra'} + extra_keys_with_response = {'response': 'response', 'extra': 'extra'} + multiple_allowed_response_keys = { + 'prompt': 'prompt', + 'response': 'response', + 'completion': 'completion' + } + + malformed_prompt_response_examples = [ + no_keys, no_prompt_key, no_response_key, extra_keys_with_prompt, + extra_keys_with_response, multiple_allowed_response_keys + ] + + for example in malformed_prompt_response_examples: + with pytest.raises(KeyError): + _tokenize_formatted_example(example, MagicMock()) + + +def test_tokenize_instruct_example_well_formed(): + tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2') + + for prompt_key in _ALLOWED_PROMPT_KEYS: + for response_key in _ALLOWED_RESPONSE_KEYS: + + example = {prompt_key: 'prompt', response_key: 'response'} + tokenized_example = _tokenize_formatted_example(example, tokenizer) + assert 'input_ids' in tokenized_example + assert 'labels' in tokenized_example diff --git a/tests/models/layers/test_flash_triton_torch.py b/tests/models/layers/test_flash_triton_torch.py index 2f992cd92f..d409486cc6 100644 --- a/tests/models/layers/test_flash_triton_torch.py +++ b/tests/models/layers/test_flash_triton_torch.py @@ -28,7 +28,11 @@ def allclose_helper(t0: torch.Tensor, ('triton', 'torch'), ]) @pytest.mark.parametrize('clip_qkv', [True, False]) -@pytest.mark.parametrize('qk_ln', [True, False]) +@pytest.mark.parametrize('qk_ln, qk_gn', [ + (True, False), + (False, True), + (False, False), +]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -64,6 +68,7 @@ def test_attn_impl(attn_impl_0: str, attn_impl_1: str, clip_qkv: bool, qk_ln: bool, + qk_gn: bool, pos_emb_config: dict, attn_type: str, attn_uses_sequence_id: bool, @@ -71,8 +76,8 @@ def test_attn_impl(attn_impl_0: str, device: str = 'cuda'): """Compare all attn impl with each other. - Includes testing with and without attn_clip_qkv, attn_qk_ln, alibi, and - rope. + Includes testing with and without attn_clip_qkv, attn_qk_ln, attn_qk_gn, + alibi, and rope. """ alibi = pos_emb_config['alibi'] rope = pos_emb_config['rope'] @@ -100,6 +105,7 @@ def test_attn_impl(attn_impl_0: str, 'attn_pdrop': 0, 'clip_qkv': clip_qkv, 'qk_ln': qk_ln, + 'qk_gn': qk_gn, }) n, s, f = 2, 4, cfg.d_model @@ -269,7 +275,8 @@ def gen_bias(attn_impl: str): 'rope_impl'] == 'hf' # special case that (likely) fails due to numerics - if clip_qkv and qk_ln and using_hf_rope and attn_type == 'grouped_query_attention': + if (clip_qkv and (qk_ln or qk_gn) and using_hf_rope and + attn_type == 'grouped_query_attention'): assert allclose_helper(p.grad, tp.grad, atol=2.e-2, rtol=2.e-2) else: assert allclose_helper(p.grad, tp.grad) diff --git a/tests/tokenizers/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py index 6a4d1c99c4..aca269af82 100644 --- a/tests/tokenizers/test_tiktoken.py +++ b/tests/tokenizers/test_tiktoken.py @@ -108,6 +108,12 @@ 'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.', 'role': 'user' +}, { + 'content': 'You should go outside and touch grass.', + 'role': 'assistant' +}, { + 'content': 'What else can I do?', + 'role': 'user' }]] MULTI_TURN_GENERATE_STRING = [ @@ -118,6 +124,10 @@ Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> <|im_start|>assistant +You should go outside and touch grass.<|im_end|><|endoftext|> +<|im_start|>user +What else can I do?<|im_end|> +<|im_start|>assistant """ ] diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py index 9be6630075..303afc9b7d 100644 --- a/tests/utils/test_builders.py +++ b/tests/utils/test_builders.py @@ -135,14 +135,14 @@ def test_build_logger(): with pytest.raises(ValueError): _ = build_logger('unknown', {}) - logger_cfg = DictConfig({ + logger_cfg = { 'project': 'foobar', 'init_kwargs': { 'config': { 'foo': 'bar', } } - }) + } wandb_logger = build_logger('wandb', logger_cfg) # type: ignore assert isinstance(wandb_logger, WandBLogger) assert wandb_logger.project == 'foobar' diff --git a/tests/utils/test_model_download_utils.py b/tests/utils/test_model_download_utils.py index 471a39dcdb..08e11a3b0e 100644 --- a/tests/utils/test_model_download_utils.py +++ b/tests/utils/test_model_download_utils.py @@ -110,6 +110,8 @@ def test_download_from_hf_hub_weights_pref(mock_list_repo_files: MagicMock, mock_snapshot_download.assert_called_once_with( test_repo_id, local_dir=save_dir, + local_dir_use_symlinks=False, + allow_patterns=None, ignore_patterns=expected_ignore_patterns, token=None)