From 613457a1cb426ddd601b1b7ee44430be0d8f5ff7 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 27 Nov 2023 15:51:30 -0800 Subject: [PATCH 1/5] Bump composer version to min 0.17.1 (#762) --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index afdfce8d48..9bf2ef2cb0 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17,<0.18', + 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.1,<0.18', 'accelerate>=0.20,<0.21', # for HF inference `device_map` 'transformers>=4.34.1,<4.35', 'mosaicml-streaming>=0.7.1,<0.8', @@ -84,11 +84,11 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.17,<0.18', + 'mosaicml[databricks]>=0.17.1,<0.18', ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.17,<0.18', + 'mosaicml[tensorboard]>=0.17.1,<0.18', ] extra_deps['gpu'] = [ From 34d04ea689a4e4b08af7e9e911338fd4bc2983c1 Mon Sep 17 00:00:00 2001 From: bandish-shah <86627118+bandish-shah@users.noreply.github.com> Date: Mon, 27 Nov 2023 20:52:14 -0800 Subject: [PATCH 2/5] Update Docker image release logic so that we can release new images to prod from workflow_dispatch (#763) --- .github/workflows/docker.yaml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 13a835356c..f6dac79fe5 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -69,19 +69,17 @@ jobs: GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7) echo "IMAGE_TAG=${GIT_SHA}" >> ${GITHUB_ENV} - if [ "${{ github.event_name }}" == "push" ]; then - echo "Triggered by push event." 
- PROD_REPO="mosaicml/llm-foundry" - IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest" - IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache" - elif [ "${{ github.event_name }}" == "pull_request" ]; then + if [ "${{ github.event_name }}" == "pull_request" ]; then echo "Triggered by pull_request event." STAGING_REPO="mosaicml/ci-staging" IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}" IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache" else - echo "Triggered by unknown event: ${{ github.event_name }}" - exit 1 + # Triggered by push or workflow_dispatch event + echo "Triggered by ${{ github.event_name }} event, releasing to prod" + PROD_REPO="mosaicml/llm-foundry" + IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest" + IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache" fi echo "IMAGE_TAG=${IMAGE_TAG}" >> ${GITHUB_ENV} From 4f399bf5895b52490c1e43cd0f7d1492724bfa47 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 28 Nov 2023 12:54:45 -0800 Subject: [PATCH 3/5] Fix tiktoken wrapper (#761) --- llmfoundry/tokenizers/tiktoken.py | 169 ++++++++++++++---------------- tests/test_tiktoken.py | 62 ++++++----- 2 files changed, 111 insertions(+), 120 deletions(-) diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 8e258cce74..6110f565df 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -1,8 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - -import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from functools import lru_cache +from typing import Any, Dict, List, Optional, Tuple import torch from transformers import PreTrainedTokenizer @@ -10,6 +9,38 @@ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. 
Always answer as helpfully as possible.""" +# Taken from +# https://github.com/huggingface/transformers/blob/8aca43bdb3cb9a5020f6d57589d85679dc873b1c/src/transformers/models/gpt2/tokenization_gpt2.py#L62-L84 +@lru_cache() +def bytes_to_unicode(): + """Returns a mapping from utf-8 bytes to unicode strings. + + We specifically avoid mapping to whitespace/control characters the bpe code + barfs on. + + The reversible bpe codes work on unicode strings. This means you need a + large # of unicode characters in your vocab if you want to avoid UNKs. When + you're at something like a 10B token dataset you end up needing around 5K + for decent coverage. This is a significant percentage of your normal, say, + 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and + unicode strings. + """ + bs = (list(range(ord('!'), + ord('~') + 1)) + list(range(ord('¡'), + ord('¬') + 1)) + + list(range(ord('®'), + ord('ÿ') + 1))) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + class TiktokenTokenizerWrapper(PreTrainedTokenizer): """A thin wrapper around tiktoken to make it compatible with Hugging Face. 
@@ -93,6 +124,28 @@ def pickle_Encoding(enc: Encoding): self.add_eos_token = add_eos_token self.use_default_system_prompt = use_default_system_prompt + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + + self.decoder: Dict[int, str] = {} + for i in range(self.encoding.n_vocab): + try: + self.encoding.decode_single_token_bytes(i) + except KeyError: + continue + # Taken from + # https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee + decoding = ''.join([ + bytes_to_unicode()[ord(char)] for char in + self.encoding.decode_single_token_bytes(i).decode('latin-1') + ]) + self.decoder[i] = decoding + + self.encoder: Dict[str, int] = {} + for i in range(self.encoding.n_vocab): + if i in self.decoder: + self.encoder[self.decoder[i]] = i + super().__init__(model_name=model_name, encoding_name=encoding_name, add_bos_token=add_bos_token, @@ -135,122 +188,54 @@ def default_chat_template(self): return template def get_vocab(self) -> Dict[str, int]: - """Returns vocab as a dict. - - Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers. - Most uses do not need to use get_vocab, so this is not a priority to fix. - """ - warnings.warn( - 'get_vocab does not work properly with TiktokenTokenizerWrapper. Please do not rely on it being perfectly correct.' - + - ' It will be called once init just to get the size of the vocab inside the base class.' 
- ) - - vocab = {} - for i in range(self.vocab_size): - try: - # need to try this first, so that we get a proper KeyError, - # otherwise it crashes in the rust code - _ = self.encoding.decode_single_token_bytes(i) - vocab[self.encoding.decode([i])] = i - except KeyError: - pass - + """Returns vocab as a dict.""" # As far as I can tell, we don't require get_vocab to completely work, # but when using additional_special_tokens, Hugging Face determines the next # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct. + vocab_clone = self.encoder.copy() extra_id_index = 0 candidate_extra_id = f'' indices_to_fill_in = {i for i in range(self.vocab_size)} - set( - vocab.values()) + vocab_clone.values()) # Add enough indices to make get_vocab() the right length for index_to_add in indices_to_fill_in: # Make sure we don't overwrite a token that already exists - while candidate_extra_id in vocab: + while candidate_extra_id in vocab_clone: extra_id_index += 1 candidate_extra_id = f'' # Get an index to add and add the item - vocab[candidate_extra_id] = index_to_add + vocab_clone[candidate_extra_id] = index_to_add - return vocab + return vocab_clone - def _tokenize(self, text: str) -> List[int]: - """Returns a tokenized string. - - Note: We have slightly redefined the expected contract between this method and - the _convert_token_to_id method. Normally, this method turns a string, into a list of strings, - and then the _convert_token_to_id method turns that list of strings into a list of integers. - However, not all vocab indices can be decoded into a string, so instead we just return the integers - from this function, and have adjusted the _convert_token_to_id method to handle integers as well as strings. - The only use of _tokenize that I could find was in this way, so this _should_ be safe. 
- """ + def _tokenize(self, text: str) -> List[str]: + """Returns a tokenized string.""" if not isinstance(text, str): raise ValueError( f'Expected a string input to _tokenize but got {type(text)}.') - tokens = [t for t in self.encoding.encode(text, allowed_special='all')] + tokens = [ + self.decoder[t] + for t in self.encoding.encode(text, allowed_special='all') + ] return tokens - def _convert_token_to_id(self, token: Union[int, str]) -> int: - """Converts a token (str) into an id using the vocab.""" - if isinstance(token, int): - return token + def _convert_token_to_id(self, token: str) -> Optional[int]: + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) - return self.encoding.encode(token, allowed_special='all')[0] - - def _convert_id_to_token(self, index: int) -> str: - """Converts an index (integer) into a token (str) using the vocab.""" - return self.encoding.decode([index]) + def _convert_id_to_token(self, index: int) -> Optional[str]: + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) def convert_tokens_to_string(self, tokens: List[str]) -> str: """Converts a sequence of tokens (string) in a single string.""" - return ''.join(tokens) - - def convert_ids_to_tokens( - self, - ids: Union[int, List[int]], - skip_special_tokens: bool = False) -> Union[str, List[str]]: - """Converts a single index or a sequence of indices into a token or a. - - sequence of tokens, using the vocabulary and added tokens. - - Args: - ids (`int` or `List[int]`): - The token id (or token ids) to convert to tokens. - skip_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not to remove special tokens in the decoding. - - Returns: - `str` or `List[str]`: The decoded token(s). 
- """ - if isinstance(ids, int): - if ids in self.added_tokens_decoder: - return str(self.added_tokens_decoder[ids]) - - return self._convert_id_to_token(ids) - - # current_stream will collect multiple tokens, and then separately add items - # for each added token. This is done so that decode works properly with token ids - # that cannot be represented naively in utf-8. - tokens = [] - current_stream = [] - for index in ids: - if skip_special_tokens and index in self.all_special_ids: - continue - - if index in self.added_tokens_decoder: - tokens.append(self.encoding.decode(current_stream)) - current_stream = [] - tokens.append(str(self.added_tokens_decoder[index])) - else: - current_stream.append(index) - - if len(current_stream) > 0: - tokens.append(self.encoding.decode(current_stream)) - return tokens + text = ''.join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8') + return text def build_inputs_with_special_tokens( self, diff --git a/tests/test_tiktoken.py b/tests/test_tiktoken.py index 5bd10c82d3..fe3db41d50 100644 --- a/tests/test_tiktoken.py +++ b/tests/test_tiktoken.py @@ -7,7 +7,8 @@ import pytest import transformers -from llmfoundry import TiktokenTokenizerWrapper +from llmfoundry.tokenizers.tiktoken import (TiktokenTokenizerWrapper, + bytes_to_unicode) from tests.horrible_strings import HORRIBLE_STRINGS from tests.test_hf_conversion_script import check_hf_tokenizer_equivalence @@ -29,18 +30,12 @@ TEST_STRINGS += HORRIBLE_STRINGS -MODEL_OR_ENCODING_NAME_TO_NON_UTF8_TOKENS = { - 'gpt-4': 77, - 'gpt-3.5-turbo': 77, - 'text-davinci-003': 14, - 'cl100k_base': 77, -} - MODEL_ENCODING_NAME_PARAMETRIZATION = [ ('gpt-4', None), ('gpt-3.5-turbo', None), ('text-davinci-003', None), (None, 'cl100k_base'), + ('gpt2', None), ] MULTI_TURN_CHAT_ML = [[{ @@ -120,6 +115,31 @@ def test_tiktoken_simple(model_name: Optional[str], assert reloaded_wrapped_output == wrapped_output +@pytest.mark.parametrize('model_name,encoding_name', + 
MODEL_ENCODING_NAME_PARAMETRIZATION) +def test_tiktoken_tokenize_with_ids(model_name: Optional[str], + encoding_name: Optional[str], + tmp_path: pathlib.Path): + wrapped_tokenizer, reloaded_wrapped_tokenizer, original_tokenizer = get_tokenizers_for_testing( + model_name, encoding_name, tmp_path) + + for string in TEST_STRINGS: + wrapped_output = wrapped_tokenizer.tokenize(string) + original_output = original_tokenizer.encode(string, + allowed_special='all') + reloaded_wrapped_output = reloaded_wrapped_tokenizer.tokenize(string) + + assert all([isinstance(t, str) for t in wrapped_output]) + assert len(wrapped_output) == len(original_output) + assert wrapped_output == reloaded_wrapped_output + + redone_token_ids = wrapped_tokenizer.convert_tokens_to_ids( + wrapped_output) + assert redone_token_ids == original_output + assert wrapped_tokenizer.convert_ids_to_tokens( + redone_token_ids) == wrapped_output + + @pytest.mark.parametrize('model_name,encoding_name', MODEL_ENCODING_NAME_PARAMETRIZATION) def test_tiktoken_roundtrip(model_name: Optional[str], @@ -201,31 +221,17 @@ def test_tiktoken_vocab(model_name: Optional[str], encoding_name: Optional[str], reloaded_wrapped_vocab = reloaded_wrapped_tokenizer.get_vocab() assert wrapped_vocab == reloaded_wrapped_vocab - didnt_match = [] for key, value in wrapped_vocab.items(): # Skip checking the extra ids we pad the vocab with if key.startswith(''): continue - if original_tokenizer.encode(key, allowed_special='all') == [value]: - continue - else: - didnt_match.append( - (key, original_tokenizer.encode(key, - allowed_special='all'), value)) - - # Decode is lossy because some bytes are not representable in utf-8 - # see https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/core.py#L245-L247 - # This means that the str: int vocab mapping doesn't work. Would have to look more into how other HF tokenizers handle this. 
- model_or_encoding_name = model_name or encoding_name - if model_or_encoding_name is not None: - expected_didnt_match = MODEL_OR_ENCODING_NAME_TO_NON_UTF8_TOKENS.get( - model_or_encoding_name) - assert len(didnt_match) == expected_didnt_match - else: - raise NotImplementedError( - 'Add the new tokenizer and how many tokens in the vocab are not utf8 representable.' - ) + expected_decoding = ''.join([ + bytes_to_unicode()[ord(char)] + for char in original_tokenizer.decode_single_token_bytes( + value).decode('latin-1') + ]) + assert expected_decoding == key @pytest.mark.parametrize('model_name,encoding_name', From 5f21855cb35987ec73fe8b3a515f6ae3db903d56 Mon Sep 17 00:00:00 2001 From: Vitaliy Chiley <6439018+vchiley@users.noreply.github.com> Date: Tue, 28 Nov 2023 16:18:18 -0800 Subject: [PATCH 4/5] enable param group configuration in llm-foundry (#760) * enable param group configuration in llm-foundry * add doc string * add debug logs * add test, fix bug * spell check; mark test gpu * updt to use RegEx search * Apply suggestions from code review Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * updt with dakinggg pr comments --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/optim/lion8b.py | 22 ++++--- llmfoundry/utils/builders.py | 118 +++++++++++++++++++++++++++++++++-- tests/test_builders.py | 89 +++++++++++++++++++++++++- 3 files changed, 211 insertions(+), 18 deletions(-) diff --git a/llmfoundry/optim/lion8b.py b/llmfoundry/optim/lion8b.py index 2c2e6e2d35..9d1d1dda71 100644 --- a/llmfoundry/optim/lion8b.py +++ b/llmfoundry/optim/lion8b.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Iterable, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union import torch @@ -58,15 +58,17 @@ class DecoupledLionW_8bit(torch.optim.Optimizer): device, or b) step() is executed 
on a non-CUDA parameter. """ - def __init__(self, - params: Iterable[torch.Tensor], - lr: float = 1e-3, - betas: Tuple[float, float] = (0.9, 0.99), - weight_decay: float = 0, - quantize: bool = True, - compress_state_dict: bool = False, - error_correction: bool = False, - _fused: bool = True): # XXX this flag is mostly for testing... + def __init__( + self, + params: Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]], + lr: float = 1e-3, + betas: Tuple[float, float] = (0.9, 0.99), + weight_decay: float = 0, + quantize: bool = True, + compress_state_dict: bool = False, + error_correction: bool = False, + _fused: bool = True, # XXX this flag is mostly for testing... + ): if lr < 0.0: raise ValueError('Invalid learning rate: {}'.format(lr)) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index c31917efc6..14196c3ef9 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -1,10 +1,13 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import functools import logging import os +import re import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch from composer import algorithms @@ -155,18 +158,121 @@ def build_algorithm(name: str, kwargs: Dict[str, Any]) -> Algorithm: raise ValueError(f'Not sure how to build algorithm: {name}') +def _extract_param_groups( + model: torch.nn.Module, + optimizer_config: Dict[str, Any], +) -> Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]]: + """Extracts parameter groups defined in the optimizer config. + + The optimizer_config defines the optimizer args. It can additionally have key + `disable_grad` which is a string or list of strings. If a string matches a + parameter name, then that parameter will have `requires_grad=False`. This is + useful for freezing parameters. 
It can additionally have a key + `param_groups` which is a list of dicts. In this dict, key `param_str_match` + defines a regex; if `re.search` finds it in a parameter name, that parameter will be + in this parameter group. This is useful for grouping parameters together. + The dict can also contain any other key that is a valid optimizer arg. + Note: to handle name overlap conflicts, params are assigned to parameter + groups and added to `param_groups` in the order that `param_str_match` appears + in `param_groups`. + + Usage + To disable gradient for all parameters that contain the string "norm" or "bias": + ``` + optimizer_config: { + "name": "decoupled_lionw", + "lr": 1e-3, + "weight_decay": 1e-2, + "betas": [0.9, 0.999], + "eps": 1e-8, + "disable_grad": ["norm", "bias"] + } + ``` + + To create and modify the optimizer parameters for all parameters that contain + the string "norm" and "bias" separately: + ``` + optimizer_config: { + "name": "decoupled_lionw", + "lr": 1e-3, + "weight_decay": 1e-2, + "betas": [0.9, 0.999], + "eps": 1e-8, + "param_groups": [ + { + "param_str_match": "norm", + "lr": 1e-4, + "weight_decay": 0.0, + }, + { + "param_str_match": "bias", + "lr": 5e-4, + "weight_decay": 0.0, + }, + ], + } + ``` + + Args: + model (torch.nn.Module): model to extract parameters from + optimizer_config (Dict[str, Any]): optimizer config + + Returns: + Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]]: an iterable of + torch.Tensor's or dict's. Specifies what Tensors should be optimized + and their param groupings. 
+ """ + if 'disable_grad' in optimizer_config.keys(): + str_matches = optimizer_config.pop('disable_grad') + if isinstance(str_matches, str): + str_matches = [str_matches] + for str_match in str_matches: + for n, p in model.named_parameters(): + if re.search(str_match, n): + p.requires_grad = False + log.debug(f'Setting `{n}.requires_grad = False`.') + + param_groups_config = optimizer_config.pop('param_groups', None) + if param_groups_config is not None: + params = [] + param_dict = OrderedDict((n, p) for n, p in model.named_parameters()) + + log.debug(f'Default optimizer settings: {optimizer_config}.') + for param_group_config in param_groups_config: + str_match = param_group_config.pop('param_str_match') + filter_fn = functools.partial(re.search, str_match) + param_names = [n for n in param_dict.keys() if filter_fn(n)] + group_params = {'params': [param_dict.pop(n) for n in param_names]} + group_params.update(param_group_config) + + log.debug( + f'Creating optimizer param_group with parameters: {param_names} ' +\ + f'(extracted using {str_match=}). 
The param_group optimizer ' +\ + f'setting overrides are: {param_group_config}.') + + params.append(group_params) + + params.insert(0, {'params': param_dict.values()}) + return params + + return model.parameters() + + def build_optimizer(model: torch.nn.Module, name: str, optimizer_config: Dict[str, Any]) -> Optimizer: + + params = _extract_param_groups(model, optimizer_config) + if name == 'decoupled_adamw': - return DecoupledAdamW(model.parameters(), **optimizer_config) + return DecoupledAdamW(params, **optimizer_config) elif name == 'decoupled_lionw': - return DecoupledLionW(model.parameters(), **optimizer_config) + return DecoupledLionW(params, **optimizer_config) elif name == 'clip_lion': - return DecoupledClipLion(model.parameters(), **optimizer_config) + return DecoupledClipLion(params, **optimizer_config) elif name == 'adalr_lion': - return DecoupledAdaLRLion(model.parameters(), **optimizer_config) + return DecoupledAdaLRLion(params, **optimizer_config) elif name == 'decoupled_lionw_8b': - return DecoupledLionW_8bit(model.parameters(), **optimizer_config) + return DecoupledLionW_8bit(params, **optimizer_config) else: raise ValueError(f'Not sure how to build optimizer: {name}') diff --git a/tests/test_builders.py b/tests/test_builders.py index 237e27b52b..7ac179720e 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -1,17 +1,22 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import re import unittest.mock as mock -from typing import Union +from copy import deepcopy +from typing import Any, Dict, Union import pytest +import torch +import torch.nn as nn from composer.callbacks import Generate from omegaconf import OmegaConf as om from transformers import PreTrainedTokenizerBase from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.builders import build_callback, build_tokenizer +from llmfoundry.utils.builders 
import (build_callback, build_optimizer, + build_tokenizer) @pytest.mark.parametrize('tokenizer_name,tokenizer_kwargs', [ @@ -110,3 +115,83 @@ def test_build_hf_checkpointer_callback(): assert isinstance(kwargs['mlflow_logging_config'], dict) assert isinstance(kwargs['mlflow_logging_config']['metadata'], dict) assert kwargs['mlflow_logging_config'] == mlflow_logging_config_dict + + +class _DummyModule(nn.Module): + + def __init__(self, device: str = 'cpu', dtype: torch.dtype = torch.float32): + super().__init__() + self.linear0 = nn.Linear(4, 3, device=device, dtype=dtype) + self.norm0 = nn.LayerNorm(3, device=device, dtype=dtype) + self.linear1 = nn.Linear(3, 5, device=device, dtype=dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type:ignore + return self.linear1(self.norm0(self.linear0(x))) + + +@pytest.mark.parametrize('name, optimizer_config', [ + ('decoupled_adamw', {}), + ('decoupled_lionw', {}), + ('clip_lion', {}), + ('adalr_lion', {}), + pytest.param('decoupled_lionw_8b', {}, marks=pytest.mark.gpu), +]) +@pytest.mark.parametrize('opt_additional_config', [ + { + 'disable_grad': 'norm' + }, + { + 'disable_grad': ['norm', 'bias'] + }, + { + 'param_groups': [{ + 'param_str_match': 'norm', + 'lr': 1e-9, + 'weight_decay': 0.0, + },] + }, + { + 'param_groups': [{ + 'param_str_match': 'no.*.bias', + 'lr': 1e-9, + 'weight_decay': 0.0, + },] + }, + { + 'param_groups': [{ + 'param_str_match': 'norm', + 'lr': 1e-4, + 'weight_decay': 0.0, + },], + 'disable_grad': ['bias'], + }, +]) +def test_build_optimizer(name: str, optimizer_config: Dict[str, Any], + opt_additional_config: Dict[str, Any]): + model = _DummyModule() + optimizer_config.update(deepcopy(opt_additional_config)) + optimizer = build_optimizer(model, name, optimizer_config) + + if 'disable_grad' in opt_additional_config.keys(): + disable_grad = opt_additional_config['disable_grad'] + if isinstance(disable_grad, str): + disable_grad = [disable_grad] + for n, p in model.named_parameters(): + 
for k in disable_grad: + if re.search(k, n): + assert not p.requires_grad + + if 'param_groups' in opt_additional_config.keys(): + for param_group_config, param_group in zip( + opt_additional_config['param_groups'], + optimizer.param_groups[1:]): + param_group_config = deepcopy(param_group_config) + param_str_match = param_group_config.pop('param_str_match') + + for k, v in param_group_config.items(): + assert param_group[k] == v + + param_ids = [id(p) for p in param_group['params']] + for n, p in model.named_parameters(): + if re.search(param_str_match, n): + assert id(p) in param_ids From 3a96b69965189876ff3bccceebb26d991e9bea72 Mon Sep 17 00:00:00 2001 From: Anna Date: Wed, 29 Nov 2023 10:29:07 -0800 Subject: [PATCH 5/5] Add script for doing bulk generation against an endpoint (#765) * Add script for doing bulk generation against an endpoint * more logging * warn * fix * format * asdfads * Add warning * updates * folder -> file * remove blank line * Support remote input * prompts -> inputs --- llmfoundry/utils/prompt_files.py | 58 +++++++ scripts/inference/endpoint_generate.py | 223 +++++++++++++++++++++++++ scripts/inference/hf_generate.py | 31 ++-- tests/test_prompt_files.py | 18 ++ 4 files changed, 309 insertions(+), 21 deletions(-) create mode 100644 llmfoundry/utils/prompt_files.py create mode 100644 scripts/inference/endpoint_generate.py create mode 100644 tests/test_prompt_files.py diff --git a/llmfoundry/utils/prompt_files.py b/llmfoundry/utils/prompt_files.py new file mode 100644 index 0000000000..40de19907a --- /dev/null +++ b/llmfoundry/utils/prompt_files.py @@ -0,0 +1,58 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import List, Optional + +PROMPTFILE_PREFIX = 'file::' + + +def load_prompts(prompts: List[str], + prompt_delimiter: Optional[str] = None) -> List[str]: + """Loads a set of prompts, both free text and from file. 
+ + Args: + prompts (List[str]): List of free text prompts and prompt files + prompt_delimiter (Optional str): Delimiter for text file + If not provided, assumes the prompt file is a single prompt (non-delimited) + + Returns: + List of prompt string(s) + """ + prompt_strings = [] + for prompt in prompts: + if prompt.startswith(PROMPTFILE_PREFIX): + prompts = load_prompts_from_file(prompt, prompt_delimiter) + prompt_strings.extend(prompts) + else: + prompt_strings.append(prompt) + return prompt_strings + + +def load_prompts_from_file(prompt_path: str, + prompt_delimiter: Optional[str] = None) -> List[str]: + """Load a set of prompts from a text file. + + Args: + prompt_path (str): Path for text file + prompt_delimiter (Optional str): Delimiter for text file + If not provided, assumes the prompt file is a single prompt (non-delimited) + + Returns: + List of prompt string(s) + """ + if not prompt_path.startswith(PROMPTFILE_PREFIX): + raise ValueError(f'prompt_path_str must start with {PROMPTFILE_PREFIX}') + + _, prompt_file_path = prompt_path.split(PROMPTFILE_PREFIX, maxsplit=1) + prompt_file_path = os.path.expanduser(prompt_file_path) + if not os.path.isfile(prompt_file_path): + raise FileNotFoundError( + f'{prompt_file_path=} does not match any existing files.') + + with open(prompt_file_path, 'r') as f: + prompt_string = f.read() + + if prompt_delimiter is None: + return [prompt_string] + return [i for i in prompt_string.split(prompt_delimiter) if i] diff --git a/scripts/inference/endpoint_generate.py b/scripts/inference/endpoint_generate.py new file mode 100644 index 0000000000..e78fecf59b --- /dev/null +++ b/scripts/inference/endpoint_generate.py @@ -0,0 +1,223 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Batch generate text completion results from an endpoint. 
+
+Warning: This script is experimental and could change or be removed at any time
+"""
+
+import asyncio
+import copy
+import logging
+import math
+import os
+import shutil
+import tempfile
+import time
+from argparse import ArgumentParser, Namespace
+
+import pandas as pd
+import requests
+from composer.utils import (get_file, maybe_create_object_store_from_uri,
+                            parse_uri)
+
+from llmfoundry.utils import prompt_files as utils
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+log = logging.getLogger(__name__)
+
+# Environment variables read by main() for the API key and the endpoint URL.
+ENDPOINT_API_KEY_ENV: str = 'ENDPOINT_API_KEY'
+ENDPOINT_URL_ENV: str = 'ENDPOINT_URL'
+
+# Default value of --prompt-delimiter.
+PROMPT_DELIMITER = '\n'
+
+
+def parse_args() -> Namespace:
+    """Parse commandline arguments.
+
+    Returns:
+        Namespace: The parsed input/output, rate/batch, endpoint and sampling
+            options.
+    """
+    parser = ArgumentParser(
+        description='Call prompts against a text completions endpoint')
+
+    #####
+    # Path Parameters
+    # Required: main() iterates over the inputs unconditionally.
+    parser.add_argument(
+        '-i',
+        '--inputs',
+        nargs='+',
+        required=True,
+        help=('List of strings, local datafiles (starting with '
+              f'{utils.PROMPTFILE_PREFIX}), and/or remote object stores'))
+    parser.add_argument(
+        '--prompt-delimiter',
+        default=PROMPT_DELIMITER,
+        help=
+        'Prompt delimiter for txt files. By default, files are split on newlines'
+    )
+
+    parser.add_argument('-o',
+                        '--output-folder',
+                        required=True,
+                        help='Remote folder to save the output')
+
+    #####
+    # Generation Parameters
+    parser.add_argument(
+        '--rate-limit',
+        type=int,
+        default=75,
+        help='Max number of calls to make to the endpoint in a second')
+    parser.add_argument(
+        '--batch-size',
+        type=int,
+        default=10,
+        help='Max number of calls to make to the endpoint in a single request')
+
+    #####
+    # Endpoint Parameters
+    parser.add_argument(
+        '-e',
+        '--endpoint',
+        type=str,
+        help=('OpenAI-compatible text completions endpoint to query on. '
+              f'If not set, will read from {ENDPOINT_URL_ENV}'))
+
+    parser.add_argument('--max-tokens', type=int, default=100)
+    parser.add_argument('--temperature', type=float, default=1.0)
+    parser.add_argument('--top-k', type=int, default=50)
+    parser.add_argument('--top-p', type=float, default=1.0)
+    return parser.parse_args()
+
+
+async def main(args: Namespace) -> None:
+    """Send every prompt to the completions endpoint and save a CSV of results.
+
+    Inputs are loaded with ``utils.load_prompts`` (inputs that resolve to an
+    object store are downloaded first), split into batches of
+    ``args.batch_size``, POSTed to the endpoint and awaited together with
+    ``asyncio.gather``. The results are written to ``output.csv``, which is
+    uploaded to ``args.output_folder`` if that is an object store URI and
+    moved to that local path otherwise.
+
+    Args:
+        args (Namespace): Options produced by :func:`parse_args`.
+
+    Raises:
+        ImportError: If ``aiohttp`` or ``ratelimit`` is not installed.
+        ValueError: If the batch size exceeds the rate limit, or if no endpoint
+            URL is given via ``--endpoint`` or the ``ENDPOINT_URL`` variable.
+        Exception: If the endpoint returns a non-OK or non-JSON response.
+    """
+    # This is mildly experimental, so for now imports are not added as part of llm-foundry
+    try:
+        import aiohttp
+    except ImportError as e:
+        raise ImportError('Please install aiohttp') from e
+
+    try:
+        from ratelimit import limits, sleep_and_retry
+    except ImportError as e:
+        raise ImportError('Please install ratelimit') from e
+
+    if args.batch_size > args.rate_limit:
+        raise ValueError(f'Batch size is {args.batch_size} but rate limit is '
+                         f'set to {args.rate_limit} / s')
+
+    url = args.endpoint if args.endpoint else os.environ.get(ENDPOINT_URL_ENV)
+    if not url:
+        raise ValueError(
+            f'URL must be provided via --endpoint or {ENDPOINT_URL_ENV}')
+
+    log.info(f'Using endpoint {url}')
+
+    api_key = os.environ.get(ENDPOINT_API_KEY_ENV, '')
+    if not api_key:
+        log.warning(f'API key not set in {ENDPOINT_API_KEY_ENV}')
+
+    new_inputs = []
+    for prompt in args.inputs:
+        if prompt.startswith(utils.PROMPTFILE_PREFIX):
+            new_inputs.append(prompt)
+            continue
+
+        input_object_store = maybe_create_object_store_from_uri(prompt)
+        if input_object_store is not None:
+            # Download to a file inside a fresh directory that stays on disk
+            # until the prompts are loaded below, instead of reusing the name
+            # of a TemporaryDirectory that has already been cleaned up.
+            local_output_path = os.path.join(tempfile.mkdtemp(), 'prompts')
+            get_file(prompt, str(local_output_path))
+            log.info(f'Downloaded {prompt} to {local_output_path}')
+            prompt = f'{utils.PROMPTFILE_PREFIX}{local_output_path}'
+
+        new_inputs.append(prompt)
+
+    prompt_strings = utils.load_prompts(new_inputs, args.prompt_delimiter)
+
+    cols = ['batch', 'prompt', 'output']
+    param_data = {
+        'max_tokens': args.max_tokens,
+        'temperature': args.temperature,
+        'top_k': args.top_k,
+        'top_p': args.top_p,
+    }
+
+    total_batches = math.ceil(len(prompt_strings) / args.batch_size)
+    log.info(
+        f'Generating {len(prompt_strings)} prompts in {total_batches} batches')
+
+    # NOTE(review): calls=total_batches equals the number of generate() calls
+    # made below, so this limit can never be exceeded; args.rate_limit is only
+    # used by the validation above. Presumably the limit should be derived
+    # from args.rate_limit -- confirm the intended units before changing it.
+    @sleep_and_retry
+    @limits(calls=total_batches, period=1)  # type: ignore
+    async def generate(session: aiohttp.ClientSession, batch: int,
+                       prompts: list):
+        """POST one batch of prompts and return its completions as a DataFrame.
+
+        Raises:
+            Exception: If the response is not OK or its body is not JSON.
+        """
+        data = copy.copy(param_data)
+        data['prompt'] = prompts
+        headers = {'Authorization': api_key, 'Content-Type': 'application/json'}
+        req_start = time.time()
+        async with session.post(url, headers=headers, json=data) as resp:
+            if resp.ok:
+                try:
+                    response = await resp.json()
+                except (aiohttp.ContentTypeError, ValueError) as e:
+                    # aiohttp raises ContentTypeError for a non-JSON content
+                    # type and json.JSONDecodeError (a ValueError) for a
+                    # malformed body; neither is a requests exception.
+                    raise Exception(
+                        f'Bad response: {resp.status} {resp.reason}') from e
+            else:
+                raise Exception(f'Bad response: {resp.status} {resp.reason}')
+
+        req_end = time.time()
+        n_compl = response['usage']['completion_tokens']
+        n_prompt = response['usage']['prompt_tokens']
+        req_latency = (req_end - req_start)
+        log.info(f'Completed batch {batch}: {n_compl:,} completion' +
+                 f' tokens using {n_prompt:,} prompt tokens in {req_latency}s')
+
+        res = pd.DataFrame(columns=cols)
+
+        # Each choice carries the index of the prompt it answers.
+        for r in response['choices']:
+            index = r['index']
+            res.loc[len(res)] = [batch, prompts[index], r['text']]
+        return res
+
+    gen_start = time.time()
+    async with aiohttp.ClientSession() as session:
+        tasks = []
+
+        for batch in range(total_batches):
+            # Slicing clamps at the end of the list, so the last batch may be
+            # shorter than args.batch_size.
+            start = batch * args.batch_size
+            prompts = prompt_strings[start:start + args.batch_size]
+            tasks.append(generate(session, batch, prompts))
+
+        results = await asyncio.gather(*tasks)
+        res = pd.concat(results)
+
+    res.reset_index(drop=True, inplace=True)
+
+    gen_end = time.time()
+    gen_latency = (gen_end - gen_start)
+    log.info(f'Generated {len(res)} prompts in {gen_latency}s, example data:')
+    log.info(res.head())
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        file = 'output.csv'
+        local_path = os.path.join(tmp_dir, file)
+        res.to_csv(local_path, index=False)
+
+        output_object_store = maybe_create_object_store_from_uri(
+            args.output_folder)
+        if output_object_store is not None:
+            _, _, output_folder_prefix = parse_uri(args.output_folder)
+            remote_path = os.path.join(output_folder_prefix, file)
+            output_object_store.upload_object(remote_path, local_path)
+            log.info(f'Uploaded results to {args.output_folder}/{file}')
+        else:
+            output_dir, _ = os.path.split(args.output_folder)
+            # A bare relative name has no directory part to create.
+            if output_dir:
+                os.makedirs(output_dir, exist_ok=True)
+            # shutil.move also works when the temporary directory and the
+            # destination are on different filesystems, and it places the file
+            # inside the destination when that is an existing directory.
+            shutil.move(local_path, args.output_folder)
+            log.info(f'Saved results to {args.output_folder}')
+
+
+if __name__ == '__main__':
+    asyncio.run(main(parse_args()))
diff --git a/scripts/inference/hf_generate.py b/scripts/inference/hf_generate.py index 45ddc6b63e..6ac645e5b7 100644 --- a/scripts/inference/hf_generate.py +++ b/scripts/inference/hf_generate.py @@ -1,7 +1,6 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 import itertools -import os import random import time import warnings @@ -13,6 +12,8 @@ import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from llmfoundry.utils import prompt_files as utils + def get_dtype(dtype: str): if dtype == 'fp32': @@ -62,9 +63,14 @@ def parse_args() -> Namespace: 'My name is', 'This is an explanation of deep learning to a five year old. Deep learning is', ], - help='Generation prompts. Use syntax "file::/path/to/prompt.txt" to load a ' +\ - 'prompt contained in a txt file.' + help='List of generation prompts or list of delimited files. Use syntax ' +\ + '"file::/path/to/prompt.txt" to load a prompt(s) contained in a txt file.' ) + parser.add_argument( + '--prompt-delimiter', + default=None, + help= + 'Prompt delimiter for txt files. 
By default, a file is a single prompt') parser.add_argument('--max_seq_len', type=int, default=None) parser.add_argument('--max_new_tokens', type=int, default=100) parser.add_argument('--max_batch_size', type=int, default=None) @@ -125,19 +131,6 @@ def parse_args() -> Namespace: return parser.parse_args() -def load_prompt_string_from_file(prompt_path_str: str): - if not prompt_path_str.startswith('file::'): - raise ValueError('prompt_path_str must start with "file::".') - _, prompt_file_path = prompt_path_str.split('file::', maxsplit=1) - prompt_file_path = os.path.expanduser(prompt_file_path) - if not os.path.isfile(prompt_file_path): - raise FileNotFoundError( - f'{prompt_file_path=} does not match any existing files.') - with open(prompt_file_path, 'r') as f: - prompt_string = ''.join(f.readlines()) - return prompt_string - - def maybe_synchronize(): if torch.cuda.is_available(): torch.cuda.synchronize() @@ -163,11 +156,7 @@ def main(args: Namespace) -> None: print(f'Using {model_dtype=}') # Load prompts - prompt_strings = [] - for prompt in args.prompts: - if prompt.startswith('file::'): - prompt = load_prompt_string_from_file(prompt) - prompt_strings.append(prompt) + prompt_strings = utils.load_prompts(args.prompts, args.prompt_delimiter) # Grab config first print(f'Loading HF Config...')
diff --git a/tests/test_prompt_files.py b/tests/test_prompt_files.py
new file mode 100644
index 0000000000..12a5d02999
--- /dev/null
+++ b/tests/test_prompt_files.py
@@ -0,0 +1,18 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+from pathlib import Path
+
+from llmfoundry.utils import prompt_files as utils
+
+
+def test_load_prompt_strings(tmp_path: Path):
+    """Plain strings pass through; prompt files are split on the delimiter."""
+    plain = ['hello', 'goodbye']
+    assert utils.load_prompts(list(plain)) == plain
+
+    prompt_file = tmp_path / 'prompts.txt'
+    prompt_file.write_text('hello goodbye')
+    file_ref = utils.PROMPTFILE_PREFIX + str(prompt_file)
+    loaded = utils.load_prompts([file_ref, file_ref, 'why'], ' ')
+    assert loaded == plain * 2 + ['why']