Skip to content

Commit

Permalink
Merge branch 'main' into shashank/seq_id_flash_attn
Browse files Browse the repository at this point in the history
  • Loading branch information
ShashankMosaicML authored Dec 1, 2023
2 parents fa2a2ee + 32dc3bd commit 8339cd3
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 32 deletions.
8 changes: 8 additions & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Require admin approval to modify all files in the root of the repository
# This includes setup.py, the README, and the CODEOWNERS file itself!
/* @mosaicml/composer-team-admins

# Require admin approval to change the CI build configuration
# All CI Changes should be reviewed for security
/.ci/ @mosaicml/composer-team-admins
/.github/ @mosaicml/composer-team-admins
6 changes: 3 additions & 3 deletions .github/mcp/mcp_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@
print(line, end='')

print('[GHA] Run completed. Waiting for run to finish...')
run = wait_for_run_status(run, status='completed')
run = wait_for_run_status(run, status=RunStatus.COMPLETED)

# Fail if command exited with non-zero exit code or timed out
assert run.status == RunStatus.COMPLETED
# Fail if command exited with non-zero exit code or timed out (didn't reach COMPLETED)
assert run.status == RunStatus.COMPLETED, f'Run did not complete: {run.status} ({run.reason})'
51 changes: 28 additions & 23 deletions llmfoundry/models/mpt/modeling_mpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def _apply_prefix_mask(self, attn_bias: torch.Tensor,

def forward(
self,
input_ids: torch.LongTensor,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.ByteTensor] = None,
prefix_mask: Optional[torch.ByteTensor] = None,
Expand Down Expand Up @@ -458,11 +458,6 @@ def forward(
'prefix_mask is a required argument when MPT is configured with prefix_lm=True.'
)

# Raise a not implemented error if input_embeds is not None (this is an arg in huggingface transformers and we need to support it for PEFT)
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds is not implemented for MPT.')

if self.training:
if self.attn_uses_sequence_id and sequence_id is None:
raise ValueError(
Expand All @@ -476,14 +471,25 @@ def forward(
'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.'
)

S = input_ids.size(1)
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
'You cannot specify both input_ids and inputs_embeds.')
elif input_ids is not None:
S = input_ids.size(1)
x = self.wte(input_ids)
input_device = input_ids.device
elif inputs_embeds is not None:
S = inputs_embeds.size(1)
x = inputs_embeds
input_device = inputs_embeds.device
else:
raise ValueError('You must specify input_ids or inputs_embeds')

assert (
S <= self.config.max_seq_len
), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'

rotary_emb_w_meta_info = None
x = self.wte(input_ids)
if self.learned_pos_emb or self.rope:
past_position = 0
if past_key_values is not None:
Expand Down Expand Up @@ -513,7 +519,7 @@ def forward(
past_position,
S + past_position,
dtype=torch.long,
device=input_ids.device,
device=input_device,
).unsqueeze(0)
if attention_mask is not None:
# adjust the position indices to account for padding tokens
Expand Down Expand Up @@ -704,7 +710,7 @@ def get_decoder(self) -> MPTModel:

def forward(
self,
input_ids: torch.LongTensor,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.ByteTensor] = None,
prefix_mask: Optional[torch.ByteTensor] = None,
Expand All @@ -721,11 +727,6 @@ def forward(
use_cache = (use_cache
if use_cache is not None else self.config.use_cache)

# if input_embeds is not none, raise a not implemented error
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds has to be None (for hf/peft support).')
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.transformer(
input_ids=input_ids,
past_key_values=past_key_values,
Expand All @@ -736,6 +737,7 @@ def forward(
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
use_cache=use_cache,
inputs_embeds=inputs_embeds,
)

if self.lm_head is not None:
Expand Down Expand Up @@ -825,10 +827,6 @@ def prepare_inputs_for_generation(
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: Any,
) -> Dict[str, Any]:
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds is not implemented for MPT yet')

attention_mask = kwargs['attention_mask'].bool()
if attention_mask[:, -1].sum() != attention_mask.shape[0]:
raise NotImplementedError(
Expand All @@ -839,6 +837,7 @@ def prepare_inputs_for_generation(
else:
sequence_id = None

# only last token for inputs_ids if past is defined in kwargs
if past_key_values is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)

Expand All @@ -852,14 +851,20 @@ def prepare_inputs_for_generation(
else:
prefix_mask = None

return {
'input_ids': input_ids,
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {'inputs_embeds': inputs_embeds}
else:
model_inputs = {'input_ids': input_ids}

model_inputs.update({
'attention_mask': attention_mask,
'prefix_mask': prefix_mask,
'sequence_id': sequence_id,
'past_key_values': past_key_values,
'use_cache': kwargs.get('use_cache', True),
}
})
return model_inputs

@staticmethod
def _reorder_cache(
Expand Down Expand Up @@ -950,7 +955,7 @@ def forward(self, batch: MutableMapping) -> CausalLMOutputWithPast:
add_bidirectional_mask_if_missing(batch)
# Note: prefix_mask is only used if model.prefix_lm is True
return self.model(
input_ids=batch['input_ids'],
input_ids=batch.get('input_ids', None),
attention_mask=batch.get('attention_mask', None),
prefix_mask=batch.get('bidirectional_mask', None),
sequence_id=batch.get('sequence_id', None),
Expand Down
79 changes: 73 additions & 6 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import pathlib
import warnings
from typing import Any, Dict, Union, cast
from typing import Any, Dict, List, Optional, Union, cast
from unittest import mock

import pytest
Expand Down Expand Up @@ -94,13 +94,26 @@ def get_objs(conf_path: str = 'scripts/train/yamls/pretrain/testing.yaml'):
return test_cfg, model, optimizer


def gen_random_batch(batch_size: int, test_cfg: Union[DictConfig, ListConfig]):
def gen_random_batch(batch_size: int,
test_cfg: Union[DictConfig, ListConfig],
inputs: Optional[List[str]] = None):
# inputs can be [], ['input_ids'], ['input_ids', 'inputs_embeds'], and ['inputs_embeds']
# default to only input ids
if inputs == None:
inputs = ['input_ids']
# generate input batch of random data, suitable for a Causal or Prefix LM
batch = {}
batch['input_ids'] = torch.randint(
low=0,
high=test_cfg.model.vocab_size,
size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
for inp in inputs:
if inp == 'input_ids':
batch['input_ids'] = torch.randint(
low=0,
high=test_cfg.model.vocab_size,
size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
if inp == 'inputs_embeds':
batch['inputs_embeds'] = torch.randn(
batch_size, test_cfg.max_seq_len,
test_cfg.model.d_model).to(test_cfg.device)

batch['labels'] = torch.randint(low=0,
high=test_cfg.model.vocab_size,
size=(batch_size, test_cfg.max_seq_len)).to(
Expand Down Expand Up @@ -150,6 +163,34 @@ def test_full_forward_and_backward(batch_size: int = 2):
assert not torch.equal(original_params, updated_params)


def test_full_forward_and_backward_with_inputs_embeds(batch_size: int = 2):
test_cfg, model, optimizer = get_objs(
conf_path='scripts/train/yamls/pretrain/testing.yaml')

batch = gen_random_batch(batch_size, test_cfg, inputs=['inputs_embeds'])

model.train()
original_params = next(model.parameters()).clone().data
outputs = model(batch)
loss = model.loss(outputs, batch)
loss.backward()
optimizer.step()
updated_params = next(model.parameters()).clone().data
assert not torch.equal(original_params, updated_params)


@pytest.mark.parametrize('inputs', [[], ['input_ids', 'inputs_embeds']])
def test_invalid_inputs_embeds_input_ids_combinations(inputs: List[str]):
test_cfg, model, _ = get_objs(
conf_path='scripts/train/yamls/pretrain/testing.yaml')

batch = gen_random_batch(2, test_cfg, inputs=inputs)

model.train()
with pytest.raises(ValueError):
_ = model(batch)


def test_attention_mechanism(batch_size: int = 2):
test_cfg, model, _ = get_objs(
conf_path='scripts/train/yamls/pretrain/testing.yaml')
Expand Down Expand Up @@ -938,6 +979,9 @@ def test_generate(attention_impl: str, precision: str, pos_emb_config: dict,
no_padding_attention_mask = composer_device.tensor_to_device(
no_padding_attention_mask)

# inputs_embeds
inputs_embeds = composer_device.tensor_to_device(torch.randn(2, 3, 128))

# a single batch with different amounts of left padding in the input
batched_input_ids = torch.tensor([[50256, 50256, 50256, 11274, 16390, 11],
[50256, 50256, 16, 11274, 16390, 11]])
Expand Down Expand Up @@ -973,6 +1017,29 @@ def test_generate(attention_impl: str, precision: str, pos_emb_config: dict,
assert generation_with_no_padding[:, 3:].equal(
generation_with_left_padding[:, 6:])

# check that both/neither ids and embeds do not error
# note that we need to set the BOS token ID for generating from neither
_ = mpt.generate(input_ids=no_padding_input_ids,
inputs_embeds=inputs_embeds,
attention_mask=no_padding_attention_mask,
max_new_tokens=5,
use_cache=False)
_ = mpt.generate(input_ids=no_padding_input_ids,
inputs_embeds=inputs_embeds,
attention_mask=no_padding_attention_mask,
max_new_tokens=5,
use_cache=True)
_ = mpt.generate(input_ids=None,
inputs_embeds=None,
max_new_tokens=5,
use_cache=False,
bos_token_id=50256)
_ = mpt.generate(input_ids=None,
inputs_embeds=None,
max_new_tokens=5,
use_cache=True,
bos_token_id=50256)


@pytest.mark.gpu
@pytest.mark.parametrize('world_size', [1, 2])
Expand Down

0 comments on commit 8339cd3

Please sign in to comment.