Support inputs_embeds #687

Merged: 13 commits, Dec 1, 2023
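
For context, here is a minimal sketch of what this change enables, written against the in-repo MPT classes rather than taken from the PR; the tiny config values and the attn_impl override are illustrative assumptions, not requirements:

import torch
from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM

# Tiny illustrative config (real configs come from the training YAMLs).
# attn_impl='torch' is assumed only to keep the sketch CPU-friendly.
config = MPTConfig(d_model=128, n_heads=4, n_layers=2, max_seq_len=64,
                   vocab_size=100, attn_config={'attn_impl': 'torch'})
model = MPTForCausalLM(config)

input_ids = torch.randint(0, config.vocab_size, (1, 8))
# Build embeddings with the model's own token embedding table (wte).
inputs_embeds = model.transformer.wte(input_ids)

# Before this PR, passing inputs_embeds raised NotImplementedError;
# now embeddings can stand in for token ids.
outputs = model(inputs_embeds=inputs_embeds)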
52 changes: 30 additions & 22 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -293,7 +293,7 @@ def _apply_sequence_id(self, attn_bias: torch.Tensor,

def forward(
self,
input_ids: torch.LongTensor,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
attention_mask: Optional[torch.ByteTensor] = None,
prefix_mask: Optional[torch.ByteTensor] = None,
@@ -337,11 +337,6 @@ def forward(
'prefix_mask is a required argument when MPT is configured with prefix_lm=True.'
)

# Raise a not implemented error if input_embeds is not None (this is an arg in huggingface transformers and we need to support it for PEFT)
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds is not implemented for MPT.')

if self.training:
if self.attn_uses_sequence_id and sequence_id is None:
raise ValueError(
@@ -355,13 +350,24 @@ def forward(
'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.'
)

S = input_ids.size(1)
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
'You cannot specify both input_ids and inputs_embeds.')
elif input_ids is not None:
S = input_ids.size(1)
tok_emb = self.wte(input_ids)
input_device = input_ids.device
elif inputs_embeds is not None:
S = inputs_embeds.size(1)
tok_emb = inputs_embeds
input_device = inputs_embeds.device
else:
raise ValueError('You must specify input_ids or inputs_embeds')

assert (
S <= self.config.max_seq_len
), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'

tok_emb = self.wte(input_ids)
if self.learned_pos_emb:
past_position = 0
if past_key_values is not None:
@@ -388,7 +394,7 @@ def forward(
past_position,
S + past_position,
dtype=torch.long,
device=input_ids.device,
device=input_device,
).unsqueeze(0)
if attention_mask is not None:
# adjust the position indices to account for padding tokens
@@ -554,13 +560,12 @@ def forward(
use_cache = (use_cache
if use_cache is not None else self.config.use_cache)

# if input_embeds is not none, raise a not implemented error
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds has to be None (for hf/peft support).')
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
'You cannot specify both input_ids and inputs_embeds.')
outputs = self.transformer(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
attention_mask=attention_mask,
prefix_mask=prefix_mask,
@@ -628,10 +633,6 @@ def prepare_inputs_for_generation(
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: Any,
) -> Dict[str, Any]:
if inputs_embeds is not None:
raise NotImplementedError(
'inputs_embeds is not implemented for MPT yet')

attention_mask = kwargs['attention_mask'].bool()
if attention_mask[:, -1].sum() != attention_mask.shape[0]:
raise NotImplementedError(
@@ -642,6 +643,7 @@
else:
sequence_id = None

# only last token for inputs_ids if past is defined in kwargs
if past_key_values is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)

@@ -655,14 +657,20 @@
else:
prefix_mask = None

return {
'input_ids': input_ids,
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {'inputs_embeds': inputs_embeds}
else:
model_inputs = {'input_ids': input_ids}

model_inputs.update({
'attention_mask': attention_mask,
'prefix_mask': prefix_mask,
'sequence_id': sequence_id,
'past_key_values': past_key_values,
'use_cache': kwargs.get('use_cache', True),
}
})
return model_inputs

@staticmethod
def _reorder_cache(
@@ -753,7 +761,7 @@ def forward(self, batch: MutableMapping) -> CausalLMOutputWithPast:
add_bidirectional_mask_if_missing(batch)
# Note: prefix_mask is only used if model.prefix_lm is True
return self.model(
input_ids=batch['input_ids'],
input_ids=batch.get('input_ids', None),
attention_mask=batch.get('attention_mask', None),
prefix_mask=batch.get('bidirectional_mask', None),
sequence_id=batch.get('sequence_id', None),
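
The NotImplementedError branches removed above were the blocker for the Hugging Face/PEFT use case mentioned in the old comment; below is a rough sketch of the prompt-tuning pattern that relies on inputs_embeds (this wrapper and its names are hypothetical, not part of this PR or of the PEFT library):

import torch
import torch.nn as nn

class SoftPromptWrapper(nn.Module):
    # Prepends learned 'virtual token' embeddings to the real token embeddings and
    # feeds the result through inputs_embeds, which MPT now accepts.
    def __init__(self, model: nn.Module, num_virtual_tokens: int = 8):
        super().__init__()
        self.model = model
        d_model = model.config.d_model
        self.soft_prompt = nn.Parameter(torch.randn(num_virtual_tokens, d_model) * 0.02)

    def forward(self, input_ids: torch.LongTensor):
        tok_emb = self.model.transformer.wte(input_ids)      # (batch, seq, d_model)
        prompt = self.soft_prompt.unsqueeze(0).expand(input_ids.size(0), -1, -1)
        inputs_embeds = torch.cat([prompt, tok_emb], dim=1)  # (batch, prompt + seq, d_model)
        return self.model(inputs_embeds=inputs_embeds)

Note that during generation, prepare_inputs_for_generation only forwards inputs_embeds on the first step; later steps fall back to the freshly generated input_ids, mirroring the upstream transformers behavior.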
33 changes: 28 additions & 5 deletions tests/test_model.py
@@ -97,13 +97,20 @@ def get_objs(conf_path: str = 'scripts/train/yamls/pretrain/testing.yaml'):
return test_cfg, model, optimizer


def gen_random_batch(batch_size: int, test_cfg: Union[DictConfig, ListConfig]):
def gen_random_batch(batch_size: int,
test_cfg: Union[DictConfig, ListConfig],
inputs_embeds: bool = False):
# generate input batch of random data, suitable for a Causal or Prefix LM
batch = {}
batch['input_ids'] = torch.randint(
low=0,
high=test_cfg.model.vocab_size,
size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
if inputs_embeds:
batch['inputs_embeds'] = torch.randn(batch_size, test_cfg.max_seq_len,
test_cfg.model.d_model).to(
test_cfg.device)
else:
batch['input_ids'] = torch.randint(
low=0,
high=test_cfg.model.vocab_size,
size=(batch_size, test_cfg.max_seq_len)).to(test_cfg.device)
batch['labels'] = torch.randint(low=0,
high=test_cfg.model.vocab_size,
size=(batch_size, test_cfg.max_seq_len)).to(
@@ -153,6 +160,22 @@ def test_full_forward_and_backward(batch_size: int = 2):
assert not torch.equal(original_params, updated_params)


def test_full_forward_and_backward_with_inputs_embeds(batch_size: int = 2):
test_cfg, model, optimizer = get_objs(
conf_path='scripts/train/yamls/pretrain/testing.yaml')

batch = gen_random_batch(batch_size, test_cfg, inputs_embeds=True)

model.train()
original_params = next(model.parameters()).clone().data
outputs = model(batch)
loss = model.loss(outputs, batch)
loss.backward()
optimizer.step()
updated_params = next(model.parameters()).clone().data
assert not torch.equal(original_params, updated_params)


def test_attention_mechanism(batch_size: int = 2):
test_cfg, model, _ = get_objs(
conf_path='scripts/train/yamls/pretrain/testing.yaml')
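
The new test exercises the embeddings path end to end (forward, loss, backward, optimizer step). Assuming pytest is available in the repository environment, it can be run in isolation with:

pytest tests/test_model.py -k test_full_forward_and_backward_with_inputs_embeds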