diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py
index 8c278b7e17..b6c0685960 100644
--- a/llmfoundry/data/text_data.py
+++ b/llmfoundry/data/text_data.py
@@ -210,15 +210,17 @@ def _read_binary_tokenized_sample(
         self,
         sample: Dict[str, Any],
     ) -> torch.Tensor:
+        # Modeling code still expects int64 tensors.
         if isinstance(sample['tokens'], np.ndarray):
-            return torch.from_numpy(sample['tokens'][:self.max_seq_len].copy())
+            return torch.from_numpy(sample['tokens'][:self.max_seq_len].copy()
+                                    ).to(torch.int64)
         else:
             return torch.from_numpy(
                 np.frombuffer(
                     sample['tokens'],
                     dtype=getattr(np, self.token_encoding_type),
                 )[:self.max_seq_len].copy(),
-            )
+            ).to(torch.int64)

     # How to process a sample
     def __getitem__(self,
diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py
index 40881b3735..1ce249437d 100644
--- a/llmfoundry/eval/datasets/utils.py
+++ b/llmfoundry/eval/datasets/utils.py
@@ -6,7 +6,7 @@
 import logging
 import random
-from typing import Any, Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Set

 import torch
 import transformers

@@ -272,7 +272,7 @@ def __init__(

     def __call__(
         self,
-        input_ids: Union[torch.LongTensor, torch.IntTensor],
+        input_ids: torch.LongTensor,
         scores: Optional[torch.FloatTensor] = None,
         **kwargs: Dict[str, Any],
     ) -> bool:
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index a6a7d659ac..9d18799e93 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -277,7 +277,7 @@ def gen_flash_attn_padding_info(

 def apply_sequence_id(
     attn_bias: torch.Tensor,
-    sequence_id: Union[torch.LongTensor, torch.IntTensor],
+    sequence_id: torch.LongTensor,
     max_seq_len: int,
 ) -> torch.Tensor:
     seq_len = sequence_id.shape[-1]
@@ -470,7 +470,7 @@ def _attn_bias(
         device: torch.device,
         dtype: torch.dtype,
         attention_mask: Optional[torch.ByteTensor] = None,
-        sequence_id: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
     ) -> Tuple[Optional[torch.Tensor], Optional[torch.ByteTensor]]:
         if not self._attn_bias_initialized:
             if self.attn_bias_shape:
@@ -533,10 +533,10 @@ def _attn_bias(

     def forward(
         self,
-        input_ids: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
+        input_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
         attention_mask: Optional[torch.ByteTensor] = None,
-        sequence_id: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
@@ -877,11 +877,11 @@ def get_decoder(self) -> MPTModel:

     def forward(
         self,
-        input_ids: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
+        input_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
         attention_mask: Optional[torch.ByteTensor] = None,
-        sequence_id: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
-        labels: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
@@ -1056,7 +1056,7 @@ def prepare_inputs_for_generation(

     @staticmethod
     def _reorder_cache(
         past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
-        beam_idx: Union[torch.LongTensor, torch.IntTensor],
+        beam_idx: torch.LongTensor,
     ) -> List[Tuple[torch.Tensor, ...]]:
         """Used by HuggingFace generate when using beam search with kv-caching.
diff --git a/scripts/inference/hf_chat.py b/scripts/inference/hf_chat.py
index dc9776ee46..e992371c32 100644
--- a/scripts/inference/hf_chat.py
+++ b/scripts/inference/hf_chat.py
@@ -87,7 +87,7 @@ class StopOnTokens(StoppingCriteria):

     def __call__(
         self,
-        input_ids: Union[torch.LongTensor, torch.IntTensor],
+        input_ids: torch.LongTensor,
         scores: torch.FloatTensor,
         **kwargs: Any,
     ) -> bool:
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
index fee3e53c8b..ec27df8121 100644
--- a/tests/data/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -270,8 +270,7 @@ def test_correct_padding(

     batch = next(iter(eval_loader))
     assert batch['input_ids'].shape == torch.Size([batch_size, 2048])
-    assert batch['input_ids'].type(
-    ) == 'torch.LongTensor' or batch['input_ids'].type() == 'torch.IntTensor'
+    assert batch['input_ids'].type() == 'torch.LongTensor'

     # we follow the convention (from huggingface) that non-attended tokens are 0 in the attn mask and -100 in the labels
     attention_mask = batch.get(
diff --git a/tests/models/test_mpt_gen.py b/tests/models/test_mpt_gen.py
index 6cca704bb2..1c9b5ef9d4 100644
--- a/tests/models/test_mpt_gen.py
+++ b/tests/models/test_mpt_gen.py
@@ -1,7 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0

-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple
 from unittest.mock import Mock, patch

 import pytest
@@ -27,11 +27,11 @@ class MockMPTForCausalLM(MPTForCausalLM):

     def forward(
         self,
-        input_ids: Union[torch.LongTensor, torch.IntTensor],
+        input_ids: torch.LongTensor,
         past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
         attention_mask: Optional[torch.ByteTensor] = None,
-        sequence_id: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
-        labels: Optional[Union[torch.LongTensor, torch.IntTensor]] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
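
For reference, a minimal sketch (not part of the diff) of why the text_data.py change casts to int64: torch.from_numpy preserves the numpy dtype, so tokens stored with a narrower encoding come back as a non-LongTensor unless explicitly converted, which is what the added .to(torch.int64) guarantees and what the tightened torch.LongTensor annotations elsewhere assume. The token values and the int32 encoding below are made up for illustration.

import numpy as np
import torch

# Hypothetical token ids, serialized the way a narrow token_encoding_type
# (assumed here to be 'int32') would store them in the binary sample.
max_seq_len = 4
raw_bytes = np.array([101, 7592, 2088, 102], dtype=np.int32).tobytes()

# Without the cast, the dtype of the stored tokens leaks through.
tokens = torch.from_numpy(
    np.frombuffer(raw_bytes, dtype=np.int32)[:max_seq_len].copy(),
)
print(tokens.dtype)  # torch.int32 -- not the int64 the modeling code expects

# The explicit cast added in the diff restores a LongTensor.
tokens = tokens.to(torch.int64)
print(tokens.dtype)   # torch.int64
print(tokens.type())  # 'torch.LongTensor', matching the updated dataloader test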