From cd18e74238855073f7fc63769d40c8ad3be14457 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 23 Jan 2024 14:03:40 -0500
Subject: [PATCH 01/59] start

---
 .../in_context_learning_evaluation.py         | 1282 +++++++++++
 llmfoundry/eval/metrics/nlp.py                |  360 +++
 llmfoundry/utils/builders.py                  |    2 +-
 .../eval/test_in_context_learning_datasets.py | 2007 +++++++++++++++++
 tests/eval/test_nlp_metrics.py                |  146 ++
 5 files changed, 3796 insertions(+), 1 deletion(-)
 create mode 100644 llmfoundry/eval/datasets/in_context_learning_evaluation.py
 create mode 100644 llmfoundry/eval/metrics/nlp.py
 create mode 100644 tests/eval/test_in_context_learning_datasets.py
 create mode 100644 tests/eval/test_nlp_metrics.py

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
new file mode 100644
index 0000000000..bcc7996189
--- /dev/null
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -0,0 +1,1282 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+# This code is based on the implementation in https://github.com/EleutherAI/lm-evaluation-harness/blob/8c048e266a22a1c85ccbdb0c209ac712e4f39989/lm_eval/base.py#L221-L330
+
+from __future__ import annotations
+
+import copy
+import json
+import os
+import random
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
+
+import torch
+from torch.utils.data import DataLoader
+
+from composer.core import DataSpec
+from composer.core.data_spec import _default_split_batch, _split_list
+from composer.datasets.utils import stop_sequences_criteria
+from composer.utils import MissingConditionalImportError, dist, get_file
+from composer.datasets import InContextLearningDataset
+if TYPE_CHECKING:
+    import transformers
+    from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+
+# Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset
+_MAX_ANSWER_BUFFER_LENGTH = 10
+
+__all__ = [
+    'InContextLearningLMTaskDataset',
+    'InContextLearningMultipleChoiceTaskDataset',
+    'InContextLearningCodeEvalDataset',
+    'InContextLearningQATaskDataset',
+    'get_icl_task_dataloader',
+]
+
+
+def strip_data(example: Dict) -> Dict:
+    """
+    Remove white space from the begging and end of string values in a dictionary
+
+    Args:
+        example: Dictionary to be stripped
+
+    Returns:
+        dict: The same dictionary with .strip() applied to any value in the dict that is a string
+    """
+    return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()}
+
+
+def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -> List:
+    """
+    Trims a list of tokens down to `max_seq_len` if the length of the list plus the continuation
+    is more than `max_seq_len`. It will always trim tokens from the left, i.e. tokens at the beginning
+    of the context will be removed.
+
+    Args:
+        context_enc (list): List of tokens in the context
+        continuation_enc (lsit): List of tokens in the continuation
+        max_seq_len (int): Maximum length the model can ingest
+
+    Returns:
+        list: The encoded context trimmed from the left
+    """
+    if len(continuation_enc) + len(context_enc) > max_seq_len:
+        context_max_subseq_len = max_seq_len - len(continuation_enc)
+
+        if context_max_subseq_len < 0:
+            # can't support continuations which are longer than the max seq len
+            raise Exception(f'Dataset included continuation longer than the max seq len')
+
+        # clip from the end
+        context_enc = context_enc[-(context_max_subseq_len):]
+    return context_enc
+
+
+def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.Tensor:
+    """
+    Gets the list of indices of the continuation tokens for language modeling or generation tasks.
+
+    Args:
+        context_enc (list): List of context tokens
+        continuation_enc (list): List of continuation tokens
+
+    Returns:
+        torch.tensor: A tensor containing indices corresponding to continuation tokens
+    """
+    return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc)))
+
+
+def _make_padded_input(context_enc: List,
+                       continuation_enc: List,
+                       max_seq_len: int,
+                       pad_tok_id: int,
+                       padding_side: str = 'right') -> torch.Tensor:
+    """
+    Takes an encoded context and continuation and clips the beginning of the context if they're too long.
+    Adds the padding token to the specified side.
+
+    Args:
+        context_enc (List): The encoded input to the model
+        continuation_enc (List): The encoded desired output for the example
+        max_seq_list (int): Maximum length sequences can be
+        pad_tok_id (int): The token id we pad with
+        padding_side (str): Which side to pad the context on. Can be 'right' or 'left
+
+    Returns:
+        input (torch.tensor): The padded and encoded context
+        continuation_span (torch.tensor): The _inclusive_ range of indices corresponding to the continuation
+    """
+
+    inp = torch.tensor(
+        (context_enc + continuation_enc),
+        dtype=torch.long,
+    )
+    (inp_len,) = inp.shape
+
+    # Sometimes tokenizers that have neither a pad_tok_id or eos_tok_id will pass None in as the padding
+    # token and cause errors
+    if not isinstance(pad_tok_id, int):
+        raise ValueError(f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead')
+    # pad length from seq to padding_length
+    if padding_side == 'right':
+        inp = torch.cat(
+            [
+                inp,  # [seq]
+                torch.LongTensor((max_seq_len - inp_len) * [pad_tok_id]),
+            ],
+            dim=0,
+        )
+    elif padding_side == 'left':
+        inp = torch.cat(
+            [
+                torch.LongTensor((max_seq_len - inp_len) * [pad_tok_id]),
+                inp,  # [seq]
+            ],
+            dim=0,
+        )
+    else:
+        raise ValueError(f"Unknown padding_side {padding_side}. padding_side must be either 'left' or 'right'")
+
+    return inp
+
+
+def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, Any]:
+    """
+    HF Datasets converts tensors into lists when we store them, and we don't want to use `type='torch'`
+    because some content in the dataset, like generation args or single ints, should not be converted.
+
+    Here, we convert those lists of tokens back into tensors in order to feed them into the model.
+
+    Args:
+        batch (dict): A dictionary of batched inputs
+        tokenize_labels (bool): Whether or not the labels are tokenized (and need to be stacked)
+
+    Returns:
+        dict: The batch with torch tensors in the corresponding keys instead of lists of lists
+    """
+    batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids'])))
+    if tokenize_labels:
+        batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels'])))
+        batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices']))
+    return batch
+
+
+def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> Set[int]:
+    """
+    Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset,
+    then we will have fewer than num_fewshot examples in context.
+    Args:
+        dataset_size (int): Length of the dataset
+        num_fewshot (int): Number of examples to prepend
+        example_idx (int): Current example's index (excluded from fewshot choices)
+        rng (random.Random): RNG for repeatable sample selection
+
+    Returns:
+        list: Indices of the examples chosen for fewshot selection
+    """
+    num_fewshot = min(dataset_size - 1, num_fewshot)
+    fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot))
+
+    if example_idx in fewshot_idxs:
+        fewshot_idxs.remove(example_idx)
+        if len(fewshot_idxs) >= dataset_size - 1:
+            return fewshot_idxs
+
+        replacement_sample = rng.choice(range(0, dataset_size))
+        while replacement_sample in fewshot_idxs or replacement_sample == example_idx:
+            replacement_sample = rng.choice(range(0, dataset_size))
+        fewshot_idxs.add(replacement_sample)
+    return fewshot_idxs
+
+class InContextLearningQATaskDataset(InContextLearningDataset):
+    """
+    A dataset that constructs batches for in-context learning question answering evaluation.
+    QA tasks evaluate a model's ability to answer questions using a consistent format.
+
+    The input format is expected to be a jsonl file with the following fields:
+    - context: The question
+    - answer: The preferred answer to the question
+    - aliases: A list of aliases for the answer
+
+    See InContextLearningDataset for more details.
+
+    Additional Args:
+        cot_delimiter (str): Delimiter to place between the chain of thought and continuations.
+    """
+
+    def __init__(self,
+                 cot_delimiter: str = '',
+                 early_stopping_criteria: Optional[List[str]] = None,
+                 do_normalization: bool = True,
+                 *args,
+                 **kwargs):
+        if kwargs['tokenizer'].eos_token_id is None:
+            raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`')
+        self.cot_delimiter = cot_delimiter
+        self.has_cot = False
+        self.max_answer_length = 0
+        static_keys = [
+            'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria'
+        ]
+        tensor_keys = ['input_ids', 'attention_mask']
+        list_keys = ['labels']
+        super().__init__(padding_side='left',
+                         tokenize_labels=False,
+                         static_keys=static_keys,
+                         list_keys=list_keys,
+                         tensor_keys=tensor_keys,
+                         *args,
+                         **kwargs)
+        # NOTE: set these after init call because they take class vars
+        self.early_stopping_criteria = early_stopping_criteria
+        self.base_batch = {
+            'input_ids': [],
+            'mode': 'generate',
+            'labels': [],
+            'cot_delimiter': self.cot_delimiter,
+            'generation_length': self.max_answer_length,
+            'stopping_criteria': early_stopping_criteria,
+            'do_normalization': do_normalization,
+            'generation_kwargs': {
+                'pad_token_id': self.pad_tok_id,
+                'use_cache': True,
+                'eos_token_id': self.tokenizer.eos_token_id,
+            }
+        }
+        self.batch_mapping = {
+            'input_ids': self.context_key,
+            'labels': 'aliases',
+        }
+        self.update_generation_kwargs(kwargs.get('generation_kwargs', {}))
+
+    def read_dataset(
+        self,
+        dataset_uri: str,
+        destination_path: str,
+        hf_loading_vars: Dict,
+        hf_parsing_map: Dict,
+    ) -> 'HFDataset':
+        dataset = super().read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map)
+        self.has_cot = 'chain_of_thought' in dataset.features
+        dataset = dataset.map(
+            lambda examples: {
+                'context': examples['context'],
+                'answer': examples['answer'],
+                'aliases': set([examples['answer']] + examples.get('aliases', [])),
+                'chain_of_thought': examples.get('chain_of_thought', ''),
+            })
+        self.max_answer_length = self._get_max_answer_length(dataset)
+        # NOTE: This is the only time we use the class variable padding_size.
+        self.padding_size = self.max_seq_len - self.max_answer_length
+        return dataset
+
+    def get_answer_from_example(self, example: Dict, in_context=False) -> str:
+        """
+        Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true.
+        Args:
+            example (Dict): The example from which to retrieve the answer
+
+        Returns:
+            str: The answer in from the example with chain of thought and delimiter if needed
+        """
+        if self.has_cot:
+            return f'{example["chain_of_thought"]}{self.cot_delimiter}{example[self.answer_key]}'
+        else:
+            return example[self.answer_key]
+
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+        """
+        Run text through the tokenizer and handle special cases.
+        Args:
+            prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
+            ctx (str): The specific example's derrived context
+            example (Dict): The example as a dictionary.
+
+        Returns:
+            Dict: Dictionary with the tokenized data
+        """
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example['aliases'] = list(example.get('aliases', []))
+        return tokenized_example
+
+    def _get_max_answer_length(self, dataset) -> int:
+        f"""
+        Loops over the dataset and finds the longest answer length.
+
+        Returns:
+            int: The maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present
+        """
+        max_answer_length = 0
+        for example in dataset:
+            all_answers = [example[self.answer_key]] + list(example.get('aliases', []))
+            for answer in all_answers:
+                if self.has_cot:
+                    response = (f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}')
+                else:
+                    response = answer
+                tokenized_repsonse = self.tokenizer(response)['input_ids']
+                assert isinstance(tokenized_repsonse, list)
+                max_answer_length = max(max_answer_length, len(tokenized_repsonse))
+        max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0)
+        return max_answer_length
+
+    def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        batch = super().collate_fn(data)
+        batch_size = batch['input_ids'].shape[0]
+        stopping_criteria = None
+        if self.early_stopping_criteria:
+            if stop_sequences_criteria is None:  # pyright: ignore [reportUnnecessaryComparison]
+                raise MissingConditionalImportError(extra_deps_group='nlp',
+                                                    conda_package='transformers',
+                                                    conda_channel='conda-forge')
+            stopping_criteria = stop_sequences_criteria(self.tokenizer, self.early_stopping_criteria, batch_size)
+        batch['generation_kwargs']['stopping_criteria'] = stopping_criteria
+        return batch
+
+
+class InContextLearningLMTaskDataset(InContextLearningDataset):
+    """
+    A dataset that constructs batches for in-context learning language modeling evaluation.
+    Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens.
+
+    The input format is expected to be a jsonl file with the following fields:
+    - context: Preceding text
+    - continuation: The expected continuation
+
+    See InContextLearningDataset for more details.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(answer_key='continuation',
+                         static_keys=['mode'],
+                         tensor_keys=['input_ids', 'continuation_indices', 'labels', 'attention_mask'],
+                         base_batch={
+                             'input_ids': [],
+                             'continuation_indices': [],
+                             'mode': 'icl_task',
+                             'labels': []
+                         },
+                         batch_mapping={
+                             'input_ids': 'context',
+                             'labels': 'context'
+                         },
+                         padding_side='right',
+                         *args,
+                         **kwargs)
+
+
+class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset):
+    """
+    A dataset that construct batches for in-context learning multiple choice evaluation.
+
+    If each question has N answer choices, we construct N distinct inputs per question. In order to ensure
+    consistency across multi-GPU, we set the batch size to be `min(N, batch_size)` so that all N
+    inputs per question can stored in the same batch.
+
+    The default input format is a jsonl file with the following fields:
+    - query: The preceding text, question, or document relevant to the choices
+    - gold: Index of the correct choice under 'choices'
+    - choices: A list of strings, each being one of the potential choices
+
+    Each batch then consists of ``|batch_size // N|`` distinct questions and has the following the structure.
+    - input_ids: Input tensor ``|batch x seqlen x # tokens|``
+    - continuation_indices: List of ``|batch|`` consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation)
+    - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics
+    - labels: Identical to the input, used by the model to calculate loss/metrics
+    - gold_indices: List of length ``|batch_size // N|`` indicating for each question, which of the answers is correct (via an integer [0, N-1])
+    - choice_groupings: Indicates which indices of the batch correspond to which questions
+
+    Additional Args:
+        choices_key (str): The key under which the choices are stored in the saved dataset. Defaults to 'choices'.
+    """
+
+    def __init__(self,
+                 choices_key: str = 'choices',
+                 static_keys: Optional[List] = None,
+                 list_of_tensors_keys: Optional[List] = None,
+                 list_of_tuples_keys: Optional[List] = None,
+                 list_of_primitives: Optional[List] = None,
+                 *args,
+                 **kwargs):
+        self.choices_key = choices_key
+        base_batch = {
+            'input_ids': [],
+            'continuation_indices': [],
+            'mode': 'icl_task',
+            'labels': [],
+            'gold_indices': [],
+            'choice_groupings': [],
+        }
+        context_key = kwargs.pop('context_key', 'query')
+        static_keys = kwargs.pop('static_keys', ['mode', 'generation_kwargs'])
+        tensor_keys = kwargs.pop('tensor_keys', ['input_ids', 'labels', 'attention_mask'])
+        self.list_of_tensors_keys = list_of_tensors_keys or ['continuation_indices']
+        self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings']
+        self.list_of_primitives = list_of_primitives or ['gold_indices']
+        super().__init__(context_key=context_key,
+                         base_batch=base_batch,
+                         static_keys=static_keys,
+                         tensor_keys=tensor_keys,
+                         padding_side='right',
+                         *args,
+                         **kwargs)
+        self.num_choices = len(self.dataset[0][self.choices_key])
+        self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'}
+        self.batch_map_per_example = {'gold_indices': 'gold'}
+
+    def get_answer_from_example(self, example: Dict, in_context=False) -> str:
+        """
+        Returns the correct answer from the example's choices.
+        Args:
+            example (Dict): The example from which to retrieve the answer
+
+        Returns:
+            str: The full string of the correct answer based on the 'gold' key
+        """
+        choices = example[self.choices_key]
+        gold_idx = example['gold']
+        return choices[gold_idx]
+
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+        """
+        Runs text through the tokenizer and handle special cases.
+        Args:
+            prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
+            ctx (str): The specific example's derrived context
+            example (Dict): The example as a dictionary.
+
+        Returns:
+            Dict: Dictionary with the tokenized data
+        """
+        # NOTE: some of this is repeated from super class but for loop makes things considerably different
+        tokenized_example = {}
+        # Always add special tokens to preamble
+        preamble = self.tokenizer(prompt_and_fewshot)['input_ids']
+        assert isinstance(preamble, list)
+        preamble = self._fix_eos_on_preamble(preamble)
+        if self.strip_data:
+            # rstrip context because a prompt ending in a space results in degenerate output
+            ctxt = ctxt.rstrip()
+        # Never add special tokens to context
+        tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids']
+        assert isinstance(tokenized_context, list)
+        tokenized_context = preamble + tokenized_context
+
+        tokenized_example[self.context_key] = []
+        tokenized_example[self.answer_key] = []
+        tokenized_example['continuation_indices'] = []
+        # NOTE: Treating tokenize_labels as True for all MC datasets (required for our MC accuracy metric)
+        for choice in example[self.choices_key]:
+            if self.prefix_space:
+                choice = f' {choice}' if not choice.startswith(' ') else choice
+
+            # Never add special tokens to answer
+            tokenized_answer = self.tokenizer(choice, add_special_tokens=False)['input_ids']
+            assert isinstance(tokenized_context, list)
+            assert isinstance(tokenized_answer, list)
+            trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size)
+            assert isinstance(trimmed_context, list)
+            continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer)
+            padded_context = _make_padded_input(
+                trimmed_context,
+                tokenized_answer,
+                self.padding_size,
+                self.pad_tok_id,
+                self.padding_side,
+            )
+
+            tokenized_example[self.context_key].append(padded_context)
+            tokenized_example[self.answer_key].append(tokenized_answer)
+            tokenized_example['continuation_indices'].append(continuation_indices)
+
+        tokenized_example['gold'] = example['gold']
+        return tokenized_example
+
+    def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        The function that the dataloader uses to accumulate data into batches.
+        We run each distinct query + answer choice through the model separately and determine which
+        answer has the lowest per-token-perplexity.
+
+        If each question has N possible choices, all N must be grouped together as distinct elements of the batch
+        since the batch may consist of multiple questions, the choice_groupings indicates
+        which contiguous sequences of elements in the batch correspond to which question
+        gold_indices indicates which of the [0, N-1] choices is the correct one for each question.
+        Args:
+            data (List): List of tokenized datapoints (dicts returned by self._tokenize_example)
+
+        Returns:
+            Dict: Dictionary for a single batch
+        """
+        batch = copy.deepcopy(self.base_batch)
+        for data_pair in data:
+            choice_start_idx = len(batch['continuation_indices'])
+            # NOTE: not using batch_mapping
+            for i, context_enc in enumerate(data_pair[self.context_key]):
+                batch['input_ids'].append(context_enc)
+                batch['continuation_indices'].append(data_pair['continuation_indices'][i])
+                batch['labels'].append(context_enc)
+
+            batch['gold_indices'].append(data_pair['gold'])
+            choice_end_idx = len(batch['continuation_indices'])
+            batch['choice_groupings'].append((choice_start_idx, choice_end_idx))
+
+        batch = convert_tokens_to_tensors(batch, self.tokenize_labels)
+        batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
+        return batch
+
+    def get_num_samples_in_batch(self, batch) -> int:
+        return batch['input_ids'].shape[0] // self.num_choices
+
+    def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
+        """
+        Split batch while ensuring all continuations are in the same microbatch.
+
+        In ICL Multiple Choice, we duplicate each data point for each possible continuation.
+        When splitting a batch, we have logical example, which refer to one possible question,
+        and real example, which refers to one possible continuation. As example count and
+        microbatch_size are tracked in logical example, we split logical attributes by
+        microbatch_size and real attributes by microbatch_size * num_choices.
+        Args:
+            batch (Dict): Batch of data
+            microbatch_size (int): Size of microbatches
+
+        Returns:
+            list: List of chunked batches
+        """
+        chunked = {}
+        for k, v in batch.items():
+            if k in self.static_keys:
+                # Defer broadcasting primitives until we know num_chunks
+                pass
+            elif type(v) == list:
+                # list of tensors - 'continuation_indices'
+                if k in self.list_of_tensors_keys:
+                    chunked[k] = _split_list(v, microbatch_size * self.num_choices)
+                # list of tuples - 'choice_groupings'
+                elif k in self.list_of_tuples_keys:
+                    chunked[k] = _split_list(v, microbatch_size)
+                # list - 'gold_indices'
+                elif k in self.list_of_primitives:
+                    chunked[k] = _default_split_batch(v, microbatch_size)
+                else:
+                    raise ValueError(f'Unexpected key {k} in list splitting')
+            elif k in self.tensor_keys:
+                chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices)
+            else:
+                raise ValueError(f'Unexpected key {k} in batch splitting')
+        num_chunks = len(chunked['input_ids'])
+        # Broadcast primitives to all chunks
+        for k, v in batch.items():
+            if k in self.static_keys:
+                chunked[k] = [v] * num_chunks
+
+        return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)]
+
+
+class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset):
+    """A dataset that constructs batches for in-context learning schema evaluation.
+    A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word
+    to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847
+    to determine the model's choice of fill-in word.
+
+    The default input format is a jsonl file with the following fields:
+    - context_options: List of strings corresponding to possible preceding context options for the continuation
+    - gold: Index of the correct context from 'context_options'
+    - continuation: The finishing continuation
+
+    Each batch then consists of ``batch_size // N`` distinct tasks and has the following the structure
+    - input_ids: Input tensor ``batch x seqlen x # of tokens``
+    - continuation_indices: List of ``batch`` consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation)
+    - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics
+    - labels: Identical to the input, used by the model to calculate loss/metrics
+    - gold_indices: List of length ``batch_size // N`` indicating for each question, which of the answers is correct (via an integer [0, N-1])
+    - choice_groupings: Indicates which indices of the batch correspond to which questions
+
+    """
+
+    def __init__(self, choices_key='context_options', *args, **kwargs):
+        static_keys = ['mode']
+        tensor_keys = ['input_ids', 'labels', 'attention_mask']
+        list_of_tensors_keys = ['continuation_indices']
+        super().__init__(choices_key=choices_key,
+                         context_key=choices_key,
+                         static_keys=static_keys,
+                         tensor_keys=tensor_keys,
+                         list_of_tensors_keys=list_of_tensors_keys,
+                         *args,
+                         **kwargs)
+        self.base_batch = {
+            'input_ids': [],
+            'continuation_indices': [],
+            'mode': 'icl_task',
+            'labels': [],
+            'gold_indices': [],
+            'choice_groupings': [],
+        }
+
+    def construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str:
+        """
+        Takes a example and constructs a context with the correct context for the example's continuation.
+
+        Args:
+            example (Dict): The example from which to construct the context
+            preceding_text (str): Any preceding text, needed to if self.example_delimiter is needed at the beginning
+            add_answer (bool): This will always be true when calling this function for SchemaTaskDataset
+
+        Returns:
+            str: The single correct context for a given continuation
+        """
+        context_options = example[self.choices_key]
+        gold_idx = example['gold']
+        continuation = example['continuation']
+        context = context_options[gold_idx]
+        if len(preceding_text) > 0:
+            context = f'{self.example_delimiter}{context}'
+        context = f'{context}{self.continuation_delimiter}{continuation}'
+        return context
+
+    def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') -> List[str]:
+        """
+        Takes a example and constructs all contexts. Optionally, appends this to preceeding text (such as a
+        prompt or fewshot examples).
+
+        Args:
+            example (Dict): The example from which to construct the context
+            preceding_text (str): Any preceding text, needed to if self.example_delimiter is needed at the beginning
+
+        Returns:
+            list: All context options for the selected example with formatting
+        """
+        context_options = example[self.choices_key]
+        if len(preceding_text) > 0:
+            if self.strip_data:
+                cont_del = self.continuation_delimiter.rstrip()
+            else:
+                cont_del = self.continuation_delimiter
+            context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options]
+        return context_options
+
+    def _prep_example(
+        self,
+        example: Dict,
+        example_idx: int,
+        num_fewshot: int,
+        prompt_string: str,
+        fewshot_rng: random.Random,
+    ) -> Dict[str, Any]:
+        """
+        Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples.
+
+        Each task consists of multiple contexts and a single, correct continuation. Will preprend fewshot examples and
+        prompt if present.
+
+        Args:
+            example (Dict): A dictionary from the hf dataset
+            example_idx (int): The index of example
+            num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair
+            prompt_string (str): The prompt to prepend to all inputs
+            fewshot_rng (random.Random): Random number generator to use for fewshot sampling
+
+        Returns:
+            Dict: Contains a dictionary with the tokenized data
+        """
+        prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng)
+        ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot)
+        tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example)
+        return tokenized_example
+
+    def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]:
+        """
+        Runs text through the tokenizer and handle special cases.
+
+        Args:
+            prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
+            ctx (str): The specific example's derrived context
+            example (Dict): The example as a dictionary.
+
+        Returns:
+            Dict: Dictionary with the tokenized data
+        """
+        tokenized_example = {}
+        preamble = self.tokenizer(prompt_and_fewshot)['input_ids']
+        assert isinstance(preamble, list)
+        preamble = self._fix_eos_on_preamble(preamble)
+        encoded_contexts = [
+            preamble +  # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues]
+            self.tokenizer(c, add_special_tokens=False)['input_ids']  # pyright: ignore[reportOperatorIssue, ]
+            for c in context_options
+        ]
+        continuation = example['continuation']
+        if self.prefix_space:
+            continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation)
+        tokenized_continuation = self.tokenizer(continuation, add_special_tokens=False)['input_ids']
+
+        tokenized_example[self.context_key] = []
+        tokenized_example['continuation_indices'] = []
+        tokenized_example[self.answer_key] = []
+        for context in encoded_contexts:
+            assert isinstance(context, list)
+            assert isinstance(tokenized_continuation, list)
+            trimmed_context = _trim_context(context, tokenized_continuation, self.padding_size)
+            assert isinstance(trimmed_context, list)
+            continuation_indices = _get_continuation_span(trimmed_context, tokenized_continuation)
+            padded_context = _make_padded_input(trimmed_context, tokenized_continuation, self.padding_size,
+                                                self.pad_tok_id, self.padding_side)
+            tokenized_example[self.context_key].append(padded_context)
+            tokenized_example['continuation_indices'].append(continuation_indices)
+            tokenized_example[self.answer_key].append(tokenized_continuation)
+
+        tokenized_example['gold'] = example['gold']
+        return tokenized_example
+
+
+class InContextLearningCodeEvalDataset(InContextLearningDataset):
+    """
+    A dataset that constructs batches for in-context learning code evaluation.
+
+    The input format is expected to be a jsonl file with the following fields:
+
+    - task_id: Label of given task
+    - prompt: The code snippet that must be completed
+    - entry_point: The entry to the function/code snippet to generate
+    - canonical_solution: Working solution
+    - test: The checker code that will run to completion if the code generation is valid and otherwise throw assertion
+    - test_inputs: List of test inputs
+    - test_outputs: List of test outputs
+    - language: The language of the code snippet
+
+    Each batch then consists of the following the structure
+
+    - input_ids: Input tensor batch x seqlen x num tokens
+    - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics
+    - mode: Always set to 'generate'
+    - labels: Exact solution for the coding problem
+    - prompts: Prompt for the task
+    - entry_points: List of entry points
+    - test_inputs: List of test inputs
+    - test_outputs: List of test outputs
+    - languages:  List of languages
+    - pass_at_k: Passed value for pass_at_k
+    - generation_length: Derrived maximum generation length
+    - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten
+      by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
+      for more details):
+
+        - pad_token_id: ID for padding token, derived automatically
+        - num_beams: How many beams to search for generations, set to 1
+        - num_return_sequences: Value passed for 'generations_per_sample', how many generations per prompt
+        - do_sample: Determines whether model is sampling or greedily decoding. Always set to True
+        - use_cache: Whether or not to use past key values to speed up sampling. Always set to True
+
+    Additional Args:
+        generations_per_sample (int) (defaults to 1): The number of independently computed returned sequences for each element in the batch
+        pass_at_k (int) (defaults to 1): k for how many chances the model gets to write passing code
+    """
+
+    def __init__(
+        self,
+        generations_per_sample: int,
+        pass_at_k: int = 1,
+        *args,
+        **kwargs,
+    ):
+        if generations_per_sample < pass_at_k:
+            raise ValueError(
+                f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.'
+            )
+        batch_mapping = {
+            'input_ids': 'prompt',
+            'prompts': 'prompt_text',
+            'tests': 'test',
+            'labels': 'canonical_solution',
+            'entry_points': 'entry_point',
+            'test_inputs': 'test_inputs',
+            'test_outputs': 'test_outputs',
+            'languages': 'language'
+        }
+        # Linting complains if these are not set in init
+        self.max_prompt_length = 0
+        self.max_answer_length = 0
+        static_keys = ['mode', 'pass_at_k', 'generation_length', 'generation_kwargs']
+        list_keys = ['prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'languages', 'labels']
+        tensor_keys = ['input_ids', 'attention_mask']
+        super().__init__(
+            context_key='prompt',
+            answer_key='canonical_solution',
+            strip_dataset=False,
+            static_keys=static_keys,
+            list_keys=list_keys,
+            tensor_keys=tensor_keys,
+            tokenize_labels=False,
+            padding_side='left',
+            batch_mapping=batch_mapping,
+            *args,
+            **kwargs,
+        )
+        self._set_max_prompt_and_answer_lengths()
+        self.dataset = self.dataset.map(self._trim_padding)
+        self.base_batch = {
+            'input_ids': [],
+            'mode': 'generate',
+            'labels': [],
+            'prompts': [],
+            'tests': [],
+            'entry_points': [],
+            'test_inputs': [],
+            'test_outputs': [],
+            'languages': [],
+            'pass_at_k': pass_at_k,
+            'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length),
+            'generation_kwargs': {
+                'pad_token_id': self.pad_tok_id,
+                'num_beams': 1,  # single beam
+                'num_return_sequences': generations_per_sample,
+                'do_sample': True,
+                'use_cache': True,
+                'eos_token_id': self.tokenizer.eos_token_id
+            }
+        }
+        self.update_generation_kwargs(kwargs.get('generation_kwargs', {}))
+
+    def _set_max_prompt_and_answer_lengths(self):
+        """
+        Iterates through the dataset and finds the maximum prompt length and sequence lengths
+
+        Returns:
+            None
+        """
+        max_prompt_length = 0
+        max_answer_length = 0
+        for example in self.dataset:
+            assert isinstance(example, Dict)
+            unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id]
+            max_prompt_length = max(max_prompt_length, len(unpadded_example))
+
+            tokenized_answer = self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids']
+            assert isinstance(tokenized_answer, list)
+            len_tokenized_answer = len(tokenized_answer)
+            max_answer_length = max(max_answer_length, len_tokenized_answer)
+
+        self.max_prompt_length = max_prompt_length
+        self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH
+
+    def _trim_padding(self, example: Dict):
+        """
+        Adjusts padding to the maximum prompt length rather than max_seq_len.
+        Needs to be done after the dataset has been processed because we don't know the maximum
+        prompt length until after we've tokenized it.
+
+        Returns:
+            dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key]
+        """
+        # Remove padding tokens applied during tokenization
+        unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id]
+        # Reapply padding only to max_prompt_length
+        full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length)
+        padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side)
+
+        example[self.context_key] = padded_context
+        return example
+
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+        """
+        Adds extra code task details to the example dictionary.
+        See InContextLearningDataset for more details
+        """
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example['prompt_text'] = example['prompt']
+        tokenized_example['task_id'] = example['task_id']
+        tokenized_example['canonical_solution'] = example['canonical_solution']
+        tokenized_example['test'] = example['test']
+        tokenized_example['entry_point'] = example['entry_point']
+        tokenized_example['test_inputs'] = example['test_inputs']
+        tokenized_example['test_outputs'] = example['test_outputs']
+        tokenized_example['language'] = example['language']
+        return tokenized_example
+
+
+def build_icl_dataloader(
+        icl_task_type: str,
+        dataset_uri: str,
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        batch_size: int,
+        max_seq_len: int,
+        pad_tok_id: int,
+        num_fewshot: int,
+        prompt_string: str,  # e.g. 'translate english to french:'
+        example_delimiter: str,  # e.g. '\n'
+        continuation_delimiter: str,  # e.g. ''
+        hf_loading_vars: Dict,
+        hf_parsing_map: Dict,
+        destination_path: str,
+        prelimiter: str,  # e.g. 'Question: '
+        cot_delimiter: str,  # e.g. ' ### '
+        fewshot_random_seed: int,
+        pass_at_k: int,
+        generations_per_sample: int,
+        generation_kwargs: Dict,
+        early_stopping_criteria: Optional[List[str]] = None,
+        do_normalization: bool = True) -> DataSpec:
+    """
+    Factory method that builds the specific dataset for the specified icl_task_type.
+    See documentation for `get_icl_task_dataloader` for arugment documentation.
+
+    When writing a dataset for a new task, here you will need to:
+        1. add the dataset to the factory and choose an appropriate string
+        2. set the batch size for that task (see InContextLearningMultipleChoiceTaskDataset for why
+            this might be different)
+        3. set the `split_batch` funciton if necessary
+    """
+    if icl_task_type == 'multiple_choice':
+        dataset = InContextLearningMultipleChoiceTaskDataset(
+            dataset_uri=dataset_uri,
+            tokenizer=tokenizer,
+            max_seq_len=max_seq_len,
+            pad_tok_id=pad_tok_id,
+            num_fewshot=num_fewshot,
+            prompt_string=prompt_string,
+            example_delimiter=example_delimiter,
+            continuation_delimiter=continuation_delimiter,
+            destination_path=destination_path,
+            fewshot_random_seed=fewshot_random_seed,
+            hf_loading_vars=hf_loading_vars,
+            hf_parsing_map=hf_parsing_map,
+            generation_kwargs=generation_kwargs,
+        )
+        batch_size = max(dataset.num_choices, batch_size)
+        effective_batchsize = batch_size // dataset.num_choices
+    elif icl_task_type == 'schema':
+        dataset = InContextLearningSchemaTaskDataset(
+            dataset_uri=dataset_uri,
+            tokenizer=tokenizer,
+            max_seq_len=max_seq_len,
+            pad_tok_id=pad_tok_id,
+            num_fewshot=num_fewshot,
+            prompt_string=prompt_string,
+            example_delimiter=example_delimiter,
+            continuation_delimiter=continuation_delimiter,
+            destination_path=destination_path,
+            fewshot_random_seed=fewshot_random_seed,
+            hf_loading_vars=hf_loading_vars,
+            hf_parsing_map=hf_parsing_map,
+            generation_kwargs=generation_kwargs,
+        )
+        batch_size = max(dataset.num_choices, batch_size)
+        effective_batchsize = batch_size // dataset.num_choices
+    elif icl_task_type == 'language_modeling':
+        dataset = InContextLearningLMTaskDataset(
+            dataset_uri=dataset_uri,
+            tokenizer=tokenizer,
+            max_seq_len=max_seq_len,
+            pad_tok_id=pad_tok_id,
+            num_fewshot=num_fewshot,
+            prompt_string=prompt_string,
+            example_delimiter=example_delimiter,
+            continuation_delimiter=continuation_delimiter,
+            destination_path=destination_path,
+            fewshot_random_seed=fewshot_random_seed,
+            hf_loading_vars=hf_loading_vars,
+            hf_parsing_map=hf_parsing_map,
+            generation_kwargs=generation_kwargs,
+        )
+        effective_batchsize = batch_size
+    elif icl_task_type == 'question_answering':
+        dataset = InContextLearningQATaskDataset(
+            dataset_uri=dataset_uri,
+            tokenizer=tokenizer,
+            max_seq_len=max_seq_len,
+            pad_tok_id=pad_tok_id,
+            num_fewshot=num_fewshot,
+            prompt_string=prompt_string,
+            example_delimiter=example_delimiter,
+            continuation_delimiter=continuation_delimiter,
+            destination_path=destination_path,
+            prelimiter=prelimiter,
+            fewshot_random_seed=fewshot_random_seed,
+            hf_loading_vars=hf_loading_vars,
+            hf_parsing_map=hf_parsing_map,
+            cot_delimiter=cot_delimiter,
+            early_stopping_criteria=early_stopping_criteria,
+            do_normalization=do_normalization,
+            generation_kwargs=generation_kwargs,
+        )
+        effective_batchsize = batch_size
+    elif icl_task_type == 'code_evaluation':
+        dataset = InContextLearningCodeEvalDataset(
+            dataset_uri=dataset_uri,
+            tokenizer=tokenizer,
+            max_seq_len=max_seq_len,
+            pad_tok_id=pad_tok_id,
+            num_fewshot=num_fewshot,
+            prompt_string=prompt_string,
+            example_delimiter=example_delimiter,
+            continuation_delimiter=continuation_delimiter,
+            destination_path=destination_path,
+            prelimiter=prelimiter,
+            fewshot_random_seed=fewshot_random_seed,
+            hf_loading_vars=hf_loading_vars,
+            hf_parsing_map=hf_parsing_map,
+            pass_at_k=pass_at_k,
+            generations_per_sample=generations_per_sample,
+            generation_kwargs=generation_kwargs,
+        )
+        effective_batchsize = batch_size
+    else:
+        raise Exception(f'Unrecognized ICL task type: {icl_task_type}')
+
+    sampler = dist.get_sampler(dataset, drop_last=False, shuffle=False)
+
+    split_batch = None
+    if isinstance(
+            dataset,
+        (
+            InContextLearningMultipleChoiceTaskDataset,
+            InContextLearningQATaskDataset,
+            InContextLearningCodeEvalDataset,
+        ),
+    ):
+        split_batch = dataset.split_batch
+
+    return DataSpec(
+        DataLoader(
+            dataset,
+            batch_size=effective_batchsize,
+            sampler=sampler,
+            collate_fn=dataset.collate_fn,
+        ),
+        device_transforms=None,
+        get_num_samples_in_batch=dataset.get_num_samples_in_batch,
+        split_batch=split_batch,
+    )
+
+
+def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: Dict,
+                                  hf_parsing_map: Dict) -> Dict[str, str]:
+    """
+    If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file.
+
+    Args:
+        dataset_uri (str): Location of dataset.
+        destination_path (str): Base destination path, we will write a separate partition off this URI for each category.
+
+    Raises:
+        MissingConditionalImportError: If datasets not installed raise exception.
+        Exception: If 'category' key missing from dataset, raise exception.
+    Returns:
+        Dict[str, str]: Mapping of category names to partitioned dataset local files names.
+    """
+    try:
+        from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import IterableDataset, load_dataset  # pyright: ignore[reportGeneralTypeIssues]
+    except ImportError as e:
+        raise MissingConditionalImportError(
+            extra_deps_group='nlp',
+            conda_package='datasets',
+            conda_channel='conda-forge',
+        ) from e
+    if dataset_uri.startswith('hf://'):
+        dataset_uri = dataset_uri.replace('hf://', '')
+        dataset = load_dataset(dataset_uri, **hf_loading_vars)
+        assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset)
+        if hf_parsing_map:
+            dataset_parsing_func = lambda example: {
+                k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items()
+            }
+            assert hasattr(dataset, 'column_names')
+            dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names)
+    else:
+        with dist.local_rank_zero_download_and_wait(destination_path):
+            if dist.get_local_rank() == 0:
+                get_file(dataset_uri, destination_path, overwrite=True)
+        dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False)
+    assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset)
+    assert hasattr(dataset, 'features')
+    assert dataset.features is not None
+    if 'category' not in dataset.features.keys():
+        raise Exception(f"""Attempted to partition dataset by `category` \
+            but it doesn't have a `category` key. \
+            Got keys: {str(list(dataset.features.keys()))}""")
+    categories = sorted(set(dataset['category']))  # pyright: ignore[reportIndexIssue, reportGeneralTypeIssues]
+    output_files = {}
+    for cat in categories:
+        path = destination_path.split('/')
+        cat_dest = '/'.join(path[:-1]) + f'/{cat}_{path[-1]}'
+        tmp_path_to_broadcast = str(os.path.abspath(cat_dest))
+        gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+        if dist.get_local_rank() == 0:
+            subset = [
+                l for l in dataset if l['category'] == cat  # pyright: ignore[reportGeneralTypeIssues]
+            ]  # pyright: ignore[reportArgumentType, reportCallIssue]
+            with open(gathered_paths[0], 'w', encoding='utf8') as f:
+                for l in subset:
+                    f.write(json.dumps(l, ensure_ascii=False) + '\n')
+        output_files[cat] = cat_dest
+    return output_files
+
+
+def get_icl_task_dataloader(
+        icl_task_type: str,
+        dataset_uri: str,
+        tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast],
+        batch_size: int,
+        max_seq_len: int,
+        pad_tok_id: int,
+        num_fewshot: int,
+        prompt_string: str,  # e.g. 'translate english to french:'
+        example_delimiter: str,  # e.g. '\n'
+        continuation_delimiter: str = '',
+        destination_path: str = '',
+        question_prelimiter: str = '',  # e.g. 'Question: '
+        fewshot_random_seed: int = 1234,
+        pass_at_k: int = 1,
+        generations_per_sample: int = 1,
+        cot_delimiter: str = '',
+        has_categories: bool = False,
+        hf_loading_vars: Optional[Dict] = None,
+        hf_parsing_map: Optional[Dict] = None,
+        generation_kwargs: Optional[Dict] = None,
+        early_stopping_criteria: Optional[List[str]] = None,
+        do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]:
+    """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below:
+
+        .. testsetup::
+
+            import transformers
+            from composer.models import HuggingFaceModel
+            from composer.trainer import Trainer
+            dataset_uri = "/tmp/dataset_uri.jsonl"
+            dataset = RandomTextClassificationDataset(size=16, use_keys=True)
+            train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
+            hf_model, tokenizer = HuggingFaceModel.hf_from_composer_checkpoint('composer-hf-checkpoint.pt')
+            # At this point, hf_model is randomly initialized
+            composer_model = HuggingFaceModel(hf_model, hf_tokenizer)
+
+        Example:
+
+        .. testcode::
+
+
+            dl = get_icl_task_dataloader(
+                'language_modeling',
+                dataset_uri,
+                tokenizer,
+                batch_size=2,
+                max_seq_len=2048,
+                pad_tok_id=tokenizer.pad_token_id,
+                num_fewshot=10,
+                prompt_string='translate english to french',
+                example_delimiter='\\n',
+                continuation_delimiter=''
+                )
+            eval_evaluator = Evaluator(
+                    label="lambada",
+                    dataloader=dl,
+                    metric_names=['InContextLearningLMAccuracy']
+                )
+            trainer = Trainer(
+                    model=model,
+                    train_dataloader=train_dataloader,
+                    eval_dataloader=eval_evaluator,
+                    optimizers=optimizer,
+                    max_duration="1ep",
+                )
+
+    Args:
+        icl_task_type (str): Name of icl_task type. One of ['multiple_choice', 'schema', 'language_modeling', 'question_answering', 'code_evaluation']
+        dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``.
+            Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
+            A local dataset must consist of rows of JSON data points with task dependant fields.
+            The default keys expected are "context" and "answer".
+        tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids.
+        batch_size (int): Size of a batch used for eval
+        max_seq_len (int): The maximum sequence length supported by the model.
+        pad_tok_id (int): The special token used for padding batches.
+        num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples.
+        prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.').
+        example_delimiter (str, default = '\\n'): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting.
+        continuation_delimiter: (str, default = ' '): Separator inserted between context and answer in each example (e.g. '\\nA: ').
+        destination_path: (str, default = ''): This is the local file where remote datasets will be saved.
+        question_prelimiter: (str, default = ''): Text to be prepended before each context, including few shot examples (e.g. "Question: ").
+        fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling
+        pass_at_k (int): k for how many chances the model gets to write passing code.
+        generations_per_sample (int): How many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict.
+        cot_delimiter (str): Delimiter to place between chain of thoughts and continuations.
+        has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data.
+        hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
+        hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
+            Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset.
+        generation_kwargs (Dict, default = None): A dictionary containing keyword arguments to be passed along to the model's generate function. Overwrites any previously specified generation
+                                                  keyword args in this fucntion (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
+                                                  for more details)
+        early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time.
+            Used in QA tasks with CoT
+        do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningQAAccuracy. Only used in QA tasks.
+
+    Returns:
+        DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
+    """
+    if hf_loading_vars is None:
+        hf_loading_vars = {}
+    if hf_parsing_map is None:
+        hf_parsing_map = {}
+    if generation_kwargs is None:
+        generation_kwargs = {}
+    if early_stopping_criteria is None:
+        early_stopping_criteria = []
+
+    if has_categories:
+        result_dls = {}
+        output_files = partition_dataset_by_category(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map)
+        categories = sorted(output_files.keys())
+        for category in categories:
+            partition_uri = output_files[category]
+            result_dls[category] = build_icl_dataloader(
+                icl_task_type=icl_task_type,
+                dataset_uri=partition_uri,
+                tokenizer=tokenizer,
+                batch_size=batch_size,
+                max_seq_len=max_seq_len,
+                pad_tok_id=pad_tok_id,
+                num_fewshot=num_fewshot,
+                prompt_string=prompt_string,
+                example_delimiter=example_delimiter,
+                continuation_delimiter=continuation_delimiter,
+                destination_path=partition_uri + '_tmp',
+                prelimiter=question_prelimiter,
+                cot_delimiter=cot_delimiter,
+                fewshot_random_seed=fewshot_random_seed,
+                pass_at_k=pass_at_k,
+                generations_per_sample=generations_per_sample,
+                hf_loading_vars=hf_loading_vars,
+                hf_parsing_map=hf_parsing_map,
+                generation_kwargs=generation_kwargs,
+                early_stopping_criteria=early_stopping_criteria,
+                do_normalization=do_normalization,
+            )
+        return result_dls
+    else:
+        return build_icl_dataloader(
+            icl_task_type=icl_task_type,
+            dataset_uri=dataset_uri,
+            tokenizer=tokenizer,
+            batch_size=batch_size,
+            max_seq_len=max_seq_len,
+            pad_tok_id=pad_tok_id,
+            num_fewshot=num_fewshot,
+            prompt_string=prompt_string,
+            example_delimiter=example_delimiter,
+            hf_loading_vars=hf_loading_vars,
+            hf_parsing_map=hf_parsing_map,
+            continuation_delimiter=continuation_delimiter,
+            destination_path=destination_path,
+            prelimiter=question_prelimiter,
+            cot_delimiter=cot_delimiter,
+            fewshot_random_seed=fewshot_random_seed,
+            pass_at_k=pass_at_k,
+            generations_per_sample=generations_per_sample,
+            generation_kwargs=generation_kwargs,
+            early_stopping_criteria=early_stopping_criteria,
+            do_normalization=do_normalization,
+        )
\ No newline at end of file
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
new file mode 100644
index 0000000000..bef7d2f3c0
--- /dev/null
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -0,0 +1,360 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""A collection of common torchmetrics for NLP tasks."""
+
+import logging
+import os
+import re
+import string
+import warnings
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import torch
+from torch import Tensor
+from torch.nn import functional as F
+from composer.metrics.nlp import InContextLearningMetric
+from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient, MosaicMLLambdaEvalClient
+
+log = logging.getLogger(__name__)
+
+__all__ = [
+    'InContextLearningLMAccuracy',
+    'InContextLearningMultipleChoiceAccuracy',
+    'InContextLearningQAAccuracy',
+    'InContextLearningCodeEvalAccuracy',
+    'BinaryF1Score',
+    'LanguageCrossEntropy',
+    'MaskedAccuracy',
+    'LanguagePerplexity',
+    'InContextLearningLMExpectedCalibrationError',
+    'InContextLearningMCExpectedCalibrationError',
+]
+
+
+class InContextLearningQAAccuracy(InContextLearningMetric):
+    r"""Computes accuracy for In-context learning (ICL) question answering (QA) tasks.
+
+    ICL QA tasks consist of some number of example question answering tasks (referred to as the 'context'), followed by a test task where the model must
+    match one of the possible answer aliases (referred to as the 'continuation').
+
+    For example, the model may be provided the context below and evaluated on its ability to correctly predict the continuation.
+
+    Context: `Question: Who was president of the United States in 2012?\nAnswer: Barack Obama\nQuestion: Is water wet?\nAnswer: `
+    Continuation: [`yes`, `no`]
+
+    Both predictions and answers will be normalized before comparison.
+
+    Adds metric state variables:
+        correct (float): The number of instances where the prediction was a prefix for any of the answer aliases.
+        total (float): The number of total instances that were predicted.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(self, dist_sync_on_step: bool = False):
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+
+    def normalize_answer(self, answer: str):
+        """Lower text and remove punctuation, articles and extra whitespace.
+
+        Copied from https://github.com/mandarjoshi90/triviaqa/blob/master/evaluation/triviaqa_evaluation.py
+        """
+
+        def remove_articles(text: str) -> str:
+            return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+        def white_space_fix(text: str) -> str:
+            return ' '.join(text.split())
+
+        def handle_punc(text: str) -> str:
+            exclude = set(string.punctuation + ''.join([u'‘', u'’', u'´', u'`']))
+            return ''.join(ch if ch not in exclude else ' ' for ch in text)
+
+        def lower(text: str) -> str:
+            return text.lower()
+
+        def replace_underscore(text: str) -> str:
+            return text.replace('_', ' ')
+
+        return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(answer))))).strip()
+
+    def update(self, batch: Optional[Dict[str, Any]], outputs: List[str], labels: List[List[str]]):
+        cot_delimiter = batch.get('cot_delimiter', '')
+        do_normalization = batch.get('do_normalization', True)
+        stopping_criteria = batch.get('stopping_criteria', None)
+        for sample_output, sample_labels in zip(outputs, labels):
+            final_answer = sample_output
+
+            if stopping_criteria is not None and len(stopping_criteria) > 0:
+                final_answer = re.split('|'.join(stopping_criteria), final_answer)[0]
+
+            if cot_delimiter is not None and len(cot_delimiter) > 0:
+                final_answer = final_answer.split(cot_delimiter)[-1]
+
+            if do_normalization:
+                cleaned_final_answer = self.normalize_answer(final_answer)
+                cleaned_sample_labels = {self.normalize_answer(label) for label in sample_labels}
+            else:
+                cleaned_final_answer = final_answer
+                cleaned_sample_labels = set(sample_labels)
+
+            if any(cleaned_final_answer.startswith(label) for label in cleaned_sample_labels):
+                self.correct += torch.tensor(1.0)
+            self.total += torch.tensor(1.0)
+
+    def compute(self):
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct / self.total
+
+
+class InContextLearningLMAccuracy(InContextLearningMetric):
+    r"""Computes accuracy for In-context learning (ICL) language modeling (LM) tasks.
+
+    ICL LM tasks consist of some number of example language modeling tasks (referred to as the 'context'), followed by a test task where the model must correctly predict all the tokens
+    following tokens in some passage (referred to as the 'continuation').
+
+    For example, the model may be provided the context below and evaluated on its ability to correctly predict the continuation. Note: it doesn't matter
+    whether the model correctly predicts the context tokens.
+
+    Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->`
+    Continuation: `green`
+
+    Adds metric state variables:
+        correct (float): The number of instances where the prediction masked the target.
+        total (float): The number of total instances that were predicted.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(self, dist_sync_on_step: bool = False):
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+
+    def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor):
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1)
+            cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1)
+
+            self.correct += (cont_tok_pred == cont_tok_targ).all().int()
+            self.total += torch.tensor(1.0)
+
+    def compute(self):
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct / self.total
+
+
+class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
+    r"""Computes accuracy for In-context learning (ICL) multiple choice (MC) tasks.
+
+    ICL MC tasks consists of a series of questions with some number of possible choices (only one of which can be correct).
+    At inference time each possible choice is given to the model as a separate input and the one for which the model assigns
+    the lowest perplexity to the choice is considered the model's choice. The model is correct if it "chooses" the right answer.
+
+    Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->`
+    Continuation: `green`
+
+    Adds metric state variables:
+        correct (float): The number of instances where the prediction masked the target.
+        total (float): The number of total instances that were predicted.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(self, dist_sync_on_step: bool = False):
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state('correct', default=torch.tensor(0.0), dist_reduce_fx='sum')
+        self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
+
+    def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor):
+        perplexities = []
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            # continuation indices refer to indices in the original input's token space
+            cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1)
+            # labels have been shifted left by one index, so the cont_idx needs to be shifted as well.
+            cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1)
+            cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ)
+            perplexity = torch.exp(cross_entropy)
+            perplexities.append(perplexity)
+
+        for (start, end), gold_idx in zip(batch['choice_groupings'], batch['gold_indices']):
+            subset = perplexities[start:end]
+            idx_min = subset.index(min(subset))
+
+            if idx_min == gold_idx:
+                self.correct += torch.tensor(1.0)
+            self.total += torch.tensor(1.0)
+
+    def compute(self):
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct.float() / self.total
+
+
+class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
+    r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
+
+    ICL code eval tasks consist of some number of example code eval tasks (referred to as the 'context'), followed by a test task where the model must
+    complete the code, where we term the code completion a 'continuation'.
+
+    In each case, the model constructs a given number of continuations (termed pass@K for K continuations), and each continuation is run against a set of test cases. The model is considered
+    correct if at least one of the proposed continuations passes all the test cases.
+
+    Runs on AWS Lambdas by default.
+
+    Adds metric state variables:
+        correct (float): The number of instances where the predictions passed all the test cases.
+        total (float): The number of total instances that were predicted.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(self, dist_sync_on_step: bool = False):
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+
+        self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
+        if self.eval_device is not None:
+            self.eval_device = self.eval_device.upper()
+
+    def get_client(self) -> EvalClient:
+        """Returns a client for the appropriate remote platform."""
+        client = None
+        if self.eval_device == 'LOCAL':
+            warnings.warn(
+                'Running code eval locally may be insecure. Please set environment variable CODE_EVAL_DEVICE '
+                'to LAMBDA to run on remote. To use Lambdas, spin up your instance that checks code, set the URL as '
+                'CODE_EVAL_URL and the API key as CODE_EVAL_APIKEY.')
+            log.debug('Running code eval locally.')
+            client = LocalEvalClient()
+        elif self.eval_device == 'LAMBDA':
+            client = LambdaEvalClient()
+        elif self.eval_device == 'MOSAICML':
+            client = MosaicMLLambdaEvalClient()
+        elif self.eval_device is None:
+            raise ValueError(
+                'Attempting to use InContextLearningCodeEvalAccuracy but environment '
+                'variable `CODE_EVAL_DEVICE` is not set. Please set it to `CODE_EVAL_DEVICE` '
+                'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
+                'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
+        else:
+            raise ValueError('Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
+                             f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
+
+        return client
+
+    def estimator(self, n: int, c: int, k: int) -> float:
+        """Computes the pass@k metric.
+
+        Given the number of generated samples, n, the number of correct samples, c, and the k of interest,
+        this function calculates pass@k as 1 - comb(n - c, k) / comb(n, k) as per the definition of
+        pass@k in the HumanEval paper (https://arxiv.org/abs/2107.03374) and it's associated implementation:
+        https://github.com/openai/human-eval.
+        """
+        if n - c < k:
+            return 1.0
+        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
+
+    def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
+        """Updates the pass@k accuracy of code generation.
+
+        Given a batch of prompts, test cases, and code generations, evaluates the code generations
+        against the test cases and augments the pass@k accuracy of the batch to the values so far.
+
+        Args:
+            batch (Dict[str, Any]): A batch of data produced by the InContextLearningCodeEvalDataset, with
+            the prompt, test cases, and entry points. This will be a dictionary that must have the following
+            arguments:
+            {
+                'prompts': List[str],
+                'test_inputs': List[List[str]],
+                'test_outputs': List[List[str]],
+                'entry_points': List[str],
+                'languages': List[str],
+                'generation_kwargs': Dict[str, Any]
+            }
+            outputs (List[str]): A list of code generations in the format of HF generate with beam search,
+            which is the a list of strings in groups of beam_size e.g. for beam size 2 and batch size 2, the list
+            will be of the format [prompt 1 gen 1, prompt 1 gen 2, prompt 2 gen 1, prompt 2 gen 2]
+            labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate
+            functionalities. This is not used.
+        """
+        del labels  # never used
+        client = self.get_client()
+
+        pass_at_k = batch['pass_at_k']
+        num_generations = batch['generation_kwargs']['num_return_sequences']
+        processed_outputs = [
+            outputs[i * num_generations:(i + 1) * num_generations] for i in range(len(batch['prompts']))
+        ]
+        payloads = []
+        for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
+                processed_outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'], batch['entry_points'],
+                batch['languages']):
+            self.total += torch.tensor(1.0)
+            prompt_payload = []
+            for code_gen in sample_outputs:
+                code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0]  # remove everything after function ends
+                final_code = sample_prompt + code_gen  # combine prompt with the code generation
+                generation_payload = []
+                for test_input, test_output in zip(test_inputs, test_outputs):
+                    payload = {
+                        'code': final_code,
+                        'input': test_input,
+                        'output': test_output,
+                        'entry_point': entry_point,
+                        'language': language,
+                    }
+                    generation_payload.append(payload)
+
+                prompt_payload.append(generation_payload)
+            payloads.append(prompt_payload)
+
+        results = client.invoke(payloads)
+        for prompt in results:
+            num_correct = 0
+            for generation in prompt:
+                correct = all(generation)
+                if correct:
+                    num_correct += 1
+
+            pass_at_k_rate = self.estimator(num_generations, num_correct, pass_at_k)
+            self.correct += torch.tensor(pass_at_k_rate)
+
+        client.close()  # pyright: ignore [reportOptionalMemberAccess]
+
+    def compute(self):
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct / self.total
\ No newline at end of file
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 75438b895e..ad4b3851db 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -15,7 +15,7 @@
                                 MemoryMonitor, OptimizerMonitor,
                                 RuntimeEstimator, SpeedMonitor)
 from composer.core import Algorithm, Callback, Evaluator
-from composer.datasets.in_context_learning_evaluation import \
+from llmfoundry.eval.datasets.in_context_learning_evaluation import \
     get_icl_task_dataloader
 from composer.loggers import (InMemoryLogger, LoggerDestination, MLFlowLogger,
                               TensorboardLogger, WandBLogger)
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
new file mode 100644
index 0000000000..556cf43dec
--- /dev/null
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -0,0 +1,2007 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import types
+from pathlib import Path
+
+import pytest
+import torch
+from torch.utils.data import DataLoader
+
+from composer import Evaluator
+from composer.core import DataSpec
+
+# isort: off
+from llmfoundry.eval.datasets.in_context_learning_evaluation import (
+    InContextLearningCodeEvalDataset,
+    InContextLearningMultipleChoiceTaskDataset,
+    InContextLearningQATaskDataset,
+    InContextLearningSchemaTaskDataset,
+   
+    get_icl_task_dataloader,
+)
+# isort: on
+from composer.loggers import InMemoryLogger
+from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+                              InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+from composer.models import HuggingFaceModel
+from composer.trainer import Trainer
+from composer.utils import dist, reproducibility
+from tests.common import device, world_size
+
+
+
+
+def test_qa_set_cot_no_cot(tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')  # type: ignore reportUnboundVariable
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+    assert not dl.has_cot
+
+
+def test_qa_set_cot_has_cot(tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/gsm8k_small.jsonl'
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')  # type: ignore reportUnboundVariable
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+    assert dl.has_cot
+
+
+def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+    tokenizer = tiny_gpt2_tokenizer
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='',
+        continuation_delimiter='',
+        cot_delimiter='',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+    # empirical number from the small test dataset
+    assert dl.max_answer_length == 7
+
+
+def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tiny_gpt2_tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tiny_gpt2_tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        cot_delimiter=' ### ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+    answer = dl.get_answer_from_example({
+        'context': 'empty',
+        'answer': 'this is the correct answer',
+        'chain_of_thought': "Let's think step by step. "
+    })
+    assert answer == 'this is the correct answer'
+
+
+def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tiny_gpt2_tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tiny_gpt2_tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        cot_delimiter=' ### ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+    dl.has_cot = True
+    answer = dl.get_answer_from_example({
+        'context': 'empty',
+        'answer': 'this is the correct answer',
+        'chain_of_thought': "Let's think step by step. "
+    })
+    assert answer == "Let's think step by step.  ### this is the correct answer"
+
+
+def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tiny_gpt2_tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tiny_gpt2_tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        cot_delimiter=' ### ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+    dl.has_cot = True
+    tokenized_example = dl.tokenize_example(
+        'starting prompt', 'a context', {
+            'context': 'empty',
+            'answer': 'this is the correct answer',
+            'aliases': ['this is the right answer', 'this is the best answer'],
+            'chain_of_thought': "Let's think step by step. "
+        })
+    assert 'aliases' in tokenized_example
+    assert tokenized_example['aliases'] == ['this is the right answer', 'this is the best answer']
+
+
+def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/human_eval_small.jsonl'
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000}
+
+    dl = InContextLearningCodeEvalDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Code start:',
+        continuation_delimiter='\nPlease code:',
+        destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+        generation_kwargs=gen_kwargs,
+        generations_per_sample=10,
+    )
+
+    assert all(len(data['prompt']) == 148 for data in dl.dataset)  # pyright: ignore [reportGeneralTypeIssues]
+
+
+def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/human_eval_small.jsonl'
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000}
+
+    dl = InContextLearningCodeEvalDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Code start:',
+        continuation_delimiter='\nPlease code:',
+        destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+        generation_kwargs=gen_kwargs,
+        generations_per_sample=10,
+    )
+    assert dl.base_batch['generation_kwargs']['num_beams'] == 9000
+    assert dl.base_batch['generation_kwargs']['top_p'] == .95
+    assert dl.base_batch['generation_kwargs']['temperature'] == .9
+    assert dl.base_batch['generation_kwargs']['do_sample'] == True
+
+
+def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/mmlu_small.jsonl'
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    seqlen = 2048
+    dl = InContextLearningMultipleChoiceTaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        continuation_delimiter=' ### ',
+        destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+    )
+    example = {
+        'context': "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ",
+        'choices': ['A', 'B', 'C', 'D'],
+        'gold': 2
+    }
+    tokenized_example = dl.tokenize_example(prompt_and_fewshot='Answer the following: ',
+                                            ctxt=example['context'],
+                                            example=example)
+    unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']]
+    untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries]
+    correct_output = [
+        "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: A",
+        "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: B",
+        "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: C",
+        "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: D"
+    ]
+    assert untokenized_inputs == correct_output
+
+
+def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/winograd_small.jsonl'
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    seqlen = 2048
+    dl = InContextLearningSchemaTaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=' ### ',
+        destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+    )
+    example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+    constructed_context = dl.construct_context(example)
+    assert constructed_context == 'cont one ### this is a continuation'
+    constructed_context = dl.construct_context(example, preceding_text='text')
+    assert constructed_context == '\ncont one ### this is a continuation'
+
+
+def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/winograd_small.jsonl'
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    seqlen = 2048
+    dl = InContextLearningSchemaTaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        continuation_delimiter=' ### ',
+        destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+    )
+    example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+    constructed_contexts = dl._construct_multiple_contexts(example)
+    assert constructed_contexts == ['cont one', 'cont two']
+    constructed_contexts = dl._construct_multiple_contexts(example, preceding_text='some text')
+    assert constructed_contexts == ['\ncont one ###', '\ncont two ###']
+
+
+def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/winograd_small.jsonl'
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    seqlen = 2048
+    dl = InContextLearningSchemaTaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        continuation_delimiter=' ### ',
+        destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+    )
+    example = {'context_options': ['context one', 'context two'], 'gold': 0, 'continuation': 'this is a continuation'}
+    tokenized_example = dl.tokenize_example(prompt_and_fewshot='prompt ',
+                                            context_options=example['context_options'],
+                                            example=example)
+    assert all(tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer'])
+    unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']]
+    untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs]
+    assert untokenized_inputs == [
+        'prompt context one this is a continuation', 'prompt context two this is a continuation'
+    ]
+
+
+@pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
+def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 8
+    seqlen = 64
+    dls = get_icl_task_dataloader('multiple_choice',
+                                  dataset_uri=dataset_uri,
+                                  tokenizer=tokenizer,
+                                  batch_size=batch_size,
+                                  max_seq_len=seqlen,
+                                  pad_tok_id=tokenizer.eos_token_id,
+                                  num_fewshot=2,
+                                  prompt_string='The following are multiple choice questions (with answers).\n',
+                                  example_delimiter='\n',
+                                  continuation_delimiter='Answer: ',
+                                  destination_path=str(tmp_path / 'icl.jsonl'),
+                                  has_categories=True)
+    assert isinstance(dls, dict)
+
+    assert 'computer_security' in dls
+    dl = dls['computer_security']
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+    assert dl.dataloader.__len__() == 2
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' A'
+
+
+@pytest.mark.parametrize('dataset_uri', [
+    'pubmed_sm.jsonl',
+])
+def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 64
+    dl = get_icl_task_dataloader('language_modeling',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=10,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=' ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert '  ' not in tokenizer.decode(batch['input_ids'][0][0:max_idx + 1])
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' yes'
+
+
+@pytest.mark.parametrize('dataset_uri', [
+    'lambada_small.jsonl',
+])
+def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 64
+    dl = get_icl_task_dataloader('language_modeling',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=0,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter='',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+
+
+@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
+def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 64
+    dl = get_icl_task_dataloader('schema',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=1,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter='',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)
+    batch = next(dl.dataloader._get_iterator())
+
+    choices_per_question = 2
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    assert 'gold_indices' in batch
+    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert 'choice_groupings' in batch
+    assert isinstance(batch['choice_groupings'], list) and len(
+        batch['choice_groupings']) == batch_size // choices_per_question
+
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' feared violence.'
+
+
+@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
+def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
+    pytest.importorskip('datasets')
+    transformers = pytest.importorskip('transformers')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b',  # type: ignore reportUnboundVariable
+        use_fast=False)
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 64
+    dl = get_icl_task_dataloader('schema',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=1,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=' ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)
+    batch = next(dl.dataloader._get_iterator())
+
+    choices_per_question = 2
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    assert 'gold_indices' in batch
+    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert 'choice_groupings' in batch
+    assert isinstance(batch['choice_groupings'], list) and len(
+        batch['choice_groupings']) == batch_size // choices_per_question
+
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(
+        batch['input_ids'][0][0:max_idx + 1]
+    ) == "<s>The trophy doesn't fit into the brown suitcase because the suitcase is too small. \nThe city councilmen refused the demonstrators a permit because the city councilmen feared violence."
+
+
+@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_opt_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 512
+    dl = get_icl_task_dataloader('language_modeling',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter='',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
+
+
+@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_opt_tokenizer
+
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 64
+    dl = get_icl_task_dataloader('multiple_choice',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=': ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    choices_per_question = 2
+    assert dl.get_num_samples_in_batch(batch) == 2
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    assert 'gold_indices' in batch
+    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert 'choice_groupings' in batch
+    assert isinstance(batch['choice_groupings'], list) and len(
+        batch['choice_groupings']) == batch_size // choices_per_question
+
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
+
+
+@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_opt_tokenizer
+
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 512
+    dl = get_icl_task_dataloader('multiple_choice',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=': ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+    choices_per_question = 2
+    real_microbatch_size = batch_size // 2
+    logical_microbatch_size = real_microbatch_size // choices_per_question
+    microbatches = dl.split_batch(batch, logical_microbatch_size)
+    assert len(microbatches) == 2
+    for i, microbatch in enumerate(microbatches):
+        assert dl.get_num_samples_in_batch(microbatch) == 1
+        assert 'input_ids' in microbatch
+        assert tuple(microbatch['input_ids'].shape) == (real_microbatch_size, seqlen)
+        assert 'attention_mask' in microbatch
+        assert tuple(microbatch['attention_mask'].shape) == (real_microbatch_size, seqlen)
+        assert 'continuation_indices' in microbatch
+        assert isinstance(microbatch['continuation_indices'], list) and len(
+            microbatch['continuation_indices']) == real_microbatch_size
+        assert 'mode' in microbatch
+        assert microbatch['mode'] == 'icl_task'
+        assert 'gold_indices' in microbatch
+        assert isinstance(microbatch['gold_indices'], list) and len(
+            microbatch['gold_indices']) == real_microbatch_size // choices_per_question
+        assert 'choice_groupings' in microbatch
+        assert isinstance(microbatch['choice_groupings'], list) and len(
+            microbatch['choice_groupings']) == real_microbatch_size // choices_per_question
+
+        min_idx = min(microbatch['continuation_indices'][0]).item()
+        max_idx = max(microbatch['continuation_indices'][0]).item()
+        if i == 0:
+            assert tokenizer.decode(microbatch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
+        elif i == 1:
+            assert tokenizer.decode(
+                microbatch['input_ids'][0][min_idx:max_idx +
+                                           1]) == ' Weld the metal together to get it to stay firmly in place'
+        assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).startswith('</s>')
+        assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).count('</s>') == 1
+
+
+@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
+def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_opt_tokenizer
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)  # for dist
+    dl = get_icl_task_dataloader(
+        icl_task_type='question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=8,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+
+    assert isinstance(dl, DataSpec)  # pyright
+
+    batch = next(iter(dl.dataloader))
+    split_batch = dl.split_batch(batch, 3)
+
+    assert len(split_batch) == 2
+    split1 = split_batch[0]
+    split2 = split_batch[1]
+
+    assert split1['input_ids'].shape[0] == 3
+    assert split2['input_ids'].shape[0] == 1
+
+    assert split1['attention_mask'].shape[0] == 3
+    assert split2['attention_mask'].shape[0] == 1
+
+    assert isinstance(split1['mode'], str)
+    assert isinstance(split2['mode'], str)
+
+    assert len(split1['labels']) == 3
+    assert len(split2['labels']) == 1
+    assert all(isinstance(v, list) for v in split1['labels'] + split2['labels'])
+
+    assert isinstance(split1['generation_length'], int)
+    assert isinstance(split2['generation_length'], int)
+
+    assert isinstance(split1['generation_kwargs'], dict)
+    assert isinstance(split2['generation_kwargs'], dict)
+
+
+@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0])
+@pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
+def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 512
+    tiny_gpt2_tokenizer.eos_token_id = None
+    with pytest.raises(ValueError):
+        _ = get_icl_task_dataloader('question_answering',
+                                    dataset_uri,
+                                    tokenizer,
+                                    batch_size,
+                                    max_seq_len=seqlen,
+                                    pad_tok_id=tokenizer.eos_token_id,
+                                    num_fewshot=num_fewshot,
+                                    prompt_string=prompt_string,
+                                    example_delimiter='\n',
+                                    question_prelimiter='Q: ',
+                                    continuation_delimiter='\nA:',
+                                    destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+
+
+@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 2])
+@pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
+def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 512
+    # empirical number from the small test dataset
+    maximum_answer_length = 7
+    dl = get_icl_task_dataloader('question_answering',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string=prompt_string,
+                                 example_delimiter='\n',
+                                 question_prelimiter='Q: ',
+                                 continuation_delimiter='\nA:',
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+    assert isinstance(dl, DataSpec)
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert batch['mode'] == 'generate'
+    # the maximum generation length from the small test data
+
+    assert batch['generation_length'] == maximum_answer_length
+    assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
+
+    decoded_batch = tokenizer.batch_decode(batch['input_ids'])
+    assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch)
+    assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch)
+
+    if len(prompt_string) > 0:
+        assert all(item.count('I am a prompt') == 1 for item in decoded_batch)
+    assert all(
+        set(found) == set(expected)
+        for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']]))
+    assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:')
+    assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:')
+    assert 'eos_token_id' in batch['generation_kwargs']
+
+
+@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 2])
+def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 512
+    # empirical number from the small test dataset
+    maximum_answer_length = 132
+    dl = get_icl_task_dataloader('question_answering',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 question_prelimiter='Q: ',
+                                 continuation_delimiter="\nA: Let's think step by step. ",
+                                 cot_delimiter=' #### ',
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert batch['mode'] == 'generate'
+    # the maximum generation length from the small test data
+    assert batch['generation_length'] == maximum_answer_length
+    assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
+    decoded_batch = tokenizer.batch_decode(batch['input_ids'])
+    assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch)
+    assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch)
+
+    assert batch['labels'] == [['18'], ['3']]
+    if num_fewshot == 0:
+        assert decoded_batch[0].endswith(
+            "Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step."
+        )
+        assert decoded_batch[1].endswith(
+            "Q: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?\nA: Let's think step by step."
+        )
+    elif num_fewshot == 2:
+        assert decoded_batch[0].endswith(
+            "Q: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?\nA: Let's think step by step. The cost of the house and repairs came out to 80,000+50,000=$<<80000+50000=130000>>130,000\nHe increased the value of the house by 80,000*1.5=<<80000*1.5=120000>>120,000\nSo the new value of the house is 120,000+80,000=$<<120000+80000=200000>>200,000\nSo he made a profit of 200,000-130,000=$<<200000-130000=70000>>70,000 #### 70000\nQ: James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?\nA: Let's think step by step. He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*60=<<9*60=540>>540 meters #### 540\nQ: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step."
+        )
+        assert decoded_batch[1].endswith(
+            "Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step. Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market. #### 18\nQ: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?\nA: Let's think step by step. The cost of the house and repairs came out to 80,000+50,000=$<<80000+50000=130000>>130,000\nHe increased the value of the house by 80,000*1.5=<<80000*1.5=120000>>120,000\nSo the new value of the house is 120,000+80,000=$<<120000+80000=200000>>200,000\nSo he made a profit of 200,000-130,000=$<<200000-130000=70000>>70,000 #### 70000\nQ: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?\nA: Let's think step by step."
+        )
+
+
+@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
+def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 64
+    dl = get_icl_task_dataloader('multiple_choice',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=1,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=': ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    choices_per_question = 2
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    assert 'gold_indices' in batch
+    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert 'choice_groupings' in batch
+    assert isinstance(batch['choice_groupings'], list) and len(
+        batch['choice_groupings']) == batch_size // choices_per_question
+
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+def test_code_eval_split_batch(dataset_uri, tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'EleutherAI/gpt-neox-20b')  # type: ignore reportUnboundVariable
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'code_evaluation',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=8,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=2,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter='',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generations_per_sample=4,
+    )
+
+    assert isinstance(dl, DataSpec)  # pyright
+
+    batch = next(iter(dl.dataloader))
+    split_batch = dl.split_batch(batch, 3)
+
+    assert len(split_batch) == 2
+    split1 = split_batch[0]
+    split2 = split_batch[1]
+
+    assert split1['input_ids'].shape[0] == 3
+    assert split2['input_ids'].shape[0] == 1
+
+    assert split1['attention_mask'].shape[0] == 3
+    assert split2['attention_mask'].shape[0] == 1
+
+    assert isinstance(split1['mode'], str)
+    assert isinstance(split2['mode'], str)
+
+    list_split = {
+        'labels': str,
+        'prompts': str,
+        'tests': str,
+        'entry_points': str,
+        'test_inputs': list,
+        'test_outputs': list,
+        'languages': str,
+    }
+    for k, v in list_split.items():
+        assert len(split1[k]) == 3
+        assert len(split2[k]) == 1
+        assert all(isinstance(val, v) for val in split1[k] + split2[k])
+
+    assert isinstance(split1['pass_at_k'], int)
+    assert isinstance(split2['pass_at_k'], int)
+
+    assert isinstance(split1['generation_length'], int)
+    assert isinstance(split2['generation_length'], int)
+
+    assert isinstance(split1['generation_kwargs'], dict)
+    assert isinstance(split2['generation_kwargs'], dict)
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 2])
+@pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
+@pytest.mark.parametrize('generations_per_sample', [1, 3])
+def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 2048
+
+    dl = get_icl_task_dataloader('code_evaluation',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string=prompt_string,
+                                 example_delimiter='\n',
+                                 continuation_delimiter='',
+                                 question_prelimiter='Code start: \n',
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 generations_per_sample=generations_per_sample)
+    assert isinstance(dl, DataSpec)
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    max_prompt_length = 0
+    if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
+        max_prompt_length = dl.dataloader.dataset.max_prompt_length
+    assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert batch['mode'] == 'generate'
+    # the maximum generation length from the small test data
+    assert batch['generation_length'] == 129
+    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+
+    decoded_batch = tokenizer.batch_decode(batch['input_ids'])
+    assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+
+    if len(prompt_string) > 0:
+        assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
+
+    assert batch['labels'] == [
+        '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
+        "    result = []\n    current_string = []\n    current_depth = 0\n\n    for c in paren_string:\n        if c == '(':\n            current_depth += 1\n            current_string.append(c)\n        elif c == ')':\n            current_depth -= 1\n            current_string.append(c)\n\n            if current_depth == 0:\n                result.append(''.join(current_string))\n                current_string.clear()\n\n    return result\n",
+        '    return number % 1.0\n',
+        '    balance = 0\n\n    for op in operations:\n        balance += op\n        if balance < 0:\n            return True\n\n    return False\n',
+    ]
+
+    assert decoded_batch[0].endswith(
+        "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
+    )
+    assert decoded_batch[1].endswith(
+        "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into separate strings and return the list of those.\n    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n    Ignore any spaces in the input string.\n    >>> separate_paren_groups('( ) (( )) (( )( ))')\n    ['()', '(())', '(()())']\n    \"\"\"\n"
+    )
+    assert decoded_batch[2].endswith(
+        "Code start: \n\n\ndef truncate_number(number: float) -> float:\n    \"\"\" Given a positive floating point number, it can be decomposed into\n    and integer part (largest integer smaller than given number) and decimals\n    (leftover part always smaller than 1).\n\n    Return the decimal part of the number.\n    >>> truncate_number(3.5)\n    0.5\n    \"\"\"\n"
+    )
+    assert decoded_batch[3].endswith(
+        "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n    \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n    at that point function should return True. Otherwise it should return False.\n    >>> below_zero([1, 2, 3])\n    False\n    >>> below_zero([1, 2, -4, 5])\n    True\n    \"\"\"\n"
+    )
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+def test_code_eval_test_cases(dataset_uri, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 512
+
+    dl = get_icl_task_dataloader('code_evaluation',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=0,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter='',
+                                 question_prelimiter='Code start: \n',
+                                 destination_path=str(tmp_path / f'icl_.jsonl'),
+                                 generations_per_sample=1)
+    assert isinstance(dl, DataSpec)
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    max_prompt_length = 0
+    if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
+        max_prompt_length = dl.dataloader.dataset.max_prompt_length
+    assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert batch['mode'] == 'generate'
+    # the maximum generation length from the small test data
+    assert batch['generation_length'] == 129
+    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+
+    mod = types.ModuleType('test_module')
+    for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'],
+                                                              batch['test_outputs'], batch['entry_points']):
+        exec(prompt + solution, mod.__dict__)
+        for test_input, test_output in zip(inputs, outputs):
+            result = mod.__dict__[entry_point](*eval(test_input))
+            assert result == eval(test_output)
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 64
+
+    with pytest.raises(ValueError, match=r'.* pass_at_k .*'):
+        get_icl_task_dataloader('code_evaluation',
+                                dataset_uri=dataset_uri,
+                                tokenizer=tokenizer,
+                                batch_size=batch_size,
+                                max_seq_len=seqlen,
+                                pad_tok_id=tokenizer.eos_token_id,
+                                num_fewshot=0,
+                                prompt_string='',
+                                example_delimiter='\n',
+                                continuation_delimiter='',
+                                question_prelimiter='Code start: \n',
+                                destination_path=str(tmp_path / f'icl_.jsonl'),
+                                pass_at_k=10,
+                                generations_per_sample=1)
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 2])
+@pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
+@pytest.mark.parametrize('generations_per_sample', [1, 3])
+def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 2048
+
+    dl = get_icl_task_dataloader('code_evaluation',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string=prompt_string,
+                                 example_delimiter='\n',
+                                 continuation_delimiter='',
+                                 question_prelimiter='Code start: \n',
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 generations_per_sample=generations_per_sample,
+                                 generation_kwargs={
+                                     'temperature': .9,
+                                     'top_k': 40
+                                 })
+    assert isinstance(dl, DataSpec)
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    max_prompt_length = 0
+    if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
+        max_prompt_length = dl.dataloader.dataset.max_prompt_length
+    assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert batch['mode'] == 'generate'
+    # the maximum generation length from the small test data
+    assert batch['generation_length'] == 122
+    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+
+    decoded_batch = tokenizer.batch_decode(batch['input_ids'])
+    assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+
+    if len(prompt_string) > 0:
+        assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
+
+    assert batch['labels'] == [
+        '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
+        "    result = []\n    current_string = []\n    current_depth = 0\n\n    for c in paren_string:\n        if c == '(':\n            current_depth += 1\n            current_string.append(c)\n        elif c == ')':\n            current_depth -= 1\n            current_string.append(c)\n\n            if current_depth == 0:\n                result.append(''.join(current_string))\n                current_string.clear()\n\n    return result\n",
+        '    return number % 1.0\n',
+        '    balance = 0\n\n    for op in operations:\n        balance += op\n        if balance < 0:\n            return True\n\n    return False\n',
+    ]
+
+    assert decoded_batch[0].endswith(
+        "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
+    )
+    assert decoded_batch[1].endswith(
+        "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into separate strings and return the list of those.\n    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n    Ignore any spaces in the input string.\n    >>> separate_paren_groups('( ) (( )) (( )( ))')\n    ['()', '(())', '(()())']\n    \"\"\"\n"
+    )
+    assert decoded_batch[2].endswith(
+        "Code start: \n\n\ndef truncate_number(number: float) -> float:\n    \"\"\" Given a positive floating point number, it can be decomposed into\n    and integer part (largest integer smaller than given number) and decimals\n    (leftover part always smaller than 1).\n\n    Return the decimal part of the number.\n    >>> truncate_number(3.5)\n    0.5\n    \"\"\"\n"
+    )
+    assert decoded_batch[3].endswith(
+        "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n    \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n    at that point function should return True. Otherwise it should return False.\n    >>> below_zero([1, 2, 3])\n    False\n    >>> below_zero([1, 2, -4, 5])\n    True\n    \"\"\"\n"
+    )
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 4
+    seqlen = 512
+
+    dl = get_icl_task_dataloader('code_evaluation',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter='',
+                                 question_prelimiter='Code start: \n',
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 generations_per_sample=1,
+                                 generation_kwargs={
+                                     'temperature': .9,
+                                     'top_k': 40
+                                 })
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+    microbatch_size = 1
+    microbatches = dl.split_batch(batch, microbatch_size)
+    assert len(microbatches) == 4
+    for microbatch in microbatches:
+        assert dl.get_num_samples_in_batch(microbatch) == 1
+        assert 'input_ids' in microbatch
+        # TODO: what should this be?
+        # assert tuple(microbatch['input_ids'].shape) == (microbatch_size, seqlen)
+        assert 'attention_mask' in microbatch
+        # assert tuple(microbatch['attention_mask'].shape) == (microbatch_size, seqlen)
+        assert isinstance(microbatch['generation_kwargs'], dict)
+        assert microbatch['generation_kwargs']['temperature'] == .9
+        assert microbatch['generation_kwargs']['top_k'] == 40
+        assert microbatch['generation_kwargs']['pad_token_id'] == 0
+        assert microbatch['generation_kwargs']['num_beams'] == 1
+        assert microbatch['generation_kwargs']['num_return_sequences'] == 1
+        assert microbatch['generation_kwargs']['do_sample'] == True
+        assert microbatch['generation_kwargs']['use_cache'] == True
+        assert microbatch['generation_kwargs']['eos_token_id'] == 0
+
+
+@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 5])
+@device('gpu')
+def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 2
+    dl = get_icl_task_dataloader(
+        'language_modeling',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=2048,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter='',
+        destination_path=str(tmp_path / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy'])
+
+    transformers = pytest.importorskip('transformers')
+    config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M')
+    model = transformers.AutoModelForCausalLM.from_config(config)
+    model = HuggingFaceModel(
+        model=model,
+        tokenizer=None,
+        eval_metrics=[InContextLearningLMAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger)
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][0][1].item() == 0
+
+
+@pytest.mark.parametrize('num_fewshot', [0, 5])
+@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
+def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 8
+    dl = get_icl_task_dataloader(
+        'schema',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(tmp_path / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='winograd', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+
+    model = HuggingFaceModel(
+        model=tiny_gpt2_model,
+        tokenizer=tokenizer,
+        eval_metrics=[InContextLearningMultipleChoiceAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+    trainer.eval(eval_dataloader=evaluator)
+    assert 'metrics/winograd/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/winograd/InContextLearningMultipleChoiceAccuracy'][0][1].item() > 0
+    num_samples = 0
+    with open(dataset_uri) as f:
+        for _ in f:
+            num_samples += 1
+    assert trainer.state.eval_metrics['winograd']['InContextLearningMultipleChoiceAccuracy'].total == num_samples
+
+
+@pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 5])
+@device('gpu')
+@world_size(1, 2)
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
+def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model,
+                                          tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 8
+    max_seq_len = 64
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    reproducibility.seed_all(1234)
+    dls = get_icl_task_dataloader('multiple_choice',
+                                  dataset_uri=dataset_uri,
+                                  tokenizer=tokenizer,
+                                  batch_size=batch_size,
+                                  max_seq_len=max_seq_len,
+                                  pad_tok_id=tokenizer.eos_token_id,
+                                  num_fewshot=num_fewshot,
+                                  prompt_string='',
+                                  example_delimiter='\n',
+                                  continuation_delimiter=': ',
+                                  destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+                                  has_categories=True)
+
+    assert isinstance(dls, dict)
+    evaluators = [
+        Evaluator(label='mmlu/' + k, dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+        for k, dl in dls.items()
+    ]
+
+    model = HuggingFaceModel(
+        model=tiny_gpt2_model,
+        tokenizer=tiny_gpt2_tokenizer,
+        eval_metrics=[InContextLearningMultipleChoiceAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, loggers=in_memory_logger)
+    trainer.eval(eval_dataloader=evaluators)
+    assert 'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy'][0][1].item(
+    ) > 0
+    total = trainer.state.eval_metrics['mmlu/computer_security']['InContextLearningMultipleChoiceAccuracy'].total
+    dist.all_reduce(total)  # type: ignore
+    assert total.item() == 4  # type: ignore
+
+
+@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 5])
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
+@device('gpu')
+@world_size(1, 2)
+def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path,
+                            tiny_gpt2_model):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 8
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+
+    # seed because the fewshot selection is currently unseeded
+    reproducibility.seed_all(1234)
+    dl = get_icl_task_dataloader(
+        'multiple_choice',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=64,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='mc', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+
+    model = HuggingFaceModel(
+        model=tiny_gpt2_model,
+        tokenizer=tiny_gpt2_tokenizer,
+        eval_metrics=[InContextLearningMultipleChoiceAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+    trainer.eval(eval_dataloader=evaluator)
+    assert 'metrics/mc/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/mc/InContextLearningMultipleChoiceAccuracy'][0][1].item() >= 0
+    num_samples = 0
+    with open(dataset_uri) as f:
+        for _ in f:
+            num_samples += 1
+    total = trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total
+    dist.all_reduce(total)  # type: ignore
+    assert total.item() == num_samples  # type: ignore
+
+
+@pytest.mark.parametrize('num_fewshot', [0, 5])
+@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
+@device('gpu')
+@world_size(1, 2)
+def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+                                          dataset_uri, tmp_path):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_opt_tokenizer
+
+    batch_size = 4
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    model = HuggingFaceModel(
+        model=tiny_opt_model,
+        tokenizer=tokenizer,
+        eval_metrics=[InContextLearningQAAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+
+
+@pytest.mark.parametrize('num_fewshot', [5])
+@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
+@device('gpu')
+@world_size(1, 2)
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
+def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+                                                   dataset_uri, tmp_path):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_opt_tokenizer
+
+    batch_size = 4
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter="A: Let's think step by step. ",
+        cot_delimiter=' #### ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    model = HuggingFaceModel(
+        model=tiny_opt_model,
+        tokenizer=tokenizer,
+        eval_metrics=[InContextLearningQAAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0
+
+
+@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 5])
+@device('gpu')
+@world_size(1, 2)
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
+                            tmp_path):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 2
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+
+    model = HuggingFaceModel(
+        model=tiny_gpt2_model,
+        tokenizer=tiny_gpt2_tokenizer,
+        eval_metrics=[InContextLearningQAAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+
+
+@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [5])
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@device('gpu')
+@world_size(1, 2)
+def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
+                                     tmp_path):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 2
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter="A: Let's think step by step",
+        cot_delimiter=' #### ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+
+    model = HuggingFaceModel(
+        model=tiny_gpt2_model,
+        tokenizer=tiny_gpt2_tokenizer,
+        eval_metrics=[InContextLearningQAAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0
+
+
+def test_code_eval_requires_envvar(monkeypatch):
+    monkeypatch.delenv('CODE_EVAL_DEVICE', raising=False)
+    with pytest.raises(ValueError, match='Attempting to use InContextLearningCodeEvalAccuracy but.*'):
+        InContextLearningCodeEvalAccuracy().get_client()
+
+
+def test_code_eval_requires_valid_envvar(monkeypatch):
+    monkeypatch.setenv('CODE_EVAL_DEVICE', 'bigchungus')
+    with pytest.raises(ValueError, match='Environment variable `CODE_EVAL_DEVICE` must be on.*'):
+        InContextLearningCodeEvalAccuracy().get_client()
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0])
+@pytest.mark.parametrize('generations_per_sample', range(1, 3))
+@device('gpu')
+@world_size(1, 2)
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+                                 dataset_uri, tmp_path, generations_per_sample):
+    pytest.importorskip('datasets')
+    monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_opt_tokenizer
+    batch_size = 4
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'code_evaluation',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=150,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generations_per_sample=generations_per_sample,
+    )
+
+    evaluator = Evaluator(label='humaneval',
+                          dataloader=dl,
+                          metric_names=['InContextLearningCodeEvalAccuracy'],
+                          device_eval_microbatch_size=1)
+    model = HuggingFaceModel(
+        model=tiny_opt_model,
+        tokenizer=tokenizer,
+        eval_metrics=[InContextLearningCodeEvalAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+    torch.use_deterministic_algorithms(False)
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    torch.use_deterministic_algorithms(True)
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0])
+@pytest.mark.parametrize('generations_per_sample', range(1, 3))
+@device('gpu')
+@world_size(1, 2)
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer,
+                                        tiny_t5_model, tmp_path, generations_per_sample):
+    pytest.importorskip('datasets')
+    monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_t5_tokenizer
+    batch_size = 2
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'code_evaluation',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=175,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generations_per_sample=generations_per_sample,
+    )
+
+    evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'])
+    model = HuggingFaceModel(
+        model=tiny_t5_model,
+        tokenizer=tiny_t5_tokenizer,
+        eval_metrics=[InContextLearningCodeEvalAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+    torch.use_deterministic_algorithms(False)
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    torch.use_deterministic_algorithms(True)
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+
+
+@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 2])
+@pytest.mark.parametrize('generations_per_sample', [1])
+@pytest.mark.filterwarnings(r'ignore: Input length of input_ids is')
+@device('gpu')
+@world_size(1, 2)
+@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+                                   tiny_gpt2_model, tmp_path, generations_per_sample):
+    pytest.importorskip('datasets')
+    monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
+    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 2
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = get_icl_task_dataloader(
+        'code_evaluation',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=64 * num_fewshot,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generations_per_sample=generations_per_sample,
+    )
+
+    evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'])
+    model = HuggingFaceModel(
+        model=tiny_gpt2_model,
+        tokenizer=tiny_gpt2_tokenizer,
+        eval_metrics=[InContextLearningCodeEvalAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
+    torch.use_deterministic_algorithms(False)
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    torch.use_deterministic_algorithms(True)
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
+    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+
+
+@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+    pytest.importorskip('datasets')
+
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = tiny_gpt2_tokenizer
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 512
+    dl = get_icl_task_dataloader('language_modeling',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=1,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=' UNIQUE ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    first_batch = next(dl.dataloader._get_iterator())
+    second_batch = next(dl.dataloader._get_iterator())
+
+    first_batch_text = tokenizer.decode(first_batch['input_ids'][0], skip_special_tokens=True)
+    second_batch_text = tokenizer.decode(second_batch['input_ids'][0], skip_special_tokens=True)
+
+    first_batch_without_last_word = ' '.join(first_batch_text.split(' ')[:-1])
+    second_batch_without_last_word = ' '.join(second_batch_text.split(' ')[:-1])
+
+    assert first_batch_without_last_word.endswith(' UNIQUE')
+    assert second_batch_without_last_word.endswith(' UNIQUE')
+
+    assert first_batch_without_last_word.count(' UNIQUE ') == 1
+    assert second_batch_without_last_word.count(' UNIQUE ') == 1
+
+
+@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+@pytest.mark.parametrize('prompt_string', ['Complete the voiceline: ', ''])
+@pytest.mark.parametrize('hf_loading_vars', [{
+    'split': 'test',
+    'name': 'juggernaut',
+}])
+@pytest.mark.parametrize('hf_parsing_map', [None, {'context': ['context'], 'continuation': ['continuation']}])
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning')
+def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+                                      hf_loading_vars, hf_parsing_map):
+    pytest.importorskip('datasets')
+
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 2
+    seqlen = 2048
+    dl = get_icl_task_dataloader('language_modeling',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=0,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=' ',
+                                 destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+                                 hf_loading_vars=hf_loading_vars,
+                                 hf_parsing_map=hf_parsing_map)
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.'
+
+    decoded_batch = [tokenizer.decode(row[row != tokenizer.eos_token_id]) for row in batch['input_ids']]
+    assert decoded_batch[0] == "Looks like it's just you and me."
+    assert decoded_batch[1] == "There's a fine line between bravery and stupidity."
+
+
+@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+@pytest.mark.parametrize('prompt_string', ['What spell does this invoke? ', ''])
+@pytest.mark.parametrize('hf_loading_vars', [{
+    'split': 'test',
+    'name': 'invoker',
+}])
+@pytest.mark.parametrize('hf_parsing_map', [{'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}])
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning')
+def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+                                       hf_loading_vars, hf_parsing_map):
+    pytest.importorskip('datasets')
+
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 2
+    seqlen = 2048
+
+    # empirical number from the small test dataset
+    maximum_answer_length = 4
+
+    dl = get_icl_task_dataloader('question_answering',
+                                 dataset_uri=dataset_uri,
+                                 tokenizer=tokenizer,
+                                 batch_size=batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string=prompt_string,
+                                 example_delimiter='\n',
+                                 question_prelimiter='Orbs: ',
+                                 continuation_delimiter='\nSpell:',
+                                 destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+                                 hf_loading_vars=hf_loading_vars,
+                                 hf_parsing_map=hf_parsing_map)
+    assert isinstance(dl, DataSpec)
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert batch['mode'] == 'generate'
+    # the maximum generation length from the small test data
+    assert batch['generation_length'] == maximum_answer_length
+    assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
+
+    decoded_batch = tokenizer.batch_decode(batch['input_ids'])
+    assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch)
+    assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch)
+
+    if len(prompt_string) > 0:
+        assert all(item.count('What spell does this invoke? ') == 1 for item in decoded_batch)
+    assert all(
+        set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']]))
+    assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:')
+    assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:')
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
new file mode 100644
index 0000000000..fb6a88c780
--- /dev/null
+++ b/tests/eval/test_nlp_metrics.py
@@ -0,0 +1,146 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+
+import torch
+
+from llmfoundry.eval.metrics.nlp import ( InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy,
+                                  InContextLearningQAAccuracy,)
+
+
+
+def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
+    contexts = ['The dog is', 'I love to eat', 'I hate', 'The weather is']
+    continuations = [' furry', ' pie', ' long lines', ' snowy']
+    pad = tiny_gpt2_tokenizer.pad_token_id
+    inputs = [
+        tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids']
+        for context, continuation in zip(contexts, continuations)
+    ]
+    inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs])
+
+    cont_idxs = []
+    for context, continuation in zip(contexts, continuations):
+        start = len(tiny_gpt2_tokenizer(context)['input_ids'])
+        end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids'])
+        cont_idxs.append(torch.tensor(list(range(start, end))))
+
+    batch = {'continuation_indices': cont_idxs, 'labels': inputs.roll(-1), 'input_ids': inputs}
+    logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100
+    start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1]
+    logits[1][start:end] = logits[0][start:end].clone()  # make one of the answer's continuations incorrect
+    metric = InContextLearningLMAccuracy()
+    metric.update(batch, logits, batch['labels'])
+
+    assert metric.compute() == 0.75
+
+
+
+def test_in_context_learning_qa_accuracy():
+    outputs = ['Correct but then some more text', 'Incorrect', ' the CORREct with weird casing and spacing']
+    labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']]
+    batch = {'cot_delimiter': '', 'labels': labels}
+    metric = InContextLearningQAAccuracy()
+    metric.update(outputs, labels, batch)
+
+    assert metric.compute() == (2 / 3)
+
+
+def test_in_context_learning_qa_cot_accuracy():
+    outputs = [
+        'chain of thought ### Correct but then some more text\n\nanother chain of thought ### Incorrect answer this time',
+        'Incorrect', 'chain of thought ### the CORREct with weird casing and spacing',
+        'incorrect chain of thought delimiter ## Correct but wrong delimiter'
+    ]
+    labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct'], ['correct']]
+    batch = {'cot_delimiter': ' ### ', 'labels': labels, 'do_normalization': True, 'stopping_criteria': '\n\n'}
+    metric = InContextLearningQAAccuracy()
+    metric.update(outputs, labels, batch)
+
+    assert metric.compute() == (2 / 4)
+
+
+def test_in_context_learning_code_eval_accuracy(monkeypatch):
+    outputs = [
+        '    return 1 if n <= 1 else fib(n - 1) + fib(n - 1)',  # incorrect
+        '   if n <= 1:\n        return 1\n    return fib(n-1) + fib(n-2)',  # incorrect spacing
+        '    return n * 2',  # correct
+        '    return 2*n',  # correct
+        '    return n + 2',  # incorrect
+        '    return n + 1'
+    ]  # correct
+    labels = []
+    prompts = ['def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n']
+    entry_points = ['fib', 'multiply_by_two', 'add_one']
+    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)']]
+    test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']]
+    languages = ['python', 'python', 'python']
+    monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
+    batch = {
+        # This tests deterministic beam search rather than sampling
+        'generation_kwargs': {
+            'num_beams': 1,
+            'num_return_sequences': 2
+        },
+        'prompts': prompts,
+        'pass_at_k': 1,
+        'entry_points': entry_points,
+        'test_inputs': test_inputs,
+        'test_outputs': test_outputs,
+        'languages': languages,
+    }
+    metric = InContextLearningCodeEvalAccuracy()
+    metric.update(batch, outputs, labels)
+
+    # pass@1 values
+    #   program 1: 0
+    #   program 2: 1
+    #   program 3: .5
+    # mean: 0.5
+    assert metric.compute() == 0.5
+
+
+def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer):
+    contexts = [
+        'Q: How do you cook a cake?', 'Q: How do you cook a cake?', 'Q: How old is the earth?',
+        'Q: How old is the earth?'
+    ]
+    continuations = [' A: turn on the oven', ' A: do a backflip', ' A: 2 minutes', ' A: 4.5 billion years']
+    gold_indices = [0, 1]
+    choice_groupings = [(0, 2), (2, 4)]
+    pad = tiny_gpt2_tokenizer.pad_token_id
+    inputs = [
+        tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids']
+        for context, continuation in zip(contexts, continuations)
+    ]
+    inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs])
+
+    cont_idxs = []
+    for context, continuation in zip(contexts, continuations):
+        start = len(tiny_gpt2_tokenizer(context)['input_ids'])
+        end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids'])
+        cont_idxs.append(torch.tensor(list(range(start, end))))
+
+    batch = {
+        'continuation_indices': cont_idxs,
+        'labels': inputs.roll(-1),
+        'input_ids': inputs,
+        'gold_indices': gold_indices,
+        'choice_groupings': choice_groupings
+    }
+    logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float()
+
+    # for the first two, the correct answer is continuation 0
+    # make the answer correct by making continuation 0 more likely for both answers
+    start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1]
+    logits[1][start:end] = logits[0][start:end].clone()
+
+    # for the last two, the correct answer is continuation 3
+    # make the answer incorrect by making continuation 2 more likely for both answers
+    start, end = cont_idxs[3].tolist()[0], cont_idxs[3].tolist()[-1]
+    logits[3][start:end] = logits[2][start:end].clone()
+
+    metric = InContextLearningMultipleChoiceAccuracy()
+
+    metric.update(batch, logits, batch['labels'])
+    assert metric.compute() == 0.5

From 1fffbada0d126dd0a69684d1238cb028f80ec9ac Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 2 Feb 2024 12:29:43 -0500
Subject: [PATCH 02/59] still need to migrate fixtures

---
 llmfoundry/models/hf/hf_causal_lm.py                 | 7 +++----
 llmfoundry/models/inference_api_wrapper/interface.py | 8 ++++----
 llmfoundry/models/mpt/modeling_mpt.py                | 4 +---
 mcli/mcli-hf-eval.yaml                               | 2 ++
 tests/eval/test_in_context_learning_datasets.py      | 3 +--
 5 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 87b6080de7..498133262b 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -10,13 +10,12 @@
 
 # required for loading a python model into composer
 import transformers
-from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy,
+from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy,
                                   InContextLearningLMAccuracy,
-                                  InContextLearningLMExpectedCalibrationError,
-                                  InContextLearningMCExpectedCalibrationError,
                                   InContextLearningMultipleChoiceAccuracy,
                                   InContextLearningQAAccuracy,
-                                  LanguageCrossEntropy, LanguagePerplexity)
+                                  )
+from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.utils import dist
 from omegaconf import DictConfig
 from torch import nn
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index 9d0ce7deb3..893bdd975c 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -6,12 +6,12 @@
 import torch
 from composer.core.types import Batch
 from composer.metrics import InContextLearningMetric
-from composer.metrics.nlp import (InContextLearningLMAccuracy,
-                                  InContextLearningLMExpectedCalibrationError,
-                                  InContextLearningMCExpectedCalibrationError,
+from llmfoundry.eval.metrics.nlp import (InContextLearningLMAccuracy,
                                   InContextLearningMultipleChoiceAccuracy,
                                   InContextLearningQAAccuracy,
-                                  LanguageCrossEntropy, LanguagePerplexity)
+                                  )
+
+from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models import ComposerModel
 from torchmetrics import Metric
 from transformers import AutoTokenizer
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 2177124740..029ae78375 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -16,10 +16,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from composer.metrics import (InContextLearningCodeEvalAccuracy,
+from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy,
                               InContextLearningLMAccuracy,
-                              InContextLearningLMExpectedCalibrationError,
-                              InContextLearningMCExpectedCalibrationError,
                               InContextLearningMultipleChoiceAccuracy,
                               InContextLearningQAAccuracy)
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index c23df846f3..6601a93f79 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -8,6 +8,8 @@ integrations:
 
 command: |
   cd llm-foundry/scripts
+  pip uninstall mosaicml -y
+  pip install git+https://github.com/bmosaicml/composer.git@remove_subclasses_from_composer 
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 556cf43dec..0dce36bc3d 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -18,12 +18,11 @@
     InContextLearningMultipleChoiceTaskDataset,
     InContextLearningQATaskDataset,
     InContextLearningSchemaTaskDataset,
-   
     get_icl_task_dataloader,
 )
 # isort: on
 from composer.loggers import InMemoryLogger
-from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
                               InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 from composer.models import HuggingFaceModel
 from composer.trainer import Trainer

From 4aac81e1f15a5d7786f53e926289f19d27773885 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 2 Feb 2024 12:56:10 -0500
Subject: [PATCH 03/59] wip onboarding tests

---
 .../eval/test_in_context_learning_datasets.py | 60 +++++++++----------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 0dce36bc3d..ab762d55bc 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -27,7 +27,6 @@
 from composer.models import HuggingFaceModel
 from composer.trainer import Trainer
 from composer.utils import dist, reproducibility
-from tests.common import device, world_size
 
 
 
@@ -1322,8 +1321,9 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
-@device('gpu')
-def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path):
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
+def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1408,10 +1408,10 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
 
 @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
-@device('gpu')
-@world_size(1, 2)
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model,
+def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_model,
                                           tiny_gpt2_tokenizer, tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
@@ -1462,9 +1462,9 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-@device('gpu')
-@world_size(1, 2)
-def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path,
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
+def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path,
                             tiny_gpt2_model):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
@@ -1517,9 +1517,9 @@ def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-@device('gpu')
-@world_size(1, 2)
-def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
+def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
                                           dataset_uri, tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
@@ -1561,11 +1561,11 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer
 
 @pytest.mark.parametrize('num_fewshot', [5])
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
-@device('gpu')
-@world_size(1, 2)
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
                                                    dataset_uri, tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
@@ -1608,10 +1608,10 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
-@device('gpu')
-@world_size(1, 2)
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
+def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
                             tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
@@ -1654,9 +1654,9 @@ def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [5])
 @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-@device('gpu')
-@world_size(1, 2)
-def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
+def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
                                      tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
@@ -1712,10 +1712,10 @@ def test_code_eval_requires_valid_envvar(monkeypatch):
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0])
 @pytest.mark.parametrize('generations_per_sample', range(1, 3))
-@device('gpu')
-@world_size(1, 2)
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
                                  dataset_uri, tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
@@ -1764,10 +1764,10 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_token
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0])
 @pytest.mark.parametrize('generations_per_sample', range(1, 3))
-@device('gpu')
-@world_size(1, 2)
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer,
+def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_t5_tokenizer,
                                         tiny_t5_model, tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
@@ -1813,10 +1813,10 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_few
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('generations_per_sample', [1])
 @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is')
-@device('gpu')
-@world_size(1, 2)
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
                                    tiny_gpt2_model, tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')

From 946a4af28274a444a44b53c13bb93af8a6484d2a Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 2 Feb 2024 13:24:03 -0500
Subject: [PATCH 04/59] still workin'

---
 tests/fixtures/models.py | 109 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 108 insertions(+), 1 deletion(-)

diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py
index 1b1ef86302..b897c9a242 100644
--- a/tests/fixtures/models.py
+++ b/tests/fixtures/models.py
@@ -5,6 +5,7 @@
 
 from omegaconf import DictConfig
 from pytest import fixture
+import pytest
 from transformers import PreTrainedTokenizerBase
 
 from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
@@ -22,7 +23,6 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase):
 def mpt_tokenizer():
     return build_tokenizer('EleutherAI/gpt-neox-20b', {})
 
-
 @fixture
 def build_tiny_mpt(
     mpt_tokenizer: PreTrainedTokenizerBase
@@ -68,3 +68,110 @@ def build(**kwargs: Any) -> ComposerHFCausalLM:
         return model
 
     return build
+
+
+
+def tiny_gpt2_model_helper(config):
+    transformers = pytest.importorskip('transformers')
+
+    return transformers.AutoModelForCausalLM.from_config(config)
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_gpt2_model(_session_tiny_gpt2_config):  # type: ignore
+    return tiny_gpt2_model_helper(_session_tiny_gpt2_config)
+
+
+def tiny_gpt2_config_helper():
+    transformers = pytest.importorskip('transformers')
+
+    tiny_overrides = {
+        'n_embd': 2,
+        'n_head': 2,
+        'n_layer': 2,
+        'vocab_size': 50258  # 50257 + 1 for pad token
+    }
+    return transformers.AutoConfig.from_pretrained('gpt2', **tiny_overrides)
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_gpt2_config():  # type: ignore
+    return tiny_gpt2_config_helper()
+
+
+def tiny_gpt2_tokenizer_helper():
+    transformers = pytest.importorskip('transformers')
+
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
+    hf_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+    return hf_tokenizer
+
+
+def tiny_llama_tokenizer_helper():
+    transformers = pytest.importorskip('transformers')
+
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b', use_fast=False)
+    return hf_tokenizer
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_gpt2_tokenizer():  # type: ignore
+    return tiny_gpt2_tokenizer_helper()
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_llama_tokenizer():  # type: ignore
+    return tiny_llama_tokenizer_helper()
+
+
+
+
+
+def tiny_opt_tokenizer_helper():
+    transformers = pytest.importorskip('transformers')
+
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')
+    hf_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+    return hf_tokenizer
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_opt_tokenizer():  # type: ignore
+    return tiny_opt_tokenizer_helper()
+
+
+@pytest.fixture
+def tiny_gpt2_config(_session_tiny_gpt2_config):
+    return copy.deepcopy(_session_tiny_gpt2_config)
+
+
+@pytest.fixture
+def tiny_gpt2_tokenizer(_session_tiny_gpt2_tokenizer):
+    return copy.deepcopy(_session_tiny_gpt2_tokenizer)
+
+
+@pytest.fixture
+def tiny_llama_tokenizer(_session_tiny_llama_tokenizer):
+    return copy.deepcopy(_session_tiny_llama_tokenizer)
+
+
+@pytest.fixture
+def tiny_gpt2_model(_session_tiny_gpt2_model):
+    return copy.deepcopy(_session_tiny_gpt2_model)
+
+
+@pytest.fixture
+def tiny_opt_config(_session_tiny_opt_config):
+    return copy.deepcopy(_session_tiny_opt_config)
+
+
+@pytest.fixture
+def tiny_opt_tokenizer(_session_tiny_opt_tokenizer):
+    return copy.deepcopy(_session_tiny_opt_tokenizer)
+
+
+@pytest.fixture
+def tiny_opt_model(_session_tiny_opt_model):
+    return copy.deepcopy(_session_tiny_opt_model)
+
+

From 289ca55201e30346c2c1a1080adb6bdf30c88bd2 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 2 Feb 2024 13:38:53 -0500
Subject: [PATCH 05/59] still wip

---
 tests/fixtures/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py
index b897c9a242..33cb27ee8a 100644
--- a/tests/fixtures/models.py
+++ b/tests/fixtures/models.py
@@ -7,7 +7,7 @@
 from pytest import fixture
 import pytest
 from transformers import PreTrainedTokenizerBase
-
+import copy
 from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
 from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM

From 3696f8dbb58130f9fc869492e78f2f946c122f9e Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 2 Feb 2024 14:01:48 -0500
Subject: [PATCH 06/59] maybe done; test out on mcli now

---
 mcli/mcli-hf-eval.yaml         | 4 ++--
 tests/eval/test_nlp_metrics.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 2739f93979..a16bef7d9c 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -15,8 +15,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: mpt-eval
 gpu_num: 8
-# gpu_type:
-# cluster:  # replace with your cluster here!
+gpu_type: a100_80gb
+cluster: r1z1 # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index fb6a88c780..93c0f91035 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -41,7 +41,7 @@ def test_in_context_learning_qa_accuracy():
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']]
     batch = {'cot_delimiter': '', 'labels': labels}
     metric = InContextLearningQAAccuracy()
-    metric.update(outputs, labels, batch)
+    metric.update(batch, outputs, labels)
 
     assert metric.compute() == (2 / 3)
 
@@ -55,7 +55,7 @@ def test_in_context_learning_qa_cot_accuracy():
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct'], ['correct']]
     batch = {'cot_delimiter': ' ### ', 'labels': labels, 'do_normalization': True, 'stopping_criteria': '\n\n'}
     metric = InContextLearningQAAccuracy()
-    metric.update(outputs, labels, batch)
+    metric.update(batch, outputs, labels)
 
     assert metric.compute() == (2 / 4)
 

From a20877dc203812ca19406a40d7d012f31c87afd1 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 2 Feb 2024 14:13:50 -0500
Subject: [PATCH 07/59] mcli

---
 mcli/mcli-hf-eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index a16bef7d9c..c3979a1922 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: migrate_subclasses_to_foundry # v0.4.0
   # git_commit:  # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false  # Should be true if using a private repo

From 53da3ea86298f5bcb02a782303fd91996fe1b6e5 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 2 Feb 2024 16:41:25 -0500
Subject: [PATCH 08/59] remove calibration error

---
 llmfoundry/models/hf/hf_causal_lm.py  | 4 +---
 llmfoundry/models/mpt/modeling_mpt.py | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 498133262b..cdfa142420 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -71,9 +71,7 @@ def __init__(self, om_model_config: Union[DictConfig,
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
             InContextLearningQAAccuracy(),
-            InContextLearningCodeEvalAccuracy(),
-            InContextLearningLMExpectedCalibrationError(),
-            InContextLearningMCExpectedCalibrationError()
+            InContextLearningCodeEvalAccuracy()
         ]
 
         # if we are passed a DictConfig, we need to instantiate the model
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 029ae78375..45bc4eb6d1 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -1025,8 +1025,6 @@ def __init__(
             InContextLearningMultipleChoiceAccuracy(),
             InContextLearningQAAccuracy(),
             InContextLearningCodeEvalAccuracy(),
-            InContextLearningLMExpectedCalibrationError(),
-            InContextLearningMCExpectedCalibrationError(),
         ]
 
         super().__init__(

From a90766e6284cf72f7cab89af1256b08524c12ee8 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Wed, 7 Feb 2024 15:11:03 -0500
Subject: [PATCH 09/59] migration

---
 .../in_context_learning_evaluation.py         |  949 +++++++++++--
 llmfoundry/eval/metrics/nlp.py                |  395 ++++--
 llmfoundry/models/hf/hf_causal_lm.py          |    9 +-
 .../models/inference_api_wrapper/interface.py |   13 +-
 llmfoundry/models/mpt/modeling_mpt.py         |    7 +-
 llmfoundry/utils/builders.py                  |    4 +-
 mcli/mcli-hf-eval.yaml                        |    6 +-
 .../eval/test_in_context_learning_datasets.py | 1253 +++++++++++++----
 tests/eval/test_nlp_metrics.py                |   69 +-
 tests/fixtures/models.py                      |   18 +-
 10 files changed, 2197 insertions(+), 526 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index bcc7996189..668dd25145 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 # This code is based on the implementation in https://github.com/EleutherAI/lm-evaluation-harness/blob/8c048e266a22a1c85ccbdb0c209ac712e4f39989/lm_eval/base.py#L221-L330
@@ -8,19 +11,20 @@
 import json
 import os
 import random
+import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
 
 import torch
-from torch.utils.data import DataLoader
-
 from composer.core import DataSpec
 from composer.core.data_spec import _default_split_batch, _split_list
 from composer.datasets.utils import stop_sequences_criteria
 from composer.utils import MissingConditionalImportError, dist, get_file
-from composer.datasets import InContextLearningDataset
+from torch.utils.data import DataLoader, Dataset
+
 if TYPE_CHECKING:
     import transformers
-    from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+    from datasets import \
+        Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
 
 # Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset
 _MAX_ANSWER_BUFFER_LENGTH = 10
@@ -35,8 +39,8 @@
 
 
 def strip_data(example: Dict) -> Dict:
-    """
-    Remove white space from the begging and end of string values in a dictionary
+    """Remove white space from the begging and end of string values in a
+    dictionary.
 
     Args:
         example: Dictionary to be stripped
@@ -44,14 +48,33 @@ def strip_data(example: Dict) -> Dict:
     Returns:
         dict: The same dictionary with .strip() applied to any value in the dict that is a string
     """
-    return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()}
+    return {
+        k: v.strip() if isinstance(v, str) else v for k, v in example.items()
+    }
+
 
+def _tokenizer_needs_prefix_space(
+        tokenizer: transformers.PreTrainedTokenizerBase) -> bool:
+    """Test for whether a prefix space is needed before the continuation.
+    Sentencepiece tokenization should not have a prefix space, but gpt2 style
+    BPE should.
+
+    Args:
+        tokenizer: Tokenizer to test
 
-def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -> List:
+    Returns:
+        bool: Whether or not the tokenizer needs a prefix space
     """
-    Trims a list of tokens down to `max_seq_len` if the length of the list plus the continuation
-    is more than `max_seq_len`. It will always trim tokens from the left, i.e. tokens at the beginning
-    of the context will be removed.
+    test_tokens = tokenizer(' a', add_special_tokens=False)['input_ids']
+    assert isinstance(test_tokens, list)
+    return len(test_tokens) == 1
+
+
+def _trim_context(context_enc: List, continuation_enc: List,
+                  max_seq_len: int) -> List:
+    """Trims a list of tokens down to `max_seq_len` if the length of the list
+    plus the continuation is more than `max_seq_len`. It will always trim tokens
+    from the left, i.e. tokens at the beginning of the context will be removed.
 
     Args:
         context_enc (list): List of tokens in the context
@@ -66,16 +89,18 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -
 
         if context_max_subseq_len < 0:
             # can't support continuations which are longer than the max seq len
-            raise Exception(f'Dataset included continuation longer than the max seq len')
+            raise Exception(
+                f'Dataset included continuation longer than the max seq len')
 
         # clip from the end
         context_enc = context_enc[-(context_max_subseq_len):]
     return context_enc
 
 
-def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.Tensor:
-    """
-    Gets the list of indices of the continuation tokens for language modeling or generation tasks.
+def _get_continuation_span(context_enc: List,
+                           continuation_enc: List) -> torch.Tensor:
+    """Gets the list of indices of the continuation tokens for language modeling
+    or generation tasks.
 
     Args:
         context_enc (list): List of context tokens
@@ -84,7 +109,9 @@ def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.T
     Returns:
         torch.tensor: A tensor containing indices corresponding to continuation tokens
     """
-    return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc)))
+    return torch.tensor(
+        range(len(context_enc),
+              len(context_enc) + len(continuation_enc)))
 
 
 def _make_padded_input(context_enc: List,
@@ -92,9 +119,8 @@ def _make_padded_input(context_enc: List,
                        max_seq_len: int,
                        pad_tok_id: int,
                        padding_side: str = 'right') -> torch.Tensor:
-    """
-    Takes an encoded context and continuation and clips the beginning of the context if they're too long.
-    Adds the padding token to the specified side.
+    """Takes an encoded context and continuation and clips the beginning of the
+    context if they're too long. Adds the padding token to the specified side.
 
     Args:
         context_enc (List): The encoded input to the model
@@ -117,7 +143,9 @@ def _make_padded_input(context_enc: List,
     # Sometimes tokenizers that have neither a pad_tok_id or eos_tok_id will pass None in as the padding
     # token and cause errors
     if not isinstance(pad_tok_id, int):
-        raise ValueError(f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead')
+        raise ValueError(
+            f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead'
+        )
     # pad length from seq to padding_length
     if padding_side == 'right':
         inp = torch.cat(
@@ -136,15 +164,18 @@ def _make_padded_input(context_enc: List,
             dim=0,
         )
     else:
-        raise ValueError(f"Unknown padding_side {padding_side}. padding_side must be either 'left' or 'right'")
+        raise ValueError(
+            f"Unknown padding_side {padding_side}. padding_side must be either 'left' or 'right'"
+        )
 
     return inp
 
 
-def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, Any]:
-    """
-    HF Datasets converts tensors into lists when we store them, and we don't want to use `type='torch'`
-    because some content in the dataset, like generation args or single ints, should not be converted.
+def convert_tokens_to_tensors(batch: Dict,
+                              tokenize_labels: bool) -> Dict[str, Any]:
+    """HF Datasets converts tensors into lists when we store them, and we don't
+    want to use `type='torch'` because some content in the dataset, like
+    generation args or single ints, should not be converted.
 
     Here, we convert those lists of tokens back into tensors in order to feed them into the model.
 
@@ -155,14 +186,17 @@ def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, A
     Returns:
         dict: The batch with torch tensors in the corresponding keys instead of lists of lists
     """
-    batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids'])))
+    batch['input_ids'] = torch.stack(list(map(torch.tensor,
+                                              batch['input_ids'])))
     if tokenize_labels:
         batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels'])))
-        batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices']))
+        batch['continuation_indices'] = list(
+            map(torch.tensor, batch['continuation_indices']))
     return batch
 
 
-def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> Set[int]:
+def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
+                             example_idx: int, rng: random.Random) -> Set[int]:
     """
     Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset,
     then we will have fewer than num_fewshot examples in context.
@@ -189,10 +223,482 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: i
         fewshot_idxs.add(replacement_sample)
     return fewshot_idxs
 
-class InContextLearningQATaskDataset(InContextLearningDataset):
+
+class InContextLearningDataset(Dataset):
+    """A base dataset that constructs batches for in-context learning task
+    evaluations. The dataset format is expected to be a local jsonl file, a
+    cloud link to a jsonl file, or a Hugging Face dataset link. 'context' refers
+    to the input a model will recieve before generating an output. For example,
+    the question in question answering tasks, the preceding text in a language
+    modeling task, or the document and question regarding the document in a
+    document understanding task. 'example' refers to a loaded dictionary,
+    generally containing a context, an answer, and any other information needed
+    to run the task. 'answer' refers to the desired output of the model.
+
+    When creating a new ICL Dataset, it is likely that you will need to reimplement the following methods:
+
+    - construct_context(): Takes a single example dictionary and formulates the context as a string for that eval question.
+    - get_answer_from_example(): Takes a single example dictionary and formulates the correct, ground truth answer as a string.
+    - tokenize_example(): Tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream.
+    - read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset.read_dataset())
+
+    Additionally, base_batch and batch_mapping must be defined.
+
+    - base_batch (Dict): The base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode,
+      and empty lists for values that will need to be accumulated from each example.
+      NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables
+      like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self.update_generation_kwargs()
+      after setting self.base_batch.
+    - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset.
+      collate_fn will use this mapping to create batches from self.dataset.
+
+    Args:
+        dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``.
+            Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
+            A local dataset must consist of rows of JSON data points with task dependent fields.
+            The default keys expected are "context" and "answer".
+        tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids.
+        max_seq_len (int): The maximum sequence length supported by the model.
+        pad_tok_id (int): The special token used for padding batches.
+        num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples.
+        fewshot_random_seed (int): Random seed to use for fewshot sampling.
+        prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.').
+        example_delimiter (str): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting.
+        continuation_delimiter: (str): Separator inserted between context and answer in each example (e.g. '\\nA: ').
+        destination_path (str): Temporary path to store downloaded datasets.
+        prelimiter (str): Text to be prepended before each context, including few shot examples (e.g. "Question: ").
+        context_key (str): The key in the loaded dataset that contains the context.
+        answer_key (str): The key in the loaded dataset that contains the answer.
+        strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs,
+            so unless whitespace should be preserved (for example in code), this should be set to True.
+        padding_side (str): Side of the content and answer on which to apply padding. Can be either 'right' or 'left'.
+        padding_size (int): The final size of the tensor after padding. Defaults to max_sequence_length.
+        base_batch (Dict): The base dictionary upon which a batch is created. See above for more details.
+        base_mapping (Dict): A mapping of batch keys to dataset columns, used to create batches. See above for more details.
+        hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
+        hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
+            Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset.
+        tokenize_labels (bool): Whether or not the labels should be tokenized. Generally determined by which metric a dataset uses.
+        generation_kwargs (Dict): A dictionary containing keyword arguments to be passed along to the model's generate function.
     """
-    A dataset that constructs batches for in-context learning question answering evaluation.
-    QA tasks evaluate a model's ability to answer questions using a consistent format.
+
+    def __init__(
+        self,
+        dataset_uri: str,
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        max_seq_len: int,
+        pad_tok_id: int,
+        num_fewshot: int,
+        fewshot_random_seed: int,
+        prompt_string: str,
+        example_delimiter: str,
+        continuation_delimiter: str,
+        destination_path: str,
+        prelimiter: str = '',
+        context_key: str = 'context',
+        answer_key: str = 'answer',
+        strip_dataset: bool = True,
+        padding_side: str = 'right',
+        tokenize_labels: bool = True,
+        static_keys: Optional[List] = None,
+        list_keys: Optional[List] = None,
+        tensor_keys: Optional[List] = None,
+        padding_size: Optional[int] = None,
+        base_batch: Optional[Dict] = None,
+        batch_mapping: Optional[Dict] = None,
+        hf_loading_vars: Optional[Dict] = None,
+        hf_parsing_map: Optional[Dict] = None,
+        generation_kwargs: Optional[Dict] = None,
+    ):
+        try:
+            import datasets
+            del datasets
+        except ImportError as e:
+            raise MissingConditionalImportError(
+                extra_deps_group='nlp',
+                conda_package='datasets',
+                conda_channel='conda-forge',
+            ) from e
+
+        self.tokenizer = tokenizer
+        self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer)
+
+        self.max_seq_len = max_seq_len
+        self.pad_tok_id = pad_tok_id
+        self.num_fewshot = num_fewshot
+        self.padding_side = padding_side
+        self.padding_size = padding_size if padding_size else self.max_seq_len
+        self.prelimiter = prelimiter
+        self.example_delimiter = example_delimiter
+        self.continuation_delimiter = continuation_delimiter
+        self.context_key = context_key
+        self.answer_key = answer_key
+        self.tokenize_labels = tokenize_labels
+        self.batch_mapping = batch_mapping or {}
+        self.base_batch = base_batch or {}
+        if generation_kwargs:
+            self.update_generation_kwargs(generation_kwargs)
+
+        self.static_keys = static_keys
+        self.list_keys = list_keys
+        self.tensor_keys = tensor_keys
+
+        hf_loading_vars = hf_loading_vars or {}
+        self.dataset: HFDataset = self.read_dataset(dataset_uri,
+                                                    destination_path,
+                                                    hf_loading_vars,
+                                                    hf_parsing_map)
+        self.strip_data = strip_dataset
+        if self.strip_data:
+            self.dataset = self.dataset.map(strip_data)
+
+        fewshot_rng = random.Random(fewshot_random_seed)
+        self.dataset: HFDataset = self.dataset.map(
+            self._prep_example,
+            with_indices=True,
+            fn_kwargs={
+                'num_fewshot': num_fewshot,
+                'prompt_string': prompt_string,
+                'fewshot_rng': fewshot_rng,
+            },
+        )
+
+    def __getitem__(self, index: int) -> Dict:
+        return self.dataset[index]
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def get_num_samples_in_batch(self, batch: Dict) -> int:
+        return batch['input_ids'].shape[0]
+
+    def update_generation_kwargs(self, generation_kwargs: Dict) -> None:
+        """Updates self.base_batch with the passed in generation_kwargs. This
+        must be run after self.base_batch is set (for example, if
+        self.base_batch is set after __init__() is run, likely because
+        base_batch needs a class variable like self.pad_tok_id or
+        self.max_answer_length).
+
+        Args:
+            dict: Keyword arguments that be written into base_batch['generation_kwargs']
+        """
+        if generation_kwargs:
+            if 'generation_kwargs' not in self.base_batch:
+                self.base_batch['generation_kwargs'] = {}
+            self.base_batch['generation_kwargs'].update(generation_kwargs)
+
+    def read_dataset(
+            self,
+            dataset_uri: str,
+            destination_path: str,
+            hf_loading_vars: Optional[Dict[str, Any]] = None,
+            hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'HFDataset':
+        """Reads a dataset and handles parsing it from HuggingFace.
+
+        Args:
+            dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri.
+                Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
+            destination_path (str): A local path where the data will be stored
+            hf_loading_vars (Dict): If parsing from HuggingFace, keyword args that will be passed into load_dataset
+            hf_parsing_map (Dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset columns
+
+        Returns:
+            dataset: A loaded HF dataset
+        """
+        from datasets import \
+            Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import \
+            load_dataset  # pyright: ignore[reportGeneralTypeIssues]
+        if 'hf://' in dataset_uri:
+            dataset_uri = dataset_uri.replace('hf://', '')
+            if hf_loading_vars is None:
+                hf_loading_vars = {}
+            dataset = load_dataset(dataset_uri, **hf_loading_vars)
+            if hf_parsing_map:
+                dataset_parsing_func = lambda example: {
+                    k: ' '.join([str(example[col]) for col in v])
+                    for k, v in hf_parsing_map.items(
+                    )  # pyright: ignore[reportOptionalMemberAccess]
+                }
+                assert isinstance(dataset, HFDataset)
+                dataset = dataset.map(dataset_parsing_func,
+                                      remove_columns=dataset.column_names)
+        else:
+            with dist.local_rank_zero_download_and_wait(destination_path):
+                if dist.get_local_rank() == 0:
+                    get_file(dataset_uri, destination_path, overwrite=True)
+            dataset = load_dataset('json',
+                                   data_files=destination_path,
+                                   split='train',
+                                   streaming=False)
+        assert isinstance(dataset, HFDataset)
+        return dataset
+
+    def _generate_few_shot_prompt(
+        self,
+        num_fewshot: int,
+        example_idx: int,
+        preamble: str,
+        fewshot_rng: random.Random,
+    ) -> str:
+        """Formats the fewshot prompt for test example `example_idx`.
+
+        Randomly selects `num_fewshot` samples from the dataset (excluding the example at `example_idx`) and constructs
+        contextes with answers appended.
+
+        Returns the formatted prompt_string + concatenated list of formatted few shot examples as a string.
+
+        Args:
+            num_fewshot (int): Number of examples to prepend
+            example_idx (int): Current example idx
+            preamble (str): Text to occur at the beginning of the task. Generally instructions or a prompt.
+            fewshot_rng (random.Random): Seeded sampler to chose samples with
+
+        Returns:
+            str: The original preamble with num_fewshot examples appended
+        """
+        few_shot_text = preamble
+
+        if num_fewshot > 0:
+            fewshot_idxs = _get_fewshot_sample_idxs(
+                len(self.dataset),
+                num_fewshot,
+                example_idx,
+                fewshot_rng,
+            )
+            for fewshot_idx in fewshot_idxs:
+                ctxt = self.construct_context(
+                    self.dataset[fewshot_idx],
+                    few_shot_text,
+                    add_answer=True,
+                )
+                few_shot_text += ctxt
+
+        return few_shot_text
+
+    def construct_context(self,
+                          example: Dict,
+                          preceding_text: str = '',
+                          add_answer: bool = False) -> str:
+        """Takes an example and constructs a context, i.e. the input the model
+        reads for this example. Optionally adds the correct answer (for fewshot
+        examples) and handles example delimiters.
+
+        Args:
+            example (Dict): The example from which to construct the context
+            preceding_text (str): Any preceding text, used as a check for prepending self.example_delimiter
+            add_answer (bool): Bool for whether or not to add the answer on the end of the context (e.g. for fewshot examples)
+
+        Returns:
+            str: The constructed context. The default output context is
+                 formatted as follows: f'{self.prelimiter}{example[self.context_key]}{self.continuation_delimiter}'
+        """
+        ctxt = example[self.context_key]
+        ctxt = f'{self.prelimiter}{ctxt}'
+        if len(preceding_text) > 0:
+            ctxt = f'{self.example_delimiter}{ctxt}'
+        ctxt = f'{ctxt}{self.continuation_delimiter}'
+        if add_answer:
+            ctxt = f'{ctxt}{self.get_answer_from_example(example, in_context=add_answer)}'
+        return ctxt
+
+    def get_answer_from_example(self,
+                                example: Dict[str, Any],
+                                in_context: bool = False) -> str:
+        """Returns the answer from the example.
+
+        Args:
+            example (Dict): The example from which to retrieve the answer
+
+        Returns:
+            str: The answer in the example
+        """
+        cont = example[self.answer_key]
+        if self.prefix_space and not cont.startswith(' ') and not in_context:
+            cont = f' {cont}'
+        return cont
+
+    def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]:
+        """If the input_ids is empty then input_ids will be a 0-length List
+        unless the tokenizer adds special tokens to empty strings (e.g. OPT
+        tokenizer). If there is an EOS token added, we need to remove it so it
+        is not in the middle of the prompt, as the specific eval question's
+        prompt will follow the input_ids.
+
+        Args:
+            input_ids (List): The tokenized input
+
+        Returns:
+            input_ids: The tokenized input conditionally edited
+        """
+        if (self.tokenizer.eos_token_id is not None and len(input_ids) > 1 and
+                input_ids[-1] == self.tokenizer.eos_token_id):
+            input_ids = input_ids[:-1]
+        return input_ids
+
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
+        """Runs text through the tokenizer and handle special cases.
+
+        Args:
+            prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
+            ctxt (str): The specific example's derrived context
+            example (Dict): The example as a dictionary. Used for additional processing in inherited classes.
+
+        Returns:
+            Dict: Dictionary with the tokenized data
+        """
+        tokenized_example = {}
+        # Always add special tokens to preamble
+        preamble = self.tokenizer(prompt_and_fewshot)['input_ids']
+        assert isinstance(preamble, list)
+        preamble = self._fix_eos_on_preamble(preamble)
+        if self.strip_data:
+            # rstrip context because a prompt ending in a space results in degenerate output
+            ctxt = ctxt.rstrip()
+        # Never add special tokens to context
+        tokenized_context = self.tokenizer(
+            ctxt, add_special_tokens=False)['input_ids']
+        assert isinstance(preamble, list)
+        assert isinstance(tokenized_context, list)
+
+        tokenized_context = preamble + tokenized_context
+
+        if self.tokenize_labels:
+            # Never add special tokens to answer
+            tokenized_answer = self.tokenizer(
+                self.get_answer_from_example(example),
+                add_special_tokens=False)['input_ids']
+            assert isinstance(tokenized_answer, list)
+            trimmed_context = _trim_context(tokenized_context, tokenized_answer,
+                                            self.padding_size)
+            assert isinstance(trimmed_context, list)
+            continuation_indices = _get_continuation_span(
+                trimmed_context, tokenized_answer)
+            padded_context = _make_padded_input(trimmed_context,
+                                                tokenized_answer,
+                                                self.padding_size,
+                                                self.pad_tok_id,
+                                                self.padding_side)
+
+            tokenized_example[self.context_key] = padded_context
+            tokenized_example[self.answer_key] = tokenized_answer
+            tokenized_example['continuation_indices'] = continuation_indices
+        else:
+            assert isinstance(tokenized_context, list)
+            trimmed_context = _trim_context(
+                tokenized_context,
+                [],
+                self.padding_size,
+            )
+            assert isinstance(trimmed_context, list)
+            padded_context = _make_padded_input(trimmed_context, [],
+                                                self.padding_size,
+                                                self.pad_tok_id,
+                                                self.padding_side)
+
+            tokenized_example[self.context_key] = padded_context
+            tokenized_example[self.answer_key] = self.get_answer_from_example(
+                example)
+
+        return tokenized_example
+
+    def _prep_example(
+        self,
+        example: Dict,
+        example_idx: int,
+        num_fewshot: int,
+        prompt_string: str,
+        fewshot_rng: random.Random,
+    ) -> Dict[str, Any]:
+        """Prepares a single example from a HF Dataset into tokenized format
+        with prompt and fewshot examples.
+
+        Each task consists of a context and a continuation as well as an optional prompt and optional list of
+        example context/continuation pairs which precede the test context/continuation pair.
+
+        Args:
+            example (Dict): A Dictionary from the hf dataset
+            example_idx (int): The index of example
+            num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair
+            prompt_string (str): The prompt to prepend to all inputs
+            fewshot_rng (random.Random): Random number generator to use for fewshot sampling
+
+        Returns:
+            Dict: Contains a dictionary with the tokenized data
+        """
+        prompt_and_fewshot = self._generate_few_shot_prompt(
+            num_fewshot, example_idx, prompt_string, fewshot_rng)
+        ctxt = self.construct_context(example,
+                                      prompt_and_fewshot,
+                                      add_answer=False)
+        tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt,
+                                                  example)
+        return tokenized_example
+
+    def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """The function that the dataloader uses to accumulate data into
+        batches.
+
+        Args:
+            data (List): List of tokenized datapoints (dicts returned by self._tokenize_example)
+
+        Returns:
+            Dict: Dictionary for a single batch
+        """
+        batch = copy.deepcopy(self.base_batch)
+        for data_pair in data:
+            for batch_key, data_key in self.batch_mapping.items():
+                batch[batch_key].append(data_pair[data_key])
+            if 'continuation_indices' in data_pair:
+                batch['continuation_indices'].append(
+                    data_pair['continuation_indices'])
+
+        batch = convert_tokens_to_tensors(batch, self.tokenize_labels)
+        batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
+        return batch
+
+    def split_batch(self, batch: Any,
+                    microbatch_size: int) -> List[Dict[str, Any]]:
+        """Handling for certain specialty columns that must be split into
+        batches in different formats.
+
+        Args:
+            batch (Dict): Batch of data
+            microbatch_size (int): Size of microbatches
+
+        Returns:
+            List: List of chunked batches
+        """
+        # Don't split kwargs that don't change
+        # Normally split torch tensors
+        # List split lists of strings
+        chunked = {}
+        for k, v in batch.items():
+            if k in self.static_keys:
+                # Defer broadcasting until we know num_chunks
+                pass
+            elif k in self.list_keys:
+                chunked[k] = _split_list(v, microbatch_size)
+            elif k in self.tensor_keys:
+                chunked[k] = _default_split_batch(v, microbatch_size)
+            else:
+                raise ValueError(f'Unexpected key {k} in batch splitting')
+        num_chunks = len(chunked['input_ids'])
+        for k, v in batch.items():
+            if k in self.static_keys:
+                chunked[k] = [v] * num_chunks
+
+        batched_list = [
+            {k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)
+        ]
+        return batched_list
+
+
+class InContextLearningQATaskDataset(InContextLearningDataset):
+    """A dataset that constructs batches for in-context learning question
+    answering evaluation. QA tasks evaluate a model's ability to answer
+    questions using a consistent format.
 
     The input format is expected to be a jsonl file with the following fields:
     - context: The question
@@ -211,13 +717,23 @@ def __init__(self,
                  do_normalization: bool = True,
                  *args,
                  **kwargs):
+        warnings.warn(
+            ('InContextLearningQATaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningQATaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         if kwargs['tokenizer'].eos_token_id is None:
-            raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`')
+            raise ValueError(
+                '`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`'
+            )
         self.cot_delimiter = cot_delimiter
         self.has_cot = False
         self.max_answer_length = 0
         static_keys = [
-            'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria'
+            'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs',
+            'do_normalization', 'stopping_criteria'
         ]
         tensor_keys = ['input_ids', 'attention_mask']
         list_keys = ['labels']
@@ -248,7 +764,8 @@ def __init__(self,
             'input_ids': self.context_key,
             'labels': 'aliases',
         }
-        self.update_generation_kwargs(kwargs.get('generation_kwargs', {}))
+        if 'generation_kwargs' in kwargs:
+            self.update_generation_kwargs(kwargs['generation_kwargs'])
 
     def read_dataset(
         self,
@@ -257,14 +774,19 @@ def read_dataset(
         hf_loading_vars: Dict,
         hf_parsing_map: Dict,
     ) -> 'HFDataset':
-        dataset = super().read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map)
+        dataset = super().read_dataset(dataset_uri, destination_path,
+                                       hf_loading_vars, hf_parsing_map)
         self.has_cot = 'chain_of_thought' in dataset.features
         dataset = dataset.map(
             lambda examples: {
-                'context': examples['context'],
-                'answer': examples['answer'],
-                'aliases': set([examples['answer']] + examples.get('aliases', [])),
-                'chain_of_thought': examples.get('chain_of_thought', ''),
+                'context':
+                    examples['context'],
+                'answer':
+                    examples['answer'],
+                'aliases':
+                    set([examples['answer']] + examples.get('aliases', [])),
+                'chain_of_thought':
+                    examples.get('chain_of_thought', ''),
             })
         self.max_answer_length = self._get_max_answer_length(dataset)
         # NOTE: This is the only time we use the class variable padding_size.
@@ -285,7 +807,8 @@ def get_answer_from_example(self, example: Dict, in_context=False) -> str:
         else:
             return example[self.answer_key]
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
         """
         Run text through the tokenizer and handle special cases.
         Args:
@@ -296,7 +819,8 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
         Returns:
             Dict: Dictionary with the tokenized data
         """
-        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt,
+                                                     example)
         tokenized_example['aliases'] = list(example.get('aliases', []))
         return tokenized_example
 
@@ -309,16 +833,21 @@ def _get_max_answer_length(self, dataset) -> int:
         """
         max_answer_length = 0
         for example in dataset:
-            all_answers = [example[self.answer_key]] + list(example.get('aliases', []))
+            all_answers = [example[self.answer_key]] + list(
+                example.get('aliases', []))
             for answer in all_answers:
                 if self.has_cot:
-                    response = (f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}')
+                    response = (
+                        f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}'
+                    )
                 else:
                     response = answer
                 tokenized_repsonse = self.tokenizer(response)['input_ids']
                 assert isinstance(tokenized_repsonse, list)
-                max_answer_length = max(max_answer_length, len(tokenized_repsonse))
-        max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0)
+                max_answer_length = max(max_answer_length,
+                                        len(tokenized_repsonse))
+        max_answer_length = max_answer_length + (
+            _MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0)
         return max_answer_length
 
     def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
@@ -327,18 +856,20 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         stopping_criteria = None
         if self.early_stopping_criteria:
             if stop_sequences_criteria is None:  # pyright: ignore [reportUnnecessaryComparison]
-                raise MissingConditionalImportError(extra_deps_group='nlp',
-                                                    conda_package='transformers',
-                                                    conda_channel='conda-forge')
-            stopping_criteria = stop_sequences_criteria(self.tokenizer, self.early_stopping_criteria, batch_size)
+                raise MissingConditionalImportError(
+                    extra_deps_group='nlp',
+                    conda_package='transformers',
+                    conda_channel='conda-forge')
+            stopping_criteria = stop_sequences_criteria(
+                self.tokenizer, self.early_stopping_criteria, batch_size)
         batch['generation_kwargs']['stopping_criteria'] = stopping_criteria
         return batch
 
 
 class InContextLearningLMTaskDataset(InContextLearningDataset):
-    """
-    A dataset that constructs batches for in-context learning language modeling evaluation.
-    Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens.
+    """A dataset that constructs batches for in-context learning language
+    modeling evaluation. Language modeling tasks test a model's ability to
+    properly predict tokens based on preceding tokens.
 
     The input format is expected to be a jsonl file with the following fields:
     - context: Preceding text
@@ -348,9 +879,19 @@ class InContextLearningLMTaskDataset(InContextLearningDataset):
     """
 
     def __init__(self, *args, **kwargs):
+        warnings.warn(
+            ('InContextLearningLMTaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningLMTaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         super().__init__(answer_key='continuation',
                          static_keys=['mode'],
-                         tensor_keys=['input_ids', 'continuation_indices', 'labels', 'attention_mask'],
+                         tensor_keys=[
+                             'input_ids', 'continuation_indices', 'labels',
+                             'attention_mask'
+                         ],
                          base_batch={
                              'input_ids': [],
                              'continuation_indices': [],
@@ -367,8 +908,8 @@ def __init__(self, *args, **kwargs):
 
 
 class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset):
-    """
-    A dataset that construct batches for in-context learning multiple choice evaluation.
+    """A dataset that construct batches for in-context learning multiple choice
+    evaluation.
 
     If each question has N answer choices, we construct N distinct inputs per question. In order to ensure
     consistency across multi-GPU, we set the batch size to be `min(N, batch_size)` so that all N
@@ -399,6 +940,13 @@ def __init__(self,
                  list_of_primitives: Optional[List] = None,
                  *args,
                  **kwargs):
+        warnings.warn(
+            ('InContextLearningMultipleChoiceTaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningMultipleChoiceTaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         self.choices_key = choices_key
         base_batch = {
             'input_ids': [],
@@ -410,8 +958,11 @@ def __init__(self,
         }
         context_key = kwargs.pop('context_key', 'query')
         static_keys = kwargs.pop('static_keys', ['mode', 'generation_kwargs'])
-        tensor_keys = kwargs.pop('tensor_keys', ['input_ids', 'labels', 'attention_mask'])
-        self.list_of_tensors_keys = list_of_tensors_keys or ['continuation_indices']
+        tensor_keys = kwargs.pop('tensor_keys',
+                                 ['input_ids', 'labels', 'attention_mask'])
+        self.list_of_tensors_keys = list_of_tensors_keys or [
+            'continuation_indices'
+        ]
         self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings']
         self.list_of_primitives = list_of_primitives or ['gold_indices']
         super().__init__(context_key=context_key,
@@ -422,7 +973,10 @@ def __init__(self,
                          *args,
                          **kwargs)
         self.num_choices = len(self.dataset[0][self.choices_key])
-        self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'}
+        self.batch_mapping_per_choice = {
+            'input_ids': 'context',
+            'labels': 'context'
+        }
         self.batch_map_per_example = {'gold_indices': 'gold'}
 
     def get_answer_from_example(self, example: Dict, in_context=False) -> str:
@@ -438,7 +992,8 @@ def get_answer_from_example(self, example: Dict, in_context=False) -> str:
         gold_idx = example['gold']
         return choices[gold_idx]
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
         """
         Runs text through the tokenizer and handle special cases.
         Args:
@@ -459,7 +1014,8 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
             # rstrip context because a prompt ending in a space results in degenerate output
             ctxt = ctxt.rstrip()
         # Never add special tokens to context
-        tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids']
+        tokenized_context = self.tokenizer(
+            ctxt, add_special_tokens=False)['input_ids']
         assert isinstance(tokenized_context, list)
         tokenized_context = preamble + tokenized_context
 
@@ -472,12 +1028,15 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
                 choice = f' {choice}' if not choice.startswith(' ') else choice
 
             # Never add special tokens to answer
-            tokenized_answer = self.tokenizer(choice, add_special_tokens=False)['input_ids']
+            tokenized_answer = self.tokenizer(
+                choice, add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_context, list)
             assert isinstance(tokenized_answer, list)
-            trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size)
+            trimmed_context = _trim_context(tokenized_context, tokenized_answer,
+                                            self.padding_size)
             assert isinstance(trimmed_context, list)
-            continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer)
+            continuation_indices = _get_continuation_span(
+                trimmed_context, tokenized_answer)
             padded_context = _make_padded_input(
                 trimmed_context,
                 tokenized_answer,
@@ -488,16 +1047,17 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
 
             tokenized_example[self.context_key].append(padded_context)
             tokenized_example[self.answer_key].append(tokenized_answer)
-            tokenized_example['continuation_indices'].append(continuation_indices)
+            tokenized_example['continuation_indices'].append(
+                continuation_indices)
 
         tokenized_example['gold'] = example['gold']
         return tokenized_example
 
     def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """
-        The function that the dataloader uses to accumulate data into batches.
-        We run each distinct query + answer choice through the model separately and determine which
-        answer has the lowest per-token-perplexity.
+        """The function that the dataloader uses to accumulate data into
+        batches. We run each distinct query + answer choice through the model
+        separately and determine which answer has the lowest per-token-
+        perplexity.
 
         If each question has N possible choices, all N must be grouped together as distinct elements of the batch
         since the batch may consist of multiple questions, the choice_groupings indicates
@@ -515,7 +1075,8 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
             # NOTE: not using batch_mapping
             for i, context_enc in enumerate(data_pair[self.context_key]):
                 batch['input_ids'].append(context_enc)
-                batch['continuation_indices'].append(data_pair['continuation_indices'][i])
+                batch['continuation_indices'].append(
+                    data_pair['continuation_indices'][i])
                 batch['labels'].append(context_enc)
 
             batch['gold_indices'].append(data_pair['gold'])
@@ -529,9 +1090,10 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
     def get_num_samples_in_batch(self, batch) -> int:
         return batch['input_ids'].shape[0] // self.num_choices
 
-    def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
-        """
-        Split batch while ensuring all continuations are in the same microbatch.
+    def split_batch(self, batch: Any,
+                    microbatch_size: int) -> List[Dict[str, Any]]:
+        """Split batch while ensuring all continuations are in the same
+        microbatch.
 
         In ICL Multiple Choice, we duplicate each data point for each possible continuation.
         When splitting a batch, we have logical example, which refer to one possible question,
@@ -553,7 +1115,8 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
             elif type(v) == list:
                 # list of tensors - 'continuation_indices'
                 if k in self.list_of_tensors_keys:
-                    chunked[k] = _split_list(v, microbatch_size * self.num_choices)
+                    chunked[k] = _split_list(v,
+                                             microbatch_size * self.num_choices)
                 # list of tuples - 'choice_groupings'
                 elif k in self.list_of_tuples_keys:
                     chunked[k] = _split_list(v, microbatch_size)
@@ -563,7 +1126,8 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
                 else:
                     raise ValueError(f'Unexpected key {k} in list splitting')
             elif k in self.tensor_keys:
-                chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices)
+                chunked[k] = _default_split_batch(
+                    v, microbatch_size * self.num_choices)
             else:
                 raise ValueError(f'Unexpected key {k} in batch splitting')
         num_chunks = len(chunked['input_ids'])
@@ -572,14 +1136,19 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
             if k in self.static_keys:
                 chunked[k] = [v] * num_chunks
 
-        return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)]
+        return [
+            {k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)
+        ]
 
 
-class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset):
-    """A dataset that constructs batches for in-context learning schema evaluation.
-    A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word
-    to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847
-    to determine the model's choice of fill-in word.
+class InContextLearningSchemaTaskDataset(
+        InContextLearningMultipleChoiceTaskDataset):
+    """A dataset that constructs batches for in-context learning schema
+    evaluation. A schema task involves sentences with a fill-in-the-blank where
+    the user needs to choose the correct word to fill in from a set of N
+    options. We use the partial evaluation technique from
+    https://arxiv.org/abs/1806.02847 to determine the model's choice of fill-in
+    word.
 
     The default input format is a jsonl file with the following fields:
     - context_options: List of strings corresponding to possible preceding context options for the continuation
@@ -593,13 +1162,19 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData
     - labels: Identical to the input, used by the model to calculate loss/metrics
     - gold_indices: List of length ``batch_size // N`` indicating for each question, which of the answers is correct (via an integer [0, N-1])
     - choice_groupings: Indicates which indices of the batch correspond to which questions
-
     """
 
     def __init__(self, choices_key='context_options', *args, **kwargs):
         static_keys = ['mode']
         tensor_keys = ['input_ids', 'labels', 'attention_mask']
         list_of_tensors_keys = ['continuation_indices']
+        warnings.warn(
+            ('InContextLearningSchemaTaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningSchemaTaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         super().__init__(choices_key=choices_key,
                          context_key=choices_key,
                          static_keys=static_keys,
@@ -616,9 +1191,12 @@ def __init__(self, choices_key='context_options', *args, **kwargs):
             'choice_groupings': [],
         }
 
-    def construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str:
-        """
-        Takes a example and constructs a context with the correct context for the example's continuation.
+    def construct_context(self,
+                          example,
+                          preceding_text: str = '',
+                          add_answer: bool = False) -> str:
+        """Takes a example and constructs a context with the correct context for
+        the example's continuation.
 
         Args:
             example (Dict): The example from which to construct the context
@@ -637,10 +1215,11 @@ def construct_context(self, example, preceding_text: str = '', add_answer: bool
         context = f'{context}{self.continuation_delimiter}{continuation}'
         return context
 
-    def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') -> List[str]:
-        """
-        Takes a example and constructs all contexts. Optionally, appends this to preceeding text (such as a
-        prompt or fewshot examples).
+    def _construct_multiple_contexts(self,
+                                     example: Dict,
+                                     preceding_text: str = '') -> List[str]:
+        """Takes a example and constructs all contexts. Optionally, appends this
+        to preceeding text (such as a prompt or fewshot examples).
 
         Args:
             example (Dict): The example from which to construct the context
@@ -655,7 +1234,10 @@ def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '')
                 cont_del = self.continuation_delimiter.rstrip()
             else:
                 cont_del = self.continuation_delimiter
-            context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options]
+            context_options = [
+                f'{self.example_delimiter}{c}{cont_del}'
+                for c in context_options
+            ]
         return context_options
 
     def _prep_example(
@@ -666,8 +1248,8 @@ def _prep_example(
         prompt_string: str,
         fewshot_rng: random.Random,
     ) -> Dict[str, Any]:
-        """
-        Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples.
+        """Prepares a single example from a HF Dataset into tokenized format
+        with prompt and fewshot examples.
 
         Each task consists of multiple contexts and a single, correct continuation. Will preprend fewshot examples and
         prompt if present.
@@ -682,14 +1264,17 @@ def _prep_example(
         Returns:
             Dict: Contains a dictionary with the tokenized data
         """
-        prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng)
+        prompt_and_fewshot = self._generate_few_shot_prompt(
+            num_fewshot, example_idx, prompt_string, fewshot_rng)
         ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot)
-        tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt,
+                                                  example)
         return tokenized_example
 
-    def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]:
-        """
-        Runs text through the tokenizer and handle special cases.
+    def tokenize_example(self, prompt_and_fewshot: str,
+                         context_options: List[str],
+                         example: Dict) -> Dict[str, Any]:
+        """Runs text through the tokenizer and handle special cases.
 
         Args:
             prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
@@ -704,14 +1289,18 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str],
         assert isinstance(preamble, list)
         preamble = self._fix_eos_on_preamble(preamble)
         encoded_contexts = [
-            preamble +  # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues]
-            self.tokenizer(c, add_special_tokens=False)['input_ids']  # pyright: ignore[reportOperatorIssue, ]
+            preamble
+            +  # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues]
+            self.tokenizer(c, add_special_tokens=False)[
+                'input_ids']  # pyright: ignore[reportOperatorIssue, ]
             for c in context_options
         ]
         continuation = example['continuation']
         if self.prefix_space:
-            continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation)
-        tokenized_continuation = self.tokenizer(continuation, add_special_tokens=False)['input_ids']
+            continuation = (f' {continuation}' if
+                            not continuation.startswith(' ') else continuation)
+        tokenized_continuation = self.tokenizer(
+            continuation, add_special_tokens=False)['input_ids']
 
         tokenized_example[self.context_key] = []
         tokenized_example['continuation_indices'] = []
@@ -719,13 +1308,19 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str],
         for context in encoded_contexts:
             assert isinstance(context, list)
             assert isinstance(tokenized_continuation, list)
-            trimmed_context = _trim_context(context, tokenized_continuation, self.padding_size)
+            trimmed_context = _trim_context(context, tokenized_continuation,
+                                            self.padding_size)
             assert isinstance(trimmed_context, list)
-            continuation_indices = _get_continuation_span(trimmed_context, tokenized_continuation)
-            padded_context = _make_padded_input(trimmed_context, tokenized_continuation, self.padding_size,
-                                                self.pad_tok_id, self.padding_side)
+            continuation_indices = _get_continuation_span(
+                trimmed_context, tokenized_continuation)
+            padded_context = _make_padded_input(trimmed_context,
+                                                tokenized_continuation,
+                                                self.padding_size,
+                                                self.pad_tok_id,
+                                                self.padding_side)
             tokenized_example[self.context_key].append(padded_context)
-            tokenized_example['continuation_indices'].append(continuation_indices)
+            tokenized_example['continuation_indices'].append(
+                continuation_indices)
             tokenized_example[self.answer_key].append(tokenized_continuation)
 
         tokenized_example['gold'] = example['gold']
@@ -733,8 +1328,8 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str],
 
 
 class InContextLearningCodeEvalDataset(InContextLearningDataset):
-    """
-    A dataset that constructs batches for in-context learning code evaluation.
+    """A dataset that constructs batches for in-context learning code
+    evaluation.
 
     The input format is expected to be a jsonl file with the following fields:
 
@@ -782,6 +1377,13 @@ def __init__(
         *args,
         **kwargs,
     ):
+        warnings.warn(
+            ('InContextLearningCodeEvalDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningCodeEvalDataset.'
+            ),
+            DeprecationWarning,
+        )
         if generations_per_sample < pass_at_k:
             raise ValueError(
                 f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.'
@@ -799,8 +1401,13 @@ def __init__(
         # Linting complains if these are not set in init
         self.max_prompt_length = 0
         self.max_answer_length = 0
-        static_keys = ['mode', 'pass_at_k', 'generation_length', 'generation_kwargs']
-        list_keys = ['prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'languages', 'labels']
+        static_keys = [
+            'mode', 'pass_at_k', 'generation_length', 'generation_kwargs'
+        ]
+        list_keys = [
+            'prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs',
+            'languages', 'labels'
+        ]
         tensor_keys = ['input_ids', 'attention_mask']
         super().__init__(
             context_key='prompt',
@@ -819,7 +1426,8 @@ def __init__(
         self.dataset = self.dataset.map(self._trim_padding)
         self.base_batch = {
             'input_ids': [],
-            'mode': 'generate',
+            'mode':
+                'generate',
             'labels': [],
             'prompts': [],
             'tests': [],
@@ -827,8 +1435,11 @@ def __init__(
             'test_inputs': [],
             'test_outputs': [],
             'languages': [],
-            'pass_at_k': pass_at_k,
-            'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length),
+            'pass_at_k':
+                pass_at_k,
+            'generation_length':
+                min(self.max_answer_length,
+                    self.max_seq_len - self.max_prompt_length),
             'generation_kwargs': {
                 'pad_token_id': self.pad_tok_id,
                 'num_beams': 1,  # single beam
@@ -838,11 +1449,12 @@ def __init__(
                 'eos_token_id': self.tokenizer.eos_token_id
             }
         }
-        self.update_generation_kwargs(kwargs.get('generation_kwargs', {}))
+        if 'generation_kwargs' in kwargs:
+            self.update_generation_kwargs(kwargs['generation_kwargs'])
 
     def _set_max_prompt_and_answer_lengths(self):
-        """
-        Iterates through the dataset and finds the maximum prompt length and sequence lengths
+        """Iterates through the dataset and finds the maximum prompt length and
+        sequence lengths.
 
         Returns:
             None
@@ -851,10 +1463,15 @@ def _set_max_prompt_and_answer_lengths(self):
         max_answer_length = 0
         for example in self.dataset:
             assert isinstance(example, Dict)
-            unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id]
+            unpadded_example = [
+                token for token in example[self.context_key]
+                if token != self.pad_tok_id
+            ]
             max_prompt_length = max(max_prompt_length, len(unpadded_example))
 
-            tokenized_answer = self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids']
+            tokenized_answer = self.tokenizer(
+                example['canonical_solution'],
+                add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_answer, list)
             len_tokenized_answer = len(tokenized_answer)
             max_answer_length = max(max_answer_length, len_tokenized_answer)
@@ -863,29 +1480,35 @@ def _set_max_prompt_and_answer_lengths(self):
         self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH
 
     def _trim_padding(self, example: Dict):
-        """
-        Adjusts padding to the maximum prompt length rather than max_seq_len.
-        Needs to be done after the dataset has been processed because we don't know the maximum
-        prompt length until after we've tokenized it.
+        """Adjusts padding to the maximum prompt length rather than max_seq_len.
+        Needs to be done after the dataset has been processed because we don't
+        know the maximum prompt length until after we've tokenized it.
 
         Returns:
             dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key]
         """
         # Remove padding tokens applied during tokenization
-        unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id]
+        unpadded_prompt = [
+            token for token in example[self.context_key]
+            if token != self.pad_tok_id
+        ]
         # Reapply padding only to max_prompt_length
         full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length)
-        padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side)
+        padded_context = _make_padded_input(full_prompt, [],
+                                            self.max_prompt_length,
+                                            self.pad_tok_id, self.padding_side)
 
         example[self.context_key] = padded_context
         return example
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
-        """
-        Adds extra code task details to the example dictionary.
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
+        """Adds extra code task details to the example dictionary.
+
         See InContextLearningDataset for more details
         """
-        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt,
+                                                     example)
         tokenized_example['prompt_text'] = example['prompt']
         tokenized_example['task_id'] = example['task_id']
         tokenized_example['canonical_solution'] = example['canonical_solution']
@@ -919,9 +1542,9 @@ def build_icl_dataloader(
         generation_kwargs: Dict,
         early_stopping_criteria: Optional[List[str]] = None,
         do_normalization: bool = True) -> DataSpec:
-    """
-    Factory method that builds the specific dataset for the specified icl_task_type.
-    See documentation for `get_icl_task_dataloader` for arugment documentation.
+    """Factory method that builds the specific dataset for the specified
+    icl_task_type. See documentation for `get_icl_task_dataloader` for arugment
+    documentation.
 
     When writing a dataset for a new task, here you will need to:
         1. add the dataset to the factory and choose an appropriate string
@@ -929,6 +1552,13 @@ def build_icl_dataloader(
             this might be different)
         3. set the `split_batch` funciton if necessary
     """
+    warnings.warn(
+        ('build_icl_dataloader is deprecated and will be removed in a future '
+         'release. Its functionality has been reimplemented '
+         'in llmfoundry.eval.datasets.in_context_learning_evaluation.build_icl_dataloader.'
+        ),
+        DeprecationWarning,
+    )
     if icl_task_type == 'multiple_choice':
         dataset = InContextLearningMultipleChoiceTaskDataset(
             dataset_uri=dataset_uri,
@@ -1052,10 +1682,12 @@ def build_icl_dataloader(
     )
 
 
-def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: Dict,
+def partition_dataset_by_category(dataset_uri: str, destination_path: str,
+                                  hf_loading_vars: Dict,
                                   hf_parsing_map: Dict) -> Dict[str, str]:
-    """
-    If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file.
+    """If has_categories is enabled, we partition the dataset into a separate
+    dataset for each category value in the data and write each partition to a
+    local file.
 
     Args:
         dataset_uri (str): Location of dataset.
@@ -1068,8 +1700,10 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
         Dict[str, str]: Mapping of category names to partitioned dataset local files names.
     """
     try:
-        from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
-        from datasets import IterableDataset, load_dataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import \
+            Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import (  # pyright: ignore[reportGeneralTypeIssues]
+            IterableDataset, load_dataset)
     except ImportError as e:
         raise MissingConditionalImportError(
             extra_deps_group='nlp',
@@ -1079,26 +1713,35 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
     if dataset_uri.startswith('hf://'):
         dataset_uri = dataset_uri.replace('hf://', '')
         dataset = load_dataset(dataset_uri, **hf_loading_vars)
-        assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset)
+        assert isinstance(dataset, HFDataset) or isinstance(
+            dataset, IterableDataset)
         if hf_parsing_map:
             dataset_parsing_func = lambda example: {
-                k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items()
+                k: ' '.join([str(example[col]) for col in v])
+                for k, v in hf_parsing_map.items()
             }
             assert hasattr(dataset, 'column_names')
-            dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names)
+            dataset = dataset.map(dataset_parsing_func,
+                                  remove_columns=dataset.column_names)
     else:
         with dist.local_rank_zero_download_and_wait(destination_path):
             if dist.get_local_rank() == 0:
                 get_file(dataset_uri, destination_path, overwrite=True)
-        dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False)
-    assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset)
+        dataset = load_dataset('json',
+                               data_files=destination_path,
+                               split='train',
+                               streaming=False)
+    assert isinstance(dataset, HFDataset) or isinstance(dataset,
+                                                        IterableDataset)
     assert hasattr(dataset, 'features')
     assert dataset.features is not None
     if 'category' not in dataset.features.keys():
         raise Exception(f"""Attempted to partition dataset by `category` \
             but it doesn't have a `category` key. \
             Got keys: {str(list(dataset.features.keys()))}""")
-    categories = sorted(set(dataset['category']))  # pyright: ignore[reportIndexIssue, reportGeneralTypeIssues]
+    categories = sorted(
+        set(dataset['category']
+           ))  # pyright: ignore[reportIndexIssue, reportGeneralTypeIssues]
     output_files = {}
     for cat in categories:
         path = destination_path.split('/')
@@ -1107,7 +1750,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
         gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
         if dist.get_local_rank() == 0:
             subset = [
-                l for l in dataset if l['category'] == cat  # pyright: ignore[reportGeneralTypeIssues]
+                l for l in dataset if
+                l['category'] == cat  # pyright: ignore[reportGeneralTypeIssues]
             ]  # pyright: ignore[reportArgumentType, reportCallIssue]
             with open(gathered_paths[0], 'w', encoding='utf8') as f:
                 for l in subset:
@@ -1119,7 +1763,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
 def get_icl_task_dataloader(
         icl_task_type: str,
         dataset_uri: str,
-        tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast],
+        tokenizer: Union[transformers.PreTrainedTokenizer,
+                         transformers.PreTrainedTokenizerFast],
         batch_size: int,
         max_seq_len: int,
         pad_tok_id: int,
@@ -1139,7 +1784,9 @@ def get_icl_task_dataloader(
         generation_kwargs: Optional[Dict] = None,
         early_stopping_criteria: Optional[List[str]] = None,
         do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]:
-    """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below:
+    """This constructs a dataloader (or dataloaders if has_categories is True)
+    capable of evaluating LLMs on in-context learning language modeling tasks,
+    for example LAMBADA. An example usage is below:
 
         .. testsetup::
 
@@ -1217,6 +1864,13 @@ def get_icl_task_dataloader(
     Returns:
         DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
     """
+    warnings.warn(
+        ('get_icl_task_dataloader is deprecated and will be removed in a future '
+         'release. Its functionality has been reimplemented '
+         'in llmfoundry.eval.datasets.in_context_learning_evaluation.get_icl_task_dataloader.'
+        ),
+        DeprecationWarning,
+    )
     if hf_loading_vars is None:
         hf_loading_vars = {}
     if hf_parsing_map is None:
@@ -1228,7 +1882,10 @@ def get_icl_task_dataloader(
 
     if has_categories:
         result_dls = {}
-        output_files = partition_dataset_by_category(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map)
+        output_files = partition_dataset_by_category(dataset_uri,
+                                                     destination_path,
+                                                     hf_loading_vars,
+                                                     hf_parsing_map)
         categories = sorted(output_files.keys())
         for category in categories:
             partition_uri = output_files[category]
@@ -1279,4 +1936,4 @@ def get_icl_task_dataloader(
             generation_kwargs=generation_kwargs,
             early_stopping_criteria=early_stopping_criteria,
             do_normalization=do_normalization,
-        )
\ No newline at end of file
+        )
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index bef7d2f3c0..065c90306f 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
@@ -12,29 +15,49 @@
 
 import numpy as np
 import torch
+from composer.metrics.nlp import InContextLearningMetric
+from composer.utils.eval_client import (EvalClient, LambdaEvalClient,
+                                        LocalEvalClient,
+                                        MosaicMLLambdaEvalClient)
 from torch import Tensor
 from torch.nn import functional as F
-from composer.metrics.nlp import InContextLearningMetric
-from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient, MosaicMLLambdaEvalClient
 
 log = logging.getLogger(__name__)
 
 __all__ = [
+    'InContextLearningMetric',
     'InContextLearningLMAccuracy',
     'InContextLearningMultipleChoiceAccuracy',
     'InContextLearningQAAccuracy',
     'InContextLearningCodeEvalAccuracy',
-    'BinaryF1Score',
-    'LanguageCrossEntropy',
-    'MaskedAccuracy',
-    'LanguagePerplexity',
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMCExpectedCalibrationError',
 ]
 
 
+class InContextLearningMetric(
+        InContextLearningMetric
+):  # TODO: this is a temporary solution until Max deprecates composer's superclass entirely
+
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        """Abstract interface for computing an in-context learning metrics.
+
+        Args:
+            batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed
+                to compute the metric.
+            output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids`
+            labels (torch.Tensor): The correct outputs.
+
+        Raises:
+            NotImplementedError: Abstract method must be implemented by subclasses
+        """
+        raise NotImplementedError
+
+
 class InContextLearningQAAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) question answering (QA) tasks.
+    r"""Computes accuracy for In-context learning (ICL) question answering (QA)
+    tasks.
 
     ICL QA tasks consist of some number of example question answering tasks (referred to as the 'context'), followed by a test task where the model must
     match one of the possible answer aliases (referred to as the 'continuation').
@@ -60,8 +83,16 @@ class InContextLearningQAAccuracy(InContextLearningMetric):
 
     def __init__(self, dist_sync_on_step: bool = False):
         # state from multiple processes
+        warnings.warn(
+            ('InContextLearningQAAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningQAAccuracy.'),
+            DeprecationWarning,
+        )
         super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('correct',
+                       default=torch.tensor(0.),
+                       dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
     def normalize_answer(self, answer: str):
@@ -77,7 +108,8 @@ def white_space_fix(text: str) -> str:
             return ' '.join(text.split())
 
         def handle_punc(text: str) -> str:
-            exclude = set(string.punctuation + ''.join([u'‘', u'’', u'´', u'`']))
+            exclude = set(string.punctuation +
+                          ''.join([u'‘', u'’', u'´', u'`']))
             return ''.join(ch if ch not in exclude else ' ' for ch in text)
 
         def lower(text: str) -> str:
@@ -86,9 +118,18 @@ def lower(text: str) -> str:
         def replace_underscore(text: str) -> str:
             return text.replace('_', ' ')
 
-        return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(answer))))).strip()
-
-    def update(self, batch: Optional[Dict[str, Any]], outputs: List[str], labels: List[List[str]]):
+        return white_space_fix(
+            remove_articles(handle_punc(lower(
+                replace_underscore(answer))))).strip()
+
+    def update(
+        self,
+        batch: Optional[Dict[str, Any]],
+        outputs: List[str],
+        labels: List[List[str]],
+    ):
+        if batch is None:
+            batch = {}
         cot_delimiter = batch.get('cot_delimiter', '')
         do_normalization = batch.get('do_normalization', True)
         stopping_criteria = batch.get('stopping_criteria', None)
@@ -96,19 +137,24 @@ def update(self, batch: Optional[Dict[str, Any]], outputs: List[str], labels: Li
             final_answer = sample_output
 
             if stopping_criteria is not None and len(stopping_criteria) > 0:
-                final_answer = re.split('|'.join(stopping_criteria), final_answer)[0]
+                final_answer = re.split('|'.join(stopping_criteria),
+                                        final_answer)[0]
 
             if cot_delimiter is not None and len(cot_delimiter) > 0:
                 final_answer = final_answer.split(cot_delimiter)[-1]
 
             if do_normalization:
                 cleaned_final_answer = self.normalize_answer(final_answer)
-                cleaned_sample_labels = {self.normalize_answer(label) for label in sample_labels}
+                cleaned_sample_labels = {
+                    self.normalize_answer(label) for label in sample_labels
+                }
             else:
                 cleaned_final_answer = final_answer
                 cleaned_sample_labels = set(sample_labels)
 
-            if any(cleaned_final_answer.startswith(label) for label in cleaned_sample_labels):
+            if any(
+                    cleaned_final_answer.startswith(label)
+                    for label in cleaned_sample_labels):
                 self.correct += torch.tensor(1.0)
             self.total += torch.tensor(1.0)
 
@@ -119,7 +165,8 @@ def compute(self):
 
 
 class InContextLearningLMAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) language modeling (LM) tasks.
+    r"""Computes accuracy for In-context learning (ICL) language modeling (LM)
+    tasks.
 
     ICL LM tasks consist of some number of example language modeling tasks (referred to as the 'context'), followed by a test task where the model must correctly predict all the tokens
     following tokens in some passage (referred to as the 'continuation').
@@ -143,15 +190,26 @@ class InContextLearningLMAccuracy(InContextLearningMetric):
     full_state_update = False
 
     def __init__(self, dist_sync_on_step: bool = False):
+        warnings.warn(
+            ('InContextLearningLMAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningLMAccuracy.'),
+            DeprecationWarning,
+        )
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('correct',
+                       default=torch.tensor(0.),
+                       dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
-    def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor):
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1)
-            cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1)
+            cont_tok_pred = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1).argmax(dim=-1)
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
 
             self.correct += (cont_tok_pred == cont_tok_targ).all().int()
             self.total += torch.tensor(1.0)
@@ -162,59 +220,6 @@ def compute(self):
         return self.correct / self.total
 
 
-class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) multiple choice (MC) tasks.
-
-    ICL MC tasks consists of a series of questions with some number of possible choices (only one of which can be correct).
-    At inference time each possible choice is given to the model as a separate input and the one for which the model assigns
-    the lowest perplexity to the choice is considered the model's choice. The model is correct if it "chooses" the right answer.
-
-    Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->`
-    Continuation: `green`
-
-    Adds metric state variables:
-        correct (float): The number of instances where the prediction masked the target.
-        total (float): The number of total instances that were predicted.
-
-    Args:
-        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
-            each forward() before returning the value at the step. Default: ``False``.
-    """
-
-    # Make torchmetrics call update only once
-    full_state_update = False
-
-    def __init__(self, dist_sync_on_step: bool = False):
-        # state from multiple processes
-        super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.0), dist_reduce_fx='sum')
-        self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
-
-    def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor):
-        perplexities = []
-        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            # continuation indices refer to indices in the original input's token space
-            cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1)
-            # labels have been shifted left by one index, so the cont_idx needs to be shifted as well.
-            cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1)
-            cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ)
-            perplexity = torch.exp(cross_entropy)
-            perplexities.append(perplexity)
-
-        for (start, end), gold_idx in zip(batch['choice_groupings'], batch['gold_indices']):
-            subset = perplexities[start:end]
-            idx_min = subset.index(min(subset))
-
-            if idx_min == gold_idx:
-                self.correct += torch.tensor(1.0)
-            self.total += torch.tensor(1.0)
-
-    def compute(self):
-        assert isinstance(self.correct, Tensor)
-        assert isinstance(self.total, Tensor)
-        return self.correct.float() / self.total
-
-
 class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
 
@@ -239,9 +244,18 @@ class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     full_state_update = False
 
     def __init__(self, dist_sync_on_step: bool = False):
+        warnings.warn(
+            ('InContextLearningCodeEvalAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningCodeEvalAccuracy.'
+            ),
+            DeprecationWarning,
+        )
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('correct',
+                       default=torch.tensor(0.),
+                       dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
         self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
@@ -269,8 +283,9 @@ def get_client(self) -> EvalClient:
                 'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
                 'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
         else:
-            raise ValueError('Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
-                             f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
+            raise ValueError(
+                'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
+                f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
 
         return client
 
@@ -286,7 +301,8 @@ def estimator(self, n: int, c: int, k: int) -> float:
             return 1.0
         return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
 
-    def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
+    def update(self, batch: Dict[str, Any], outputs: List[str],
+               labels: List[str]):
         """Updates the pass@k accuracy of code generation.
 
         Given a batch of prompts, test cases, and code generations, evaluates the code generations
@@ -316,16 +332,20 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
         pass_at_k = batch['pass_at_k']
         num_generations = batch['generation_kwargs']['num_return_sequences']
         processed_outputs = [
-            outputs[i * num_generations:(i + 1) * num_generations] for i in range(len(batch['prompts']))
+            outputs[i * num_generations:(i + 1) * num_generations]
+            for i in range(len(batch['prompts']))
         ]
         payloads = []
         for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
-                processed_outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'], batch['entry_points'],
+                processed_outputs, batch['prompts'], batch['test_inputs'],
+                batch['test_outputs'], batch['entry_points'],
                 batch['languages']):
             self.total += torch.tensor(1.0)
             prompt_payload = []
             for code_gen in sample_outputs:
-                code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0]  # remove everything after function ends
+                code_gen = re.split(
+                    r'\n[A-Za-z0-9#`]',
+                    code_gen)[0]  # remove everything after function ends
                 final_code = sample_prompt + code_gen  # combine prompt with the code generation
                 generation_payload = []
                 for test_input, test_output in zip(test_inputs, test_outputs):
@@ -349,7 +369,8 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
                 if correct:
                     num_correct += 1
 
-            pass_at_k_rate = self.estimator(num_generations, num_correct, pass_at_k)
+            pass_at_k_rate = self.estimator(num_generations, num_correct,
+                                            pass_at_k)
             self.correct += torch.tensor(pass_at_k_rate)
 
         client.close()  # pyright: ignore [reportOptionalMemberAccess]
@@ -357,4 +378,214 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
-        return self.correct / self.total
\ No newline at end of file
+        return self.correct / self.total
+
+
+class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
+    r"""Computes accuracy for In-context learning (ICL) multiple choice (MC)
+    tasks.
+
+    ICL MC tasks consists of a series of questions with some number of possible choices (only one of which can be correct).
+    At inference time each possible choice is given to the model as a separate input and the one for which the model assigns
+    the lowest perplexity to the choice is considered the model's choice. The model is correct if it "chooses" the right answer.
+
+    Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->`
+    Continuation: `green`
+
+    Adds metric state variables:
+        correct (float): The number of instances where the prediction masked the target.
+        total (float): The number of total instances that were predicted.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(self, dist_sync_on_step: bool = False):
+        warnings.warn(
+            ('InContextLearningMultipleChoiceAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningMultipleChoiceAccuracy.'
+            ),
+            DeprecationWarning,
+        )
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state('correct',
+                       default=torch.tensor(0.0),
+                       dist_reduce_fx='sum')
+        self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
+
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        perplexities = []
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            # continuation indices refer to indices in the original input's token space
+            cont_tok_logits = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1)
+            # labels have been shifted left by one index, so the cont_idx needs to be shifted as well.
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ)
+            perplexity = torch.exp(cross_entropy)
+            perplexities.append(perplexity)
+
+        for (start, end), gold_idx in zip(batch['choice_groupings'],
+                                          batch['gold_indices']):
+            subset = perplexities[start:end]
+            idx_min = subset.index(min(subset))
+
+            if idx_min == gold_idx:
+                self.correct += torch.tensor(1.0)
+            self.total += torch.tensor(1.0)
+
+    def compute(self):
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct.float() / self.total
+
+
+class InContextLearningExpectedCalibrationError(InContextLearningMetric):
+    """Generic class for Expected Calibration Error (ECE) (cite:
+    https://arxiv.org/pdf/1706.04599.pdf).
+
+    Expected calibration error is calculated by dividing predictions into buckets based on the model's confidence (a probability value between 0 and 1).
+    We then calculate the accuracy within each bucket and calculate the average gap between confidence and accuracy
+    across buckets, weighted by the number of samples in each bucket.
+
+    Each task must implement its own definition of "confidence" to be computed via the `update` method.
+
+    Adds metric state variables:
+    bucket_totals (float): The number of instances where the prediction masked the target per bucket.
+    bucket_correct (float): The number of total instances that were predicted per bucket.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+        n_buckets (int): Number of distinct buckets to split the confidence distribution into
+    """
+
+    def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
+        warnings.warn(
+            ('InContextLearningExpectedCalibrationError is deprecated and will be removed in a future '
+             'release.'),
+            DeprecationWarning,
+        )
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.n_buckets = n_buckets
+        if n_buckets < 1:
+            raise Exception('`n_buckets`')
+        self.add_state('bucket_totals',
+                       default=torch.zeros(n_buckets),
+                       dist_reduce_fx='sum')
+        self.add_state('bucket_correct',
+                       default=torch.zeros(n_buckets),
+                       dist_reduce_fx='sum')
+
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        pass
+
+    def compute(self):
+        assert isinstance(self.bucket_correct, Tensor)
+        assert isinstance(self.bucket_totals, Tensor)
+
+        result = torch.tensor(0.0, device=self.bucket_correct.device)
+        total_obs = torch.sum(self.bucket_totals)
+        for i in range(self.n_buckets):
+            if self.bucket_totals[i] == 0:
+                continue
+
+            acc_bucket_i = self.bucket_correct[i] / self.bucket_totals[i]
+            upper_bound = (i + 1) / self.n_buckets
+            lower_bound = i / self.n_buckets
+            conf_bucket_i = torch.tensor((upper_bound + lower_bound) / 2,
+                                         device=self.bucket_correct.device)
+            result += (self.bucket_totals[i] /
+                       total_obs) * torch.abs(acc_bucket_i - conf_bucket_i)
+        return result
+
+
+class InContextLearningMCExpectedCalibrationError(
+        InContextLearningExpectedCalibrationError):
+    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+    multiple choice (MC) tasks. (source: https://arxiv.org/abs/2012.00955).
+
+    For MC tasks, the model confidence is defined as the softmax of average per-token probability assigned to the top question choice.
+
+    See `InContextLearningExpectedCalibrationError` for more info.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def update(self, batch: Dict[str, Any], output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        output_logits = torch.softmax(output_logits, dim=2)
+        probabilites = []
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            cont_tok_logits = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1)
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            probability = cont_tok_logits.index_select(
+                dim=1, index=cont_tok_targ).diagonal().mean()
+            probabilites.append(probability)
+
+        for (start, end), gold_idx in zip(batch['choice_groupings'],
+                                          batch['gold_indices']):
+            subset = probabilites[start:end]
+            idx_max = subset.index(max(subset))
+            confidence = torch.tensor(subset).max() / torch.tensor(subset).sum()
+
+            assert confidence >= 0.0 and confidence <= 1.0
+            bucket_idx = int(confidence * self.n_buckets)
+            if bucket_idx == self.n_buckets:
+                bucket_idx -= 1
+
+            if idx_max == gold_idx:
+                self.bucket_correct[
+                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+            self.bucket_totals[
+                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+
+class InContextLearningLMExpectedCalibrationError(
+        InContextLearningExpectedCalibrationError):
+    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+    language modeling (LM) tasks. (cite: https://arxiv.org/pdf/1706.04599.pdf).
+
+    For LM tasks, the model confidence is defined as the minimum probability assigned to all tokens in the continuation.
+
+    See `InContextLearningExpectedCalibrationError` for more info.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def update(self, batch: Dict[str, Any], output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        output_logits = torch.softmax(output_logits, dim=2)
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            cont_tok_logits = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1)
+            cont_tok_pred = cont_tok_logits.argmax(dim=-1)
+            confidence = cont_tok_logits.max(dim=-1).values.min()
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            assert confidence >= 0.0 and confidence <= 1.0
+            bucket_idx = int(confidence * self.n_buckets)
+            if bucket_idx == self.n_buckets:
+                bucket_idx -= 1
+
+            if (cont_tok_pred == cont_tok_targ).all():
+                self.bucket_correct[
+                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+            self.bucket_totals[
+                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index e51d7c55d0..3ff5e15eed 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -9,12 +9,6 @@
 from typing import TYPE_CHECKING, Any, Dict, Mapping
 
 # required for loading a python model into composer
-import transformers
-from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy,
-                                  InContextLearningLMAccuracy,
-                                  InContextLearningMultipleChoiceAccuracy,
-                                  InContextLearningQAAccuracy,
-                                  )
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models.huggingface import peft_installed
 from composer.utils import dist
@@ -22,6 +16,9 @@
 from transformers import (AutoConfig, AutoModelForCausalLM, PreTrainedModel,
                           PreTrainedTokenizerBase)
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
 from llmfoundry.models.layers.attention import is_flash_v2_installed
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index 30be41e022..19e4dc8e6e 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -6,16 +6,15 @@
 import torch
 from composer.core.types import Batch
 from composer.metrics import InContextLearningMetric
-from llmfoundry.eval.metrics.nlp import (InContextLearningLMAccuracy,
-                                  InContextLearningMultipleChoiceAccuracy,
-                                  InContextLearningQAAccuracy,
-                                  )
-
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models import ComposerModel
 from torchmetrics import Metric
 from transformers import AutoTokenizer
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy,
+    InContextLearningQAAccuracy)
+
 
 class InferenceAPIEvalWrapper(ComposerModel):
 
@@ -28,9 +27,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer):
             LanguagePerplexity(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningQAAccuracy(),
-            InContextLearningLMExpectedCalibrationError(),
-            InContextLearningMCExpectedCalibrationError()
+            InContextLearningQAAccuracy()
         ]
         self.eval_metrics = {
             metric.__class__.__name__: metric for metric in eval_metrics
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 2dbbfb506d..2e3d256cc5 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -16,14 +16,13 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy,
-                              InContextLearningLMAccuracy,
-                              InContextLearningMultipleChoiceAccuracy,
-                              InContextLearningQAAccuracy)
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models import HuggingFaceModel
 from composer.utils import dist
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 from llmfoundry.models.layers.attention import (is_flash_v1_installed,
                                                 is_flash_v2_installed)
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 6a5ab983bc..9b6e9d869a 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -15,8 +15,6 @@
                                 MemoryMonitor, OptimizerMonitor,
                                 RuntimeEstimator, SpeedMonitor)
 from composer.core import Algorithm, Callback, Evaluator
-from llmfoundry.eval.datasets.in_context_learning_evaluation import \
-    get_icl_task_dataloader
 from composer.loggers import (InMemoryLogger, LoggerDestination, MLFlowLogger,
                               TensorboardLogger, WandBLogger)
 from composer.optim import DecoupledAdamW
@@ -35,6 +33,8 @@
                                   LayerFreezing, MonolithicCheckpointSaver,
                                   ScheduledGarbageCollector)
 from llmfoundry.data.dataloader import build_dataloader
+from llmfoundry.eval.datasets.in_context_learning_evaluation import \
+    get_icl_task_dataloader
 from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
                               DecoupledLionW, DecoupledLionW_8bit)
 from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index d1b641ead6..a528a1b85f 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,22 +1,20 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: migrate_subclasses_to_foundry # v0.4.0
+  git_branch: migrate_subclasses_to_foundry  # v0.4.0
   # git_commit:  # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false  # Should be true if using a private repo
 
 command: |
   cd llm-foundry/scripts
-  pip uninstall mosaicml -y
-  pip install git+https://github.com/bmosaicml/composer.git@remove_subclasses_from_composer 
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: mpt-eval
 gpu_num: 8
 gpu_type: a100_80gb
-cluster: r1z1 # replace with your cluster here!
+cluster: r1z1  # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index ab762d55bc..ec7632bedd 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -1,34 +1,582 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
+import contextlib
 import os
+import random
 import types
 from pathlib import Path
 
 import pytest
 import torch
-from torch.utils.data import DataLoader
-
 from composer import Evaluator
 from composer.core import DataSpec
+from torch.utils.data import DataLoader
 
 # isort: off
 from llmfoundry.eval.datasets.in_context_learning_evaluation import (
-    InContextLearningCodeEvalDataset,
-    InContextLearningMultipleChoiceTaskDataset,
-    InContextLearningQATaskDataset,
-    InContextLearningSchemaTaskDataset,
-    get_icl_task_dataloader,
-)
+    InContextLearningDataset, InContextLearningCodeEvalDataset,
+    InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset,
+    InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,
+    _tokenizer_needs_prefix_space, _trim_context, _get_continuation_span,
+    _get_fewshot_sample_idxs, _make_padded_input)
 # isort: on
+from composer.datasets.utils import MultiTokenEOSCriteria
 from composer.loggers import InMemoryLogger
-from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-                              InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 from composer.models import HuggingFaceModel
 from composer.trainer import Trainer
 from composer.utils import dist, reproducibility
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+
+
+def test_strip_data():
+    data_to_strip = {
+        'strip_data': '  boo!  \n',
+        'has_space': '  wa hoo!',
+        'end_space': 'yoohoo!  '
+    }
+    stripped_data = strip_data(data_to_strip)
+    for k, v in stripped_data.items():
+        assert k in data_to_strip
+        assert not v[0].isspace()
+        assert not v[-1].isspace()
+
+
+@pytest.mark.skip(
+    reason="Currently don't have a tokenizer that satisfies this test")
+def test_tokenizer_needs_prefix_space_when_space_not_needed(
+        tiny_gpt2_tokenizer):
+    assert not _tokenizer_needs_prefix_space(tiny_gpt2_tokenizer)
+
+
+def test_tokenizer_needs_prefix_space_when_space_needed():
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m',
+        use_fast=False)  # type: ignore reportUnboundVariable
+    assert _tokenizer_needs_prefix_space(tokenizer)
+
+
+def test_trim_context():
+    context = [0] * 99 + [1] * 2037
+    continuation = [2] * 10
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context,
+                                    continuation,
+                                    max_seq_len=max_seq_len)
+    assert len(trimmed_context) == 2038
+    assert trimmed_context[0] == 0
+    assert trimmed_context[1] == 1
+
+
+def test_trim_context_no_continuation():
+    context = [0] * 2048
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+    assert len(trimmed_context) == 2048
+    context = [0] * 3000 + [1]
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+    assert len(trimmed_context) == 2048
+    assert trimmed_context[-1] == 1
+
+
+def test_get_continuation_span():
+    context = [0] * 200
+    continuation = [1] * 3
+    cont_span = _get_continuation_span(context, continuation)
+    assert torch.all(torch.eq(cont_span, torch.tensor([200, 201, 202])))
+    continuation = [1]
+    cont_span = _get_continuation_span(context, continuation)
+    assert torch.all(torch.eq(cont_span, torch.tensor([200])))
+
+
+@pytest.mark.parametrize('padding_side', ['left', 'right', 'middle'])
+def test_make_padding(tiny_gpt2_tokenizer, padding_side):
+    context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
+    padding_id = tiny_gpt2_tokenizer.eos_token_id
+
+    error_context = contextlib.nullcontext() if padding_side in {
+        'left', 'right'
+    } else pytest.raises(ValueError)
+
+    with error_context:
+        input_ids = _make_padded_input(context, [],
+                                       2048,
+                                       padding_id,
+                                       padding_side=padding_side)
+
+        if padding_side == 'left':
+            assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id
+            assert input_ids[48:].tolist() == context
+        elif padding_side == 'right':
+            assert input_ids[-1] == tiny_gpt2_tokenizer.eos_token_id
+            assert input_ids[:-48].tolist() == context
+
+
+def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer):
+    continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids']
+    context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, continuation, max_seq_len)
+    continuation_spans = _get_continuation_span(trimmed_context, continuation)
+    padded_input = _make_padded_input(trimmed_context,
+                                      continuation,
+                                      max_seq_len,
+                                      tiny_gpt2_tokenizer.pad_token_id,
+                                      padding_side='right')
+    assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047
+    assert len(padded_input) == 2048
+    assert tiny_gpt2_tokenizer.pad_token_id not in padded_input
+
+
+def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer):
+    continuation = tiny_gpt2_tokenizer(' dog' * 200)['input_ids']
+    context = tiny_gpt2_tokenizer(' cat' * 200)['input_ids']
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, continuation, max_seq_len)
+    continuation_spans = _get_continuation_span(trimmed_context, continuation)
+    padded_input = _make_padded_input(trimmed_context,
+                                      continuation,
+                                      max_seq_len,
+                                      tiny_gpt2_tokenizer.pad_token_id,
+                                      padding_side='right')
+    assert continuation_spans[0] == 200 and continuation_spans[-1] == 399
+    assert len(padded_input) == 2048
+    assert padded_input[-1] == tiny_gpt2_tokenizer.pad_token_id
+
+
+def test_fewshot_sample_idxs():
+    rng = random.Random(1234)
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+                                            num_fewshot=4,
+                                            example_idx=4,
+                                            rng=rng)
+    assert fewshot_idxs == {0, 1, 2, 3}
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+                                            num_fewshot=5,
+                                            example_idx=4,
+                                            rng=rng)
+    assert fewshot_idxs == {0, 1, 2, 3}
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+                                            num_fewshot=500,
+                                            example_idx=4,
+                                            rng=rng)
+    assert fewshot_idxs == {0, 1, 2, 3}
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=10,
+                                            num_fewshot=7,
+                                            example_idx=4,
+                                            rng=rng)
+    assert len(fewshot_idxs) == 7 and 4 not in fewshot_idxs
+
+
+def test_fewshot_sample_idxs_randomness():
+    dataset_size = 10000
+    num_fewshot = 5
+
+    rng_1_seed_1234 = random.Random(1234)
+    rng_2_seed_1234 = random.Random(1234)
+    rng_3_seed_11 = random.Random(11)
+
+    rng_1_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+                                              rng_1_seed_1234)
+    rng_2_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+                                              rng_2_seed_1234)
+    rng_3_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+                                              rng_3_seed_11)
+
+    assert rng_1_sample_1 == rng_2_sample_1
+    assert rng_1_sample_1 != rng_3_sample_1
+
+    rng_1_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+                                              rng_1_seed_1234)
+    rng_2_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+                                              rng_2_seed_1234)
+    rng_3_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+                                              rng_3_seed_11)
+
+    assert rng_1_sample_2 == rng_2_sample_2
+    assert rng_1_sample_2 != rng_3_sample_2
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+    gen_kwargs = {'test_arg1': 1, 'test_arg2': 2}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map,
+        generation_kwargs=gen_kwargs)
+    assert dl.base_batch['generation_kwargs'] == {
+        'test_arg1': 1,
+        'test_arg2': 2
+    }
+
+
+def test_stop_sequences_criteria(tiny_gpt2_tokenizer):
+    pytest.importorskip('transformers')
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
+    seq1 = tiny_gpt2_tokenizer('Dogs are furry')['input_ids']
+    seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+    seq1 = [tiny_gpt2_tokenizer.pad_token_id] * (len(seq2) - len(seq1)) + seq1
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert not eos_criteria(input_ids,
+                            None)  # pyright: ignore[reportGeneralTypeIssues]
+
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
+    seq1 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+    seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert eos_criteria(input_ids,
+                        None)  # pyright: ignore[reportGeneralTypeIssues]
+
+
+def test_stop_sequences_criteria_sentencepiece(tiny_llama_tokenizer):
+    pytest.importorskip('datasets')
+
+    tokenizer = tiny_llama_tokenizer
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tokenizer, 2)
+    seq1 = tokenizer(
+        '\n\nDogs'
+    )['input_ids']  # check to make sure starting with the stop sequence doesnt break it
+    seq2 = tokenizer('Dogs are furry\n\n')['input_ids']
+    seq1 = [tokenizer.eos_token_id] * (len(seq2) - len(seq1)) + seq1
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert not eos_criteria(input_ids,
+                            None)  # pyright: ignore[reportGeneralTypeIssues]
+
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tokenizer, 2)
+    seq1 = tokenizer('Dogs are furry\n\n')['input_ids']
+    seq2 = tokenizer('Dogs are furry\n\n')['input_ids']
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert eos_criteria(input_ids,
+                        None)  # pyright: ignore[reportGeneralTypeIssues]
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    assert not 'generation_kwargs' in dl.base_batch
+
+
+def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generation_kwargs=None)
+    assert len(dl.base_batch['generation_kwargs']) == 3
+
+
+def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generation_kwargs={'temperature': 0.9})
+    assert 'generation_kwargs' in dl.base_batch
+    assert dl.base_batch['generation_kwargs']['temperature'] == 0.9
+    assert len(dl.base_batch['generation_kwargs']) == 4
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_construct_context(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell: ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    constructed_context = dl.construct_context({
+        'context': 'quas quas exort',
+        'answer': 'ice wall'
+    })
+    assert constructed_context == 'Orbs: quas quas exort\nSpell: '
+    constructed_context = dl.construct_context(
+        {
+            'context': 'quas quas exort',
+            'answer': 'ice wall'
+        }, add_answer=True)
+    assert constructed_context == 'Orbs: quas quas exort\nSpell: ice wall'
+    constructed_context = dl.construct_context(
+        {
+            'context': 'quas quas exort',
+            'answer': 'ice wall'
+        },
+        preceding_text='The harsh White Waste beckons!',
+        add_answer=True)
+    assert constructed_context == '\nOrbs: quas quas exort\nSpell: ice wall'
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    answer = dl.get_answer_from_example({
+        'context': 'wex exort exort',
+        'answer': 'alacrity'
+    })
+    assert answer == ' alacrity'
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_fix_eos_on_preamble(tmp_path):
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m',
+        use_fast=False)  # type: ignore reportUnboundVariable
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    preamble = 'blah blah blah.'
+    tokenized_preamble = tokenizer.encode(preamble)
+    tokenized_preamble += [tokenizer.eos_token_id]
+    fixed_preamble = dl._fix_eos_on_preamble(tokenized_preamble)
+    assert tokenized_preamble[:-1] == fixed_preamble
+    assert fixed_preamble[-1] != tokenizer.eos_token_id
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell: ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map,
+        tokenize_labels=True)
+    tokenized_example = dl.tokenize_example('What spell does this invoke? ',
+                                            'exort exort wex\nSpell: ',
+                                            {'answer': ' Meatball'})
+    tokenized_input = [
+        2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198,
+        31221, 25, 19145, 1894
+    ]
+    assert tokenized_example['context'][:len(tokenized_input)].tolist(
+    ) == tokenized_input
+    assert tokenized_example['context'][-1] == tokenizer.eos_token_id
+    assert type(tokenized_example['answer'][0]) == int
+    assert len(tokenized_example['context']) == seqlen
+    assert 'continuation_indices' in tokenized_example
+
 
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer,
+                                                  tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell: ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map,
+        tokenize_labels=False)
+    tokenized_example = dl.tokenize_example('What spell does this invoke? ',
+                                            'exort exort wex\nSpell: ',
+                                            {'answer': ' Meatball'})
+    tokenized_input = [
+        2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198,
+        31221, 25
+    ]
+    assert tokenized_example['context'][:len(tokenized_input)].tolist(
+    ) == tokenized_input
+    assert tokenized_example['context'][-1] == tokenizer.eos_token_id
+    assert len(tokenized_example['context']) == seqlen
+    assert type(tokenized_example['answer']) == str
 
 
 def test_qa_set_cot_no_cot(tmp_path):
@@ -36,7 +584,8 @@ def test_qa_set_cot_no_cot(tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
@@ -60,7 +609,8 @@ def test_qa_set_cot_has_cot(tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/gsm8k_small.jsonl'
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
@@ -189,7 +739,9 @@ def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
             'chain_of_thought': "Let's think step by step. "
         })
     assert 'aliases' in tokenized_example
-    assert tokenized_example['aliases'] == ['this is the right answer', 'this is the best answer']
+    assert tokenized_example['aliases'] == [
+        'this is the right answer', 'this is the best answer'
+    ]
 
 
 def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
@@ -217,7 +769,9 @@ def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
         generations_per_sample=10,
     )
 
-    assert all(len(data['prompt']) == 148 for data in dl.dataset)  # pyright: ignore [reportGeneralTypeIssues]
+    assert all(
+        len(data['prompt']) == 148
+        for data in dl.dataset)  # pyright: ignore [reportGeneralTypeIssues]
 
 
 def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path):
@@ -271,15 +825,23 @@ def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
     example = {
-        'context': "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ",
+        'context':
+            "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ",
         'choices': ['A', 'B', 'C', 'D'],
-        'gold': 2
+        'gold':
+            2
     }
-    tokenized_example = dl.tokenize_example(prompt_and_fewshot='Answer the following: ',
-                                            ctxt=example['context'],
-                                            example=example)
-    unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']]
-    untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries]
+    tokenized_example = dl.tokenize_example(
+        prompt_and_fewshot='Answer the following: ',
+        ctxt=example['context'],
+        example=example)
+    unpadded_queries = [
+        context[context != tokenizer.eos_token_id]
+        for context in tokenized_example['query']
+    ]
+    untokenized_inputs = [
+        tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries
+    ]
     correct_output = [
         "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: A",
         "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: B",
@@ -308,7 +870,11 @@ def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path):
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
-    example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+    example = {
+        'context_options': ['cont one', 'cont two'],
+        'gold': 0,
+        'continuation': 'this is a continuation'
+    }
     constructed_context = dl.construct_context(example)
     assert constructed_context == 'cont one ### this is a continuation'
     constructed_context = dl.construct_context(example, preceding_text='text')
@@ -335,10 +901,15 @@ def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path):
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
-    example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+    example = {
+        'context_options': ['cont one', 'cont two'],
+        'gold': 0,
+        'continuation': 'this is a continuation'
+    }
     constructed_contexts = dl._construct_multiple_contexts(example)
     assert constructed_contexts == ['cont one', 'cont two']
-    constructed_contexts = dl._construct_multiple_contexts(example, preceding_text='some text')
+    constructed_contexts = dl._construct_multiple_contexts(
+        example, preceding_text='some text')
     assert constructed_contexts == ['\ncont one ###', '\ncont two ###']
 
 
@@ -362,20 +933,34 @@ def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
-    example = {'context_options': ['context one', 'context two'], 'gold': 0, 'continuation': 'this is a continuation'}
-    tokenized_example = dl.tokenize_example(prompt_and_fewshot='prompt ',
-                                            context_options=example['context_options'],
-                                            example=example)
-    assert all(tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer'])
-    unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']]
-    untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs]
+    example = {
+        'context_options': ['context one', 'context two'],
+        'gold': 0,
+        'continuation': 'this is a continuation'
+    }
+    tokenized_example = dl.tokenize_example(
+        prompt_and_fewshot='prompt ',
+        context_options=example['context_options'],
+        example=example)
+    assert all(
+        tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation'
+        for cont in tokenized_example['answer'])
+    unpadded_inputs = [
+        context[context != tokenizer.eos_token_id]
+        for context in tokenized_example['context_options']
+    ]
+    untokenized_inputs = [
+        tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs
+    ]
     assert untokenized_inputs == [
-        'prompt context one this is a continuation', 'prompt context two this is a continuation'
+        'prompt context one this is a continuation',
+        'prompt context two this is a continuation'
     ]
 
 
 @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
-def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer,
+                                          tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -384,18 +969,20 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 8
     seqlen = 64
-    dls = get_icl_task_dataloader('multiple_choice',
-                                  dataset_uri=dataset_uri,
-                                  tokenizer=tokenizer,
-                                  batch_size=batch_size,
-                                  max_seq_len=seqlen,
-                                  pad_tok_id=tokenizer.eos_token_id,
-                                  num_fewshot=2,
-                                  prompt_string='The following are multiple choice questions (with answers).\n',
-                                  example_delimiter='\n',
-                                  continuation_delimiter='Answer: ',
-                                  destination_path=str(tmp_path / 'icl.jsonl'),
-                                  has_categories=True)
+    dls = get_icl_task_dataloader(
+        'multiple_choice',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=2,
+        prompt_string=
+        'The following are multiple choice questions (with answers).\n',
+        example_delimiter='\n',
+        continuation_delimiter='Answer: ',
+        destination_path=str(tmp_path / 'icl.jsonl'),
+        has_categories=True)
     assert isinstance(dls, dict)
 
     assert 'computer_security' in dls
@@ -408,7 +995,8 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
@@ -419,7 +1007,8 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_
 @pytest.mark.parametrize('dataset_uri', [
     'pubmed_sm.jsonl',
 ])
-def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer,
+                                        tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -448,13 +1037,15 @@ def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_pa
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
     assert '  ' not in tokenizer.decode(batch['input_ids'][0][0:max_idx + 1])
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' yes'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' yes'
 
 
 @pytest.mark.parametrize('dataset_uri', [
@@ -489,12 +1080,14 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' glen'
 
 
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
@@ -528,18 +1121,21 @@ def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
 
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' feared violence.'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' feared violence.'
 
 
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
@@ -575,11 +1171,13 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
@@ -592,7 +1190,8 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
+                                          num_fewshot, tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -621,19 +1220,22 @@ def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_f
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' glen'
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
+                                          num_fewshot, tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -665,18 +1267,21 @@ def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_f
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
 
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' Pour it onto a plate'
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
@@ -715,9 +1320,12 @@ def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
     for i, microbatch in enumerate(microbatches):
         assert dl.get_num_samples_in_batch(microbatch) == 1
         assert 'input_ids' in microbatch
-        assert tuple(microbatch['input_ids'].shape) == (real_microbatch_size, seqlen)
+        assert tuple(microbatch['input_ids'].shape) == (real_microbatch_size,
+                                                        seqlen)
         assert 'attention_mask' in microbatch
-        assert tuple(microbatch['attention_mask'].shape) == (real_microbatch_size, seqlen)
+        assert tuple(
+            microbatch['attention_mask'].shape) == (real_microbatch_size,
+                                                    seqlen)
         assert 'continuation_indices' in microbatch
         assert isinstance(microbatch['continuation_indices'], list) and len(
             microbatch['continuation_indices']) == real_microbatch_size
@@ -725,21 +1333,27 @@ def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
         assert microbatch['mode'] == 'icl_task'
         assert 'gold_indices' in microbatch
         assert isinstance(microbatch['gold_indices'], list) and len(
-            microbatch['gold_indices']) == real_microbatch_size // choices_per_question
+            microbatch['gold_indices']
+        ) == real_microbatch_size // choices_per_question
         assert 'choice_groupings' in microbatch
         assert isinstance(microbatch['choice_groupings'], list) and len(
-            microbatch['choice_groupings']) == real_microbatch_size // choices_per_question
+            microbatch['choice_groupings']
+        ) == real_microbatch_size // choices_per_question
 
         min_idx = min(microbatch['continuation_indices'][0]).item()
         max_idx = max(microbatch['continuation_indices'][0]).item()
         if i == 0:
-            assert tokenizer.decode(microbatch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
-        elif i == 1:
             assert tokenizer.decode(
                 microbatch['input_ids'][0][min_idx:max_idx +
-                                           1]) == ' Weld the metal together to get it to stay firmly in place'
-        assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).startswith('</s>')
-        assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).count('</s>') == 1
+                                           1]) == ' Pour it onto a plate'
+        elif i == 1:
+            assert tokenizer.decode(
+                microbatch['input_ids'][0][min_idx:max_idx + 1]
+            ) == ' Weld the metal together to get it to stay firmly in place'
+        assert tokenizer.decode(
+            microbatch['input_ids'][0][0:min_idx]).startswith('</s>')
+        assert tokenizer.decode(
+            microbatch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
@@ -797,7 +1411,8 @@ def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path):
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0])
 @pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
-def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string):
+def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer,
+                                       tmp_path, num_fewshot, prompt_string):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -819,13 +1434,15 @@ def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_pat
                                     example_delimiter='\n',
                                     question_prelimiter='Q: ',
                                     continuation_delimiter='\nA:',
-                                    destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+                                    destination_path=str(
+                                        tmp_path / f'icl_{num_fewshot}.jsonl'))
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
-def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string):
+def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
+                            num_fewshot, prompt_string):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -847,14 +1464,17 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews
                                  example_delimiter='\n',
                                  question_prelimiter='Q: ',
                                  continuation_delimiter='\nA:',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'))
     assert isinstance(dl, DataSpec)
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
 
-    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['input_ids'].shape) == (batch_size,
+                                               seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen -
+                                                    maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
 
@@ -868,16 +1488,19 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews
     if len(prompt_string) > 0:
         assert all(item.count('I am a prompt') == 1 for item in decoded_batch)
     assert all(
-        set(found) == set(expected)
-        for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']]))
-    assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:')
-    assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:')
+        set(found) == set(expected) for found, expected in zip(
+            batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']]))
+    assert decoded_batch[0].endswith(
+        'Q: Who was the man behind The Chipmunks?\nA:')
+    assert decoded_batch[1].endswith(
+        'Q: What star sign is Jamie Lee Curtis?\nA:')
     assert 'eos_token_id' in batch['generation_kwargs']
 
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
-def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot):
+def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
+                                     num_fewshot):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -888,24 +1511,27 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
     seqlen = 512
     # empirical number from the small test dataset
     maximum_answer_length = 132
-    dl = get_icl_task_dataloader('question_answering',
-                                 dataset_uri=dataset_uri,
-                                 tokenizer=tokenizer,
-                                 batch_size=batch_size,
-                                 max_seq_len=seqlen,
-                                 pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=num_fewshot,
-                                 prompt_string='',
-                                 example_delimiter='\n',
-                                 question_prelimiter='Q: ',
-                                 continuation_delimiter="\nA: Let's think step by step. ",
-                                 cot_delimiter=' #### ',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        question_prelimiter='Q: ',
+        continuation_delimiter="\nA: Let's think step by step. ",
+        cot_delimiter=' #### ',
+        destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
     assert isinstance(dl, DataSpec)
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
-    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['input_ids'].shape) == (batch_size,
+                                               seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen -
+                                                    maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == maximum_answer_length
@@ -962,18 +1588,21 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
 
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' Pour it onto a plate'
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@@ -1048,13 +1677,15 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot,
+                                        prompt_string, generations_per_sample):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 2048
@@ -1070,7 +1701,8 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample)
     assert isinstance(dl, DataSpec)
 
@@ -1081,14 +1713,18 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
     if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
         max_prompt_length = dl.dataloader.dataset.max_prompt_length
     assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size,
+                                                    max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == 129
-    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+    assert any(item[0] != tokenizer.eos_token_id
+               for item in batch['input_ids'])  # longest should be pushed left
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('Code start: \n') == num_fewshot + 1
+        for item in decoded_batch)
 
     if len(prompt_string) > 0:
         assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
@@ -1121,7 +1757,8 @@ def test_code_eval_test_cases(dataset_uri, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 512
@@ -1148,15 +1785,18 @@ def test_code_eval_test_cases(dataset_uri, tmp_path):
     if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
         max_prompt_length = dl.dataloader.dataset.max_prompt_length
     assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size,
+                                                    max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == 129
-    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+    assert any(item[0] != tokenizer.eos_token_id
+               for item in batch['input_ids'])  # longest should be pushed left
 
     mod = types.ModuleType('test_module')
-    for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'],
-                                                              batch['test_outputs'], batch['entry_points']):
+    for prompt, solution, inputs, outputs, entry_point in zip(
+            batch['prompts'], batch['labels'], batch['test_inputs'],
+            batch['test_outputs'], batch['entry_points']):
         exec(prompt + solution, mod.__dict__)
         for test_input, test_output in zip(inputs, outputs):
             result = mod.__dict__[entry_point](*eval(test_input))
@@ -1170,7 +1810,8 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 2
     seqlen = 64
@@ -1196,13 +1837,15 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot,
+                                   prompt_string, generations_per_sample):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 2048
@@ -1218,7 +1861,8 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1233,14 +1877,18 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
     if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
         max_prompt_length = dl.dataloader.dataset.max_prompt_length
     assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size,
+                                                    max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == 122
-    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+    assert any(item[0] != tokenizer.eos_token_id
+               for item in batch['input_ids'])  # longest should be pushed left
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('Code start: \n') == num_fewshot + 1
+        for item in decoded_batch)
 
     if len(prompt_string) > 0:
         assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
@@ -1268,12 +1916,14 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot,
+                          tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 512
@@ -1289,7 +1939,8 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=1,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1323,9 +1974,11 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path):
+def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer,
+                            tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1344,7 +1997,9 @@ def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_p
         destination_path=str(tmp_path / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy'])
+    evaluator = Evaluator(label='lambada',
+                          dataloader=dl,
+                          metric_names=['InContextLearningLMAccuracy'])
 
     transformers = pytest.importorskip('transformers')
     config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M')
@@ -1358,16 +2013,20 @@ def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_p
 
     trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][0][1].item() == 0
+    assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][
+        0][1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model):
+def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+                                tmp_path, tiny_gpt2_model):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1386,7 +2045,10 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
         destination_path=str(tmp_path / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='winograd', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+    evaluator = Evaluator(
+        label='winograd',
+        dataloader=dl,
+        metric_names=['InContextLearningMultipleChoiceAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1397,13 +2059,17 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluator)
-    assert 'metrics/winograd/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/winograd/InContextLearningMultipleChoiceAccuracy'][0][1].item() > 0
+    assert 'metrics/winograd/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/winograd/InContextLearningMultipleChoiceAccuracy'][0][1].item(
+        ) > 0
     num_samples = 0
     with open(dataset_uri) as f:
         for _ in f:
             num_samples += 1
-    assert trainer.state.eval_metrics['winograd']['InContextLearningMultipleChoiceAccuracy'].total == num_samples
+    assert trainer.state.eval_metrics['winograd'][
+        'InContextLearningMultipleChoiceAccuracy'].total == num_samples
 
 
 @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
@@ -1411,10 +2077,12 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_model,
-                                          tiny_gpt2_tokenizer, tmp_path):
+def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot,
+                                          tiny_gpt2_model, tiny_gpt2_tokenizer,
+                                          tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1433,12 +2101,15 @@ def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_mo
                                   prompt_string='',
                                   example_delimiter='\n',
                                   continuation_delimiter=': ',
-                                  destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+                                  destination_path=str(
+                                      Path(gathered_paths[0]) / 'icl.jsonl'),
                                   has_categories=True)
 
     assert isinstance(dls, dict)
     evaluators = [
-        Evaluator(label='mmlu/' + k, dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+        Evaluator(label='mmlu/' + k,
+                  dataloader=dl,
+                  metric_names=['InContextLearningMultipleChoiceAccuracy'])
         for k, dl in dls.items()
     ]
 
@@ -1451,23 +2122,28 @@ def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_mo
 
     trainer = Trainer(model=model, loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluators)
-    assert 'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy'][0][1].item(
-    ) > 0
-    total = trainer.state.eval_metrics['mmlu/computer_security']['InContextLearningMultipleChoiceAccuracy'].total
+    assert 'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy'][
+            0][1].item() > 0
+    total = trainer.state.eval_metrics['mmlu/computer_security'][
+        'InContextLearningMultipleChoiceAccuracy'].total
     dist.all_reduce(total)  # type: ignore
     assert total.item() == 4  # type: ignore
 
 
-@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
+@pytest.mark.parametrize('dataset_uri',
+                         ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path,
-                            tiny_gpt2_model):
+def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+                            tmp_path, tiny_gpt2_model):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1491,7 +2167,10 @@ def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_p
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='mc', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+    evaluator = Evaluator(
+        label='mc',
+        dataloader=dl,
+        metric_names=['InContextLearningMultipleChoiceAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1502,27 +2181,33 @@ def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_p
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluator)
-    assert 'metrics/mc/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/mc/InContextLearningMultipleChoiceAccuracy'][0][1].item() >= 0
+    assert 'metrics/mc/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/mc/InContextLearningMultipleChoiceAccuracy'][0][1].item() >= 0
     num_samples = 0
     with open(dataset_uri) as f:
         for _ in f:
             num_samples += 1
-    total = trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total
+    total = trainer.state.eval_metrics['mc'][
+        'InContextLearningMultipleChoiceAccuracy'].total
     dist.all_reduce(total)  # type: ignore
     assert total.item() == num_samples  # type: ignore
 
 
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
-                                          dataset_uri, tmp_path):
+def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model,
+                                          num_fewshot, dataset_uri, tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_opt_tokenizer
@@ -1544,7 +2229,9 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, nu
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='triviaqa',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
@@ -1555,20 +2242,26 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, nu
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [5])
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer,
+                                                   tiny_opt_model, num_fewshot,
                                                    dataset_uri, tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_opt_tokenizer
@@ -1591,7 +2284,9 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='gsm8k',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
@@ -1602,19 +2297,24 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][
+        0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
-                            tmp_path):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+                            tiny_gpt2_model, tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1635,7 +2335,9 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='triviaqa',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1647,19 +2349,25 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [5])
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
+def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri,
+                                     tiny_gpt2_tokenizer, tiny_gpt2_model,
                                      tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1681,7 +2389,9 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokeniz
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='gsm8k',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1693,19 +2403,25 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokeniz
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][
+        0][1].item() == 0
 
 
 def test_code_eval_requires_envvar(monkeypatch):
     monkeypatch.delenv('CODE_EVAL_DEVICE', raising=False)
-    with pytest.raises(ValueError, match='Attempting to use InContextLearningCodeEvalAccuracy but.*'):
+    with pytest.raises(
+            ValueError,
+            match='Attempting to use InContextLearningCodeEvalAccuracy but.*'):
         InContextLearningCodeEvalAccuracy().get_client()
 
 
 def test_code_eval_requires_valid_envvar(monkeypatch):
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'bigchungus')
-    with pytest.raises(ValueError, match='Environment variable `CODE_EVAL_DEVICE` must be on.*'):
+    with pytest.raises(
+            ValueError,
+            match='Environment variable `CODE_EVAL_DEVICE` must be on.*'):
         InContextLearningCodeEvalAccuracy().get_client()
 
 
@@ -1714,12 +2430,16 @@ def test_code_eval_requires_valid_envvar(monkeypatch):
 @pytest.mark.parametrize('generations_per_sample', range(1, 3))
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
-                                 dataset_uri, tmp_path, generations_per_sample):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer,
+                                 tiny_opt_model, num_fewshot, dataset_uri,
+                                 tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_opt_tokenizer
@@ -1757,8 +2477,10 @@ def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer, tiny_opt_model
     torch.use_deterministic_algorithms(False)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     torch.use_deterministic_algorithms(True)
-    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@@ -1766,12 +2488,16 @@ def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer, tiny_opt_model
 @pytest.mark.parametrize('generations_per_sample', range(1, 3))
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_t5_tokenizer,
-                                        tiny_t5_model, tmp_path, generations_per_sample):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri,
+                                        tiny_t5_tokenizer, tiny_t5_model,
+                                        tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_t5_tokenizer
@@ -1793,7 +2519,9 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t
         generations_per_sample=generations_per_sample,
     )
 
-    evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'])
+    evaluator = Evaluator(label='humaneval',
+                          dataloader=dl,
+                          metric_names=['InContextLearningCodeEvalAccuracy'])
     model = HuggingFaceModel(
         model=tiny_t5_model,
         tokenizer=tiny_t5_tokenizer,
@@ -1805,8 +2533,10 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t
     torch.use_deterministic_algorithms(False)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     torch.use_deterministic_algorithms(True)
-    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@@ -1815,12 +2545,16 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t
 @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
-                                   tiny_gpt2_model, tmp_path, generations_per_sample):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri,
+                                   tiny_gpt2_tokenizer, tiny_gpt2_model,
+                                   tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1842,7 +2576,9 @@ def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_g
         generations_per_sample=generations_per_sample,
     )
 
-    evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'])
+    evaluator = Evaluator(label='humaneval',
+                          dataloader=dl,
+                          metric_names=['InContextLearningCodeEvalAccuracy'])
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
         tokenizer=tiny_gpt2_tokenizer,
@@ -1854,8 +2590,10 @@ def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_g
     torch.use_deterministic_algorithms(False)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     torch.use_deterministic_algorithms(True)
-    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
@@ -1884,8 +2622,10 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     first_batch = next(dl.dataloader._get_iterator())
     second_batch = next(dl.dataloader._get_iterator())
 
-    first_batch_text = tokenizer.decode(first_batch['input_ids'][0], skip_special_tokens=True)
-    second_batch_text = tokenizer.decode(second_batch['input_ids'][0], skip_special_tokens=True)
+    first_batch_text = tokenizer.decode(first_batch['input_ids'][0],
+                                        skip_special_tokens=True)
+    second_batch_text = tokenizer.decode(second_batch['input_ids'][0],
+                                         skip_special_tokens=True)
 
     first_batch_without_last_word = ' '.join(first_batch_text.split(' ')[:-1])
     second_batch_without_last_word = ' '.join(second_batch_text.split(' ')[:-1])
@@ -1904,29 +2644,37 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     'split': 'test',
     'name': 'juggernaut',
 }])
-@pytest.mark.parametrize('hf_parsing_map', [None, {'context': ['context'], 'continuation': ['continuation']}])
+@pytest.mark.parametrize(
+    'hf_parsing_map',
+    [None, {
+        'context': ['context'],
+        'continuation': ['continuation']
+    }])
 @pytest.mark.filterwarnings(
-    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning')
-def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer,
+                                      tmp_path, num_fewshot, prompt_string,
                                       hf_loading_vars, hf_parsing_map):
     pytest.importorskip('datasets')
 
     tokenizer = tiny_gpt2_tokenizer
     batch_size = 2
     seqlen = 2048
-    dl = get_icl_task_dataloader('language_modeling',
-                                 dataset_uri=dataset_uri,
-                                 tokenizer=tokenizer,
-                                 batch_size=batch_size,
-                                 max_seq_len=seqlen,
-                                 pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=0,
-                                 prompt_string='',
-                                 example_delimiter='\n',
-                                 continuation_delimiter=' ',
-                                 destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
-                                 hf_loading_vars=hf_loading_vars,
-                                 hf_parsing_map=hf_parsing_map)
+    dl = get_icl_task_dataloader(
+        'language_modeling',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=' ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
     assert isinstance(dl, DataSpec)
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
@@ -1936,16 +2684,22 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' and me.'
 
-    decoded_batch = [tokenizer.decode(row[row != tokenizer.eos_token_id]) for row in batch['input_ids']]
+    decoded_batch = [
+        tokenizer.decode(row[row != tokenizer.eos_token_id])
+        for row in batch['input_ids']
+    ]
     assert decoded_batch[0] == "Looks like it's just you and me."
-    assert decoded_batch[1] == "There's a fine line between bravery and stupidity."
+    assert decoded_batch[
+        1] == "There's a fine line between bravery and stupidity."
 
 
 @pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset'])
@@ -1955,10 +2709,15 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path
     'split': 'test',
     'name': 'invoker',
 }])
-@pytest.mark.parametrize('hf_parsing_map', [{'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}])
+@pytest.mark.parametrize('hf_parsing_map', [{
+    'context': ['quas', 'wex', 'exort'],
+    'answer': ['spell']
+}])
 @pytest.mark.filterwarnings(
-    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning')
-def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer,
+                                       tmp_path, num_fewshot, prompt_string,
                                        hf_loading_vars, hf_parsing_map):
     pytest.importorskip('datasets')
 
@@ -1969,38 +2728,46 @@ def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_pat
     # empirical number from the small test dataset
     maximum_answer_length = 4
 
-    dl = get_icl_task_dataloader('question_answering',
-                                 dataset_uri=dataset_uri,
-                                 tokenizer=tokenizer,
-                                 batch_size=batch_size,
-                                 max_seq_len=seqlen,
-                                 pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=num_fewshot,
-                                 prompt_string=prompt_string,
-                                 example_delimiter='\n',
-                                 question_prelimiter='Orbs: ',
-                                 continuation_delimiter='\nSpell:',
-                                 destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
-                                 hf_loading_vars=hf_loading_vars,
-                                 hf_parsing_map=hf_parsing_map)
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        question_prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
     assert isinstance(dl, DataSpec)
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
 
-    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['input_ids'].shape) == (batch_size,
+                                               seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen -
+                                                    maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch)
-    assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch)
 
     if len(prompt_string) > 0:
-        assert all(item.count('What spell does this invoke? ') == 1 for item in decoded_batch)
+        assert all(
+            item.count('What spell does this invoke? ') == 1
+            for item in decoded_batch)
     assert all(
-        set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']]))
+        set(found) == set(expected) for found, expected in zip(
+            batch['labels'], [['defeaning blast'], ['cold snap']]))
     assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:')
     assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:')
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 93c0f91035..84d84933f4 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -1,12 +1,14 @@
-# Copyright 2022 MosaicML Composer authors
+# Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
 
 import torch
 
-from llmfoundry.eval.metrics.nlp import ( InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy,
-                                  InContextLearningQAAccuracy,)
-
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 
 
 def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
@@ -14,10 +16,12 @@ def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
     continuations = [' furry', ' pie', ' long lines', ' snowy']
     pad = tiny_gpt2_tokenizer.pad_token_id
     inputs = [
-        tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids']
+        tiny_gpt2_tokenizer(context)['input_ids'] +
+        tiny_gpt2_tokenizer(continuation)['input_ids']
         for context, continuation in zip(contexts, continuations)
     ]
-    inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs])
+    inputs = torch.tensor(
+        [input + [pad] * (2048 - len(input)) for input in inputs])
 
     cont_idxs = []
     for context, continuation in zip(contexts, continuations):
@@ -25,19 +29,27 @@ def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
         end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids'])
         cont_idxs.append(torch.tensor(list(range(start, end))))
 
-    batch = {'continuation_indices': cont_idxs, 'labels': inputs.roll(-1), 'input_ids': inputs}
-    logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100
+    batch = {
+        'continuation_indices': cont_idxs,
+        'labels': inputs.roll(-1),
+        'input_ids': inputs
+    }
+    logits = torch.nn.functional.one_hot(inputs.roll(-1),
+                                         num_classes=pad + 1).float() * 100
     start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1]
-    logits[1][start:end] = logits[0][start:end].clone()  # make one of the answer's continuations incorrect
+    logits[1][start:end] = logits[0][start:end].clone(
+    )  # make one of the answer's continuations incorrect
     metric = InContextLearningLMAccuracy()
     metric.update(batch, logits, batch['labels'])
 
     assert metric.compute() == 0.75
 
 
-
 def test_in_context_learning_qa_accuracy():
-    outputs = ['Correct but then some more text', 'Incorrect', ' the CORREct with weird casing and spacing']
+    outputs = [
+        'Correct but then some more text', 'Incorrect',
+        ' the CORREct with weird casing and spacing'
+    ]
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']]
     batch = {'cot_delimiter': '', 'labels': labels}
     metric = InContextLearningQAAccuracy()
@@ -49,11 +61,17 @@ def test_in_context_learning_qa_accuracy():
 def test_in_context_learning_qa_cot_accuracy():
     outputs = [
         'chain of thought ### Correct but then some more text\n\nanother chain of thought ### Incorrect answer this time',
-        'Incorrect', 'chain of thought ### the CORREct with weird casing and spacing',
+        'Incorrect',
+        'chain of thought ### the CORREct with weird casing and spacing',
         'incorrect chain of thought delimiter ## Correct but wrong delimiter'
     ]
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct'], ['correct']]
-    batch = {'cot_delimiter': ' ### ', 'labels': labels, 'do_normalization': True, 'stopping_criteria': '\n\n'}
+    batch = {
+        'cot_delimiter': ' ### ',
+        'labels': labels,
+        'do_normalization': True,
+        'stopping_criteria': '\n\n'
+    }
     metric = InContextLearningQAAccuracy()
     metric.update(batch, outputs, labels)
 
@@ -70,9 +88,12 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch):
         '    return n + 1'
     ]  # correct
     labels = []
-    prompts = ['def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n']
+    prompts = [
+        'def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n'
+    ]
     entry_points = ['fib', 'multiply_by_two', 'add_one']
-    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)']]
+    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'],
+                   ['(1,)', '(2,)', '(4,)']]
     test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']]
     languages = ['python', 'python', 'python']
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
@@ -102,18 +123,23 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch):
 
 def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer):
     contexts = [
-        'Q: How do you cook a cake?', 'Q: How do you cook a cake?', 'Q: How old is the earth?',
-        'Q: How old is the earth?'
+        'Q: How do you cook a cake?', 'Q: How do you cook a cake?',
+        'Q: How old is the earth?', 'Q: How old is the earth?'
+    ]
+    continuations = [
+        ' A: turn on the oven', ' A: do a backflip', ' A: 2 minutes',
+        ' A: 4.5 billion years'
     ]
-    continuations = [' A: turn on the oven', ' A: do a backflip', ' A: 2 minutes', ' A: 4.5 billion years']
     gold_indices = [0, 1]
     choice_groupings = [(0, 2), (2, 4)]
     pad = tiny_gpt2_tokenizer.pad_token_id
     inputs = [
-        tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids']
+        tiny_gpt2_tokenizer(context)['input_ids'] +
+        tiny_gpt2_tokenizer(continuation)['input_ids']
         for context, continuation in zip(contexts, continuations)
     ]
-    inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs])
+    inputs = torch.tensor(
+        [input + [pad] * (2048 - len(input)) for input in inputs])
 
     cont_idxs = []
     for context, continuation in zip(contexts, continuations):
@@ -128,7 +154,8 @@ def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer):
         'gold_indices': gold_indices,
         'choice_groupings': choice_groupings
     }
-    logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float()
+    logits = torch.nn.functional.one_hot(inputs.roll(-1),
+                                         num_classes=pad + 1).float()
 
     # for the first two, the correct answer is continuation 0
     # make the answer correct by making continuation 0 more likely for both answers
diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py
index 33cb27ee8a..449fdbf5bc 100644
--- a/tests/fixtures/models.py
+++ b/tests/fixtures/models.py
@@ -1,13 +1,14 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import copy
 from typing import Any, Callable
 
+import pytest
 from omegaconf import DictConfig
 from pytest import fixture
-import pytest
 from transformers import PreTrainedTokenizerBase
-import copy
+
 from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
 from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
@@ -23,6 +24,7 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase):
 def mpt_tokenizer():
     return build_tokenizer('EleutherAI/gpt-neox-20b', {})
 
+
 @fixture
 def build_tiny_mpt(
     mpt_tokenizer: PreTrainedTokenizerBase
@@ -70,7 +72,6 @@ def build(**kwargs: Any) -> ComposerHFCausalLM:
     return build
 
 
-
 def tiny_gpt2_model_helper(config):
     transformers = pytest.importorskip('transformers')
 
@@ -110,7 +111,8 @@ def tiny_gpt2_tokenizer_helper():
 def tiny_llama_tokenizer_helper():
     transformers = pytest.importorskip('transformers')
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b', use_fast=False)
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b', use_fast=False)
     return hf_tokenizer
 
 
@@ -124,13 +126,11 @@ def _session_tiny_llama_tokenizer():  # type: ignore
     return tiny_llama_tokenizer_helper()
 
 
-
-
-
 def tiny_opt_tokenizer_helper():
     transformers = pytest.importorskip('transformers')
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')
     hf_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     return hf_tokenizer
 
@@ -173,5 +173,3 @@ def tiny_opt_tokenizer(_session_tiny_opt_tokenizer):
 @pytest.fixture
 def tiny_opt_model(_session_tiny_opt_model):
     return copy.deepcopy(_session_tiny_opt_model)
-
-

From 72ce7936dd10959219191fedf9f2fa473a45cf3d Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Wed, 7 Feb 2024 15:11:03 -0500
Subject: [PATCH 10/59] migration

---
 .../in_context_learning_evaluation.py         |  949 +++++++++++--
 llmfoundry/eval/metrics/nlp.py                |  395 ++++--
 llmfoundry/models/hf/hf_causal_lm.py          |    9 +-
 .../models/inference_api_wrapper/interface.py |   13 +-
 llmfoundry/models/mpt/modeling_mpt.py         |    7 +-
 llmfoundry/utils/builders.py                  |    4 +-
 mcli/mcli-hf-eval.yaml                        |    8 +-
 .../eval/test_in_context_learning_datasets.py | 1253 +++++++++++++----
 tests/eval/test_nlp_metrics.py                |   69 +-
 tests/fixtures/models.py                      |   18 +-
 10 files changed, 2198 insertions(+), 527 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index bcc7996189..668dd25145 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 # This code is based on the implementation in https://github.com/EleutherAI/lm-evaluation-harness/blob/8c048e266a22a1c85ccbdb0c209ac712e4f39989/lm_eval/base.py#L221-L330
@@ -8,19 +11,20 @@
 import json
 import os
 import random
+import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
 
 import torch
-from torch.utils.data import DataLoader
-
 from composer.core import DataSpec
 from composer.core.data_spec import _default_split_batch, _split_list
 from composer.datasets.utils import stop_sequences_criteria
 from composer.utils import MissingConditionalImportError, dist, get_file
-from composer.datasets import InContextLearningDataset
+from torch.utils.data import DataLoader, Dataset
+
 if TYPE_CHECKING:
     import transformers
-    from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+    from datasets import \
+        Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
 
 # Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset
 _MAX_ANSWER_BUFFER_LENGTH = 10
@@ -35,8 +39,8 @@
 
 
 def strip_data(example: Dict) -> Dict:
-    """
-    Remove white space from the begging and end of string values in a dictionary
+    """Remove white space from the begging and end of string values in a
+    dictionary.
 
     Args:
         example: Dictionary to be stripped
@@ -44,14 +48,33 @@ def strip_data(example: Dict) -> Dict:
     Returns:
         dict: The same dictionary with .strip() applied to any value in the dict that is a string
     """
-    return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()}
+    return {
+        k: v.strip() if isinstance(v, str) else v for k, v in example.items()
+    }
+
 
+def _tokenizer_needs_prefix_space(
+        tokenizer: transformers.PreTrainedTokenizerBase) -> bool:
+    """Test for whether a prefix space is needed before the continuation.
+    Sentencepiece tokenization should not have a prefix space, but gpt2 style
+    BPE should.
+
+    Args:
+        tokenizer: Tokenizer to test
 
-def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -> List:
+    Returns:
+        bool: Whether or not the tokenizer needs a prefix space
     """
-    Trims a list of tokens down to `max_seq_len` if the length of the list plus the continuation
-    is more than `max_seq_len`. It will always trim tokens from the left, i.e. tokens at the beginning
-    of the context will be removed.
+    test_tokens = tokenizer(' a', add_special_tokens=False)['input_ids']
+    assert isinstance(test_tokens, list)
+    return len(test_tokens) == 1
+
+
+def _trim_context(context_enc: List, continuation_enc: List,
+                  max_seq_len: int) -> List:
+    """Trims a list of tokens down to `max_seq_len` if the length of the list
+    plus the continuation is more than `max_seq_len`. It will always trim tokens
+    from the left, i.e. tokens at the beginning of the context will be removed.
 
     Args:
         context_enc (list): List of tokens in the context
@@ -66,16 +89,18 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -
 
         if context_max_subseq_len < 0:
             # can't support continuations which are longer than the max seq len
-            raise Exception(f'Dataset included continuation longer than the max seq len')
+            raise Exception(
+                f'Dataset included continuation longer than the max seq len')
 
         # clip from the end
         context_enc = context_enc[-(context_max_subseq_len):]
     return context_enc
 
 
-def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.Tensor:
-    """
-    Gets the list of indices of the continuation tokens for language modeling or generation tasks.
+def _get_continuation_span(context_enc: List,
+                           continuation_enc: List) -> torch.Tensor:
+    """Gets the list of indices of the continuation tokens for language modeling
+    or generation tasks.
 
     Args:
         context_enc (list): List of context tokens
@@ -84,7 +109,9 @@ def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.T
     Returns:
         torch.tensor: A tensor containing indices corresponding to continuation tokens
     """
-    return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc)))
+    return torch.tensor(
+        range(len(context_enc),
+              len(context_enc) + len(continuation_enc)))
 
 
 def _make_padded_input(context_enc: List,
@@ -92,9 +119,8 @@ def _make_padded_input(context_enc: List,
                        max_seq_len: int,
                        pad_tok_id: int,
                        padding_side: str = 'right') -> torch.Tensor:
-    """
-    Takes an encoded context and continuation and clips the beginning of the context if they're too long.
-    Adds the padding token to the specified side.
+    """Takes an encoded context and continuation and clips the beginning of the
+    context if they're too long. Adds the padding token to the specified side.
 
     Args:
         context_enc (List): The encoded input to the model
@@ -117,7 +143,9 @@ def _make_padded_input(context_enc: List,
     # Sometimes tokenizers that have neither a pad_tok_id or eos_tok_id will pass None in as the padding
     # token and cause errors
     if not isinstance(pad_tok_id, int):
-        raise ValueError(f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead')
+        raise ValueError(
+            f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead'
+        )
     # pad length from seq to padding_length
     if padding_side == 'right':
         inp = torch.cat(
@@ -136,15 +164,18 @@ def _make_padded_input(context_enc: List,
             dim=0,
         )
     else:
-        raise ValueError(f"Unknown padding_side {padding_side}. padding_side must be either 'left' or 'right'")
+        raise ValueError(
+            f"Unknown padding_side {padding_side}. padding_side must be either 'left' or 'right'"
+        )
 
     return inp
 
 
-def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, Any]:
-    """
-    HF Datasets converts tensors into lists when we store them, and we don't want to use `type='torch'`
-    because some content in the dataset, like generation args or single ints, should not be converted.
+def convert_tokens_to_tensors(batch: Dict,
+                              tokenize_labels: bool) -> Dict[str, Any]:
+    """HF Datasets converts tensors into lists when we store them, and we don't
+    want to use `type='torch'` because some content in the dataset, like
+    generation args or single ints, should not be converted.
 
     Here, we convert those lists of tokens back into tensors in order to feed them into the model.
 
@@ -155,14 +186,17 @@ def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, A
     Returns:
         dict: The batch with torch tensors in the corresponding keys instead of lists of lists
     """
-    batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids'])))
+    batch['input_ids'] = torch.stack(list(map(torch.tensor,
+                                              batch['input_ids'])))
     if tokenize_labels:
         batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels'])))
-        batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices']))
+        batch['continuation_indices'] = list(
+            map(torch.tensor, batch['continuation_indices']))
     return batch
 
 
-def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> Set[int]:
+def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
+                             example_idx: int, rng: random.Random) -> Set[int]:
     """
     Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset,
     then we will have fewer than num_fewshot examples in context.
@@ -189,10 +223,482 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: i
         fewshot_idxs.add(replacement_sample)
     return fewshot_idxs
 
-class InContextLearningQATaskDataset(InContextLearningDataset):
+
+class InContextLearningDataset(Dataset):
+    """A base dataset that constructs batches for in-context learning task
+    evaluations. The dataset format is expected to be a local jsonl file, a
+    cloud link to a jsonl file, or a Hugging Face dataset link. 'context' refers
+    to the input a model will recieve before generating an output. For example,
+    the question in question answering tasks, the preceding text in a language
+    modeling task, or the document and question regarding the document in a
+    document understanding task. 'example' refers to a loaded dictionary,
+    generally containing a context, an answer, and any other information needed
+    to run the task. 'answer' refers to the desired output of the model.
+
+    When creating a new ICL Dataset, it is likely that you will need to reimplement the following methods:
+
+    - construct_context(): Takes a single example dictionary and formulates the context as a string for that eval question.
+    - get_answer_from_example(): Takes a single example dictionary and formulates the correct, ground truth answer as a string.
+    - tokenize_example(): Tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream.
+    - read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset.read_dataset())
+
+    Additionally, base_batch and batch_mapping must be defined.
+
+    - base_batch (Dict): The base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode,
+      and empty lists for values that will need to be accumulated from each example.
+      NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables
+      like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self.update_generation_kwargs()
+      after setting self.base_batch.
+    - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset.
+      collate_fn will use this mapping to create batches from self.dataset.
+
+    Args:
+        dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``.
+            Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
+            A local dataset must consist of rows of JSON data points with task dependent fields.
+            The default keys expected are "context" and "answer".
+        tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids.
+        max_seq_len (int): The maximum sequence length supported by the model.
+        pad_tok_id (int): The special token used for padding batches.
+        num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples.
+        fewshot_random_seed (int): Random seed to use for fewshot sampling.
+        prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.').
+        example_delimiter (str): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting.
+        continuation_delimiter: (str): Separator inserted between context and answer in each example (e.g. '\\nA: ').
+        destination_path (str): Temporary path to store downloaded datasets.
+        prelimiter (str): Text to be prepended before each context, including few shot examples (e.g. "Question: ").
+        context_key (str): The key in the loaded dataset that contains the context.
+        answer_key (str): The key in the loaded dataset that contains the answer.
+        strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs,
+            so unless whitespace should be preserved (for example in code), this should be set to True.
+        padding_side (str): Side of the content and answer on which to apply padding. Can be either 'right' or 'left'.
+        padding_size (int): The final size of the tensor after padding. Defaults to max_sequence_length.
+        base_batch (Dict): The base dictionary upon which a batch is created. See above for more details.
+        base_mapping (Dict): A mapping of batch keys to dataset columns, used to create batches. See above for more details.
+        hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
+        hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
+            Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset.
+        tokenize_labels (bool): Whether or not the labels should be tokenized. Generally determined by which metric a dataset uses.
+        generation_kwargs (Dict): A dictionary containing keyword arguments to be passed along to the model's generate function.
     """
-    A dataset that constructs batches for in-context learning question answering evaluation.
-    QA tasks evaluate a model's ability to answer questions using a consistent format.
+
+    def __init__(
+        self,
+        dataset_uri: str,
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        max_seq_len: int,
+        pad_tok_id: int,
+        num_fewshot: int,
+        fewshot_random_seed: int,
+        prompt_string: str,
+        example_delimiter: str,
+        continuation_delimiter: str,
+        destination_path: str,
+        prelimiter: str = '',
+        context_key: str = 'context',
+        answer_key: str = 'answer',
+        strip_dataset: bool = True,
+        padding_side: str = 'right',
+        tokenize_labels: bool = True,
+        static_keys: Optional[List] = None,
+        list_keys: Optional[List] = None,
+        tensor_keys: Optional[List] = None,
+        padding_size: Optional[int] = None,
+        base_batch: Optional[Dict] = None,
+        batch_mapping: Optional[Dict] = None,
+        hf_loading_vars: Optional[Dict] = None,
+        hf_parsing_map: Optional[Dict] = None,
+        generation_kwargs: Optional[Dict] = None,
+    ):
+        try:
+            import datasets
+            del datasets
+        except ImportError as e:
+            raise MissingConditionalImportError(
+                extra_deps_group='nlp',
+                conda_package='datasets',
+                conda_channel='conda-forge',
+            ) from e
+
+        self.tokenizer = tokenizer
+        self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer)
+
+        self.max_seq_len = max_seq_len
+        self.pad_tok_id = pad_tok_id
+        self.num_fewshot = num_fewshot
+        self.padding_side = padding_side
+        self.padding_size = padding_size if padding_size else self.max_seq_len
+        self.prelimiter = prelimiter
+        self.example_delimiter = example_delimiter
+        self.continuation_delimiter = continuation_delimiter
+        self.context_key = context_key
+        self.answer_key = answer_key
+        self.tokenize_labels = tokenize_labels
+        self.batch_mapping = batch_mapping or {}
+        self.base_batch = base_batch or {}
+        if generation_kwargs:
+            self.update_generation_kwargs(generation_kwargs)
+
+        self.static_keys = static_keys
+        self.list_keys = list_keys
+        self.tensor_keys = tensor_keys
+
+        hf_loading_vars = hf_loading_vars or {}
+        self.dataset: HFDataset = self.read_dataset(dataset_uri,
+                                                    destination_path,
+                                                    hf_loading_vars,
+                                                    hf_parsing_map)
+        self.strip_data = strip_dataset
+        if self.strip_data:
+            self.dataset = self.dataset.map(strip_data)
+
+        fewshot_rng = random.Random(fewshot_random_seed)
+        self.dataset: HFDataset = self.dataset.map(
+            self._prep_example,
+            with_indices=True,
+            fn_kwargs={
+                'num_fewshot': num_fewshot,
+                'prompt_string': prompt_string,
+                'fewshot_rng': fewshot_rng,
+            },
+        )
+
+    def __getitem__(self, index: int) -> Dict:
+        return self.dataset[index]
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def get_num_samples_in_batch(self, batch: Dict) -> int:
+        return batch['input_ids'].shape[0]
+
+    def update_generation_kwargs(self, generation_kwargs: Dict) -> None:
+        """Updates self.base_batch with the passed in generation_kwargs. This
+        must be run after self.base_batch is set (for example, if
+        self.base_batch is set after __init__() is run, likely because
+        base_batch needs a class variable like self.pad_tok_id or
+        self.max_answer_length).
+
+        Args:
+            dict: Keyword arguments that be written into base_batch['generation_kwargs']
+        """
+        if generation_kwargs:
+            if 'generation_kwargs' not in self.base_batch:
+                self.base_batch['generation_kwargs'] = {}
+            self.base_batch['generation_kwargs'].update(generation_kwargs)
+
+    def read_dataset(
+            self,
+            dataset_uri: str,
+            destination_path: str,
+            hf_loading_vars: Optional[Dict[str, Any]] = None,
+            hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'HFDataset':
+        """Reads a dataset and handles parsing it from HuggingFace.
+
+        Args:
+            dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri.
+                Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
+            destination_path (str): A local path where the data will be stored
+            hf_loading_vars (Dict): If parsing from HuggingFace, keyword args that will be passed into load_dataset
+            hf_parsing_map (Dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset columns
+
+        Returns:
+            dataset: A loaded HF dataset
+        """
+        from datasets import \
+            Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import \
+            load_dataset  # pyright: ignore[reportGeneralTypeIssues]
+        if 'hf://' in dataset_uri:
+            dataset_uri = dataset_uri.replace('hf://', '')
+            if hf_loading_vars is None:
+                hf_loading_vars = {}
+            dataset = load_dataset(dataset_uri, **hf_loading_vars)
+            if hf_parsing_map:
+                dataset_parsing_func = lambda example: {
+                    k: ' '.join([str(example[col]) for col in v])
+                    for k, v in hf_parsing_map.items(
+                    )  # pyright: ignore[reportOptionalMemberAccess]
+                }
+                assert isinstance(dataset, HFDataset)
+                dataset = dataset.map(dataset_parsing_func,
+                                      remove_columns=dataset.column_names)
+        else:
+            with dist.local_rank_zero_download_and_wait(destination_path):
+                if dist.get_local_rank() == 0:
+                    get_file(dataset_uri, destination_path, overwrite=True)
+            dataset = load_dataset('json',
+                                   data_files=destination_path,
+                                   split='train',
+                                   streaming=False)
+        assert isinstance(dataset, HFDataset)
+        return dataset
+
+    def _generate_few_shot_prompt(
+        self,
+        num_fewshot: int,
+        example_idx: int,
+        preamble: str,
+        fewshot_rng: random.Random,
+    ) -> str:
+        """Formats the fewshot prompt for test example `example_idx`.
+
+        Randomly selects `num_fewshot` samples from the dataset (excluding the example at `example_idx`) and constructs
+        contextes with answers appended.
+
+        Returns the formatted prompt_string + concatenated list of formatted few shot examples as a string.
+
+        Args:
+            num_fewshot (int): Number of examples to prepend
+            example_idx (int): Current example idx
+            preamble (str): Text to occur at the beginning of the task. Generally instructions or a prompt.
+            fewshot_rng (random.Random): Seeded sampler to chose samples with
+
+        Returns:
+            str: The original preamble with num_fewshot examples appended
+        """
+        few_shot_text = preamble
+
+        if num_fewshot > 0:
+            fewshot_idxs = _get_fewshot_sample_idxs(
+                len(self.dataset),
+                num_fewshot,
+                example_idx,
+                fewshot_rng,
+            )
+            for fewshot_idx in fewshot_idxs:
+                ctxt = self.construct_context(
+                    self.dataset[fewshot_idx],
+                    few_shot_text,
+                    add_answer=True,
+                )
+                few_shot_text += ctxt
+
+        return few_shot_text
+
+    def construct_context(self,
+                          example: Dict,
+                          preceding_text: str = '',
+                          add_answer: bool = False) -> str:
+        """Takes an example and constructs a context, i.e. the input the model
+        reads for this example. Optionally adds the correct answer (for fewshot
+        examples) and handles example delimiters.
+
+        Args:
+            example (Dict): The example from which to construct the context
+            preceding_text (str): Any preceding text, used as a check for prepending self.example_delimiter
+            add_answer (bool): Bool for whether or not to add the answer on the end of the context (e.g. for fewshot examples)
+
+        Returns:
+            str: The constructed context. The default output context is
+                 formatted as follows: f'{self.prelimiter}{example[self.context_key]}{self.continuation_delimiter}'
+        """
+        ctxt = example[self.context_key]
+        ctxt = f'{self.prelimiter}{ctxt}'
+        if len(preceding_text) > 0:
+            ctxt = f'{self.example_delimiter}{ctxt}'
+        ctxt = f'{ctxt}{self.continuation_delimiter}'
+        if add_answer:
+            ctxt = f'{ctxt}{self.get_answer_from_example(example, in_context=add_answer)}'
+        return ctxt
+
+    def get_answer_from_example(self,
+                                example: Dict[str, Any],
+                                in_context: bool = False) -> str:
+        """Returns the answer from the example.
+
+        Args:
+            example (Dict): The example from which to retrieve the answer
+
+        Returns:
+            str: The answer in the example
+        """
+        cont = example[self.answer_key]
+        if self.prefix_space and not cont.startswith(' ') and not in_context:
+            cont = f' {cont}'
+        return cont
+
+    def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]:
+        """If the input_ids is empty then input_ids will be a 0-length List
+        unless the tokenizer adds special tokens to empty strings (e.g. OPT
+        tokenizer). If there is an EOS token added, we need to remove it so it
+        is not in the middle of the prompt, as the specific eval question's
+        prompt will follow the input_ids.
+
+        Args:
+            input_ids (List): The tokenized input
+
+        Returns:
+            input_ids: The tokenized input conditionally edited
+        """
+        if (self.tokenizer.eos_token_id is not None and len(input_ids) > 1 and
+                input_ids[-1] == self.tokenizer.eos_token_id):
+            input_ids = input_ids[:-1]
+        return input_ids
+
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
+        """Runs text through the tokenizer and handle special cases.
+
+        Args:
+            prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
+            ctxt (str): The specific example's derrived context
+            example (Dict): The example as a dictionary. Used for additional processing in inherited classes.
+
+        Returns:
+            Dict: Dictionary with the tokenized data
+        """
+        tokenized_example = {}
+        # Always add special tokens to preamble
+        preamble = self.tokenizer(prompt_and_fewshot)['input_ids']
+        assert isinstance(preamble, list)
+        preamble = self._fix_eos_on_preamble(preamble)
+        if self.strip_data:
+            # rstrip context because a prompt ending in a space results in degenerate output
+            ctxt = ctxt.rstrip()
+        # Never add special tokens to context
+        tokenized_context = self.tokenizer(
+            ctxt, add_special_tokens=False)['input_ids']
+        assert isinstance(preamble, list)
+        assert isinstance(tokenized_context, list)
+
+        tokenized_context = preamble + tokenized_context
+
+        if self.tokenize_labels:
+            # Never add special tokens to answer
+            tokenized_answer = self.tokenizer(
+                self.get_answer_from_example(example),
+                add_special_tokens=False)['input_ids']
+            assert isinstance(tokenized_answer, list)
+            trimmed_context = _trim_context(tokenized_context, tokenized_answer,
+                                            self.padding_size)
+            assert isinstance(trimmed_context, list)
+            continuation_indices = _get_continuation_span(
+                trimmed_context, tokenized_answer)
+            padded_context = _make_padded_input(trimmed_context,
+                                                tokenized_answer,
+                                                self.padding_size,
+                                                self.pad_tok_id,
+                                                self.padding_side)
+
+            tokenized_example[self.context_key] = padded_context
+            tokenized_example[self.answer_key] = tokenized_answer
+            tokenized_example['continuation_indices'] = continuation_indices
+        else:
+            assert isinstance(tokenized_context, list)
+            trimmed_context = _trim_context(
+                tokenized_context,
+                [],
+                self.padding_size,
+            )
+            assert isinstance(trimmed_context, list)
+            padded_context = _make_padded_input(trimmed_context, [],
+                                                self.padding_size,
+                                                self.pad_tok_id,
+                                                self.padding_side)
+
+            tokenized_example[self.context_key] = padded_context
+            tokenized_example[self.answer_key] = self.get_answer_from_example(
+                example)
+
+        return tokenized_example
+
+    def _prep_example(
+        self,
+        example: Dict,
+        example_idx: int,
+        num_fewshot: int,
+        prompt_string: str,
+        fewshot_rng: random.Random,
+    ) -> Dict[str, Any]:
+        """Prepares a single example from a HF Dataset into tokenized format
+        with prompt and fewshot examples.
+
+        Each task consists of a context and a continuation as well as an optional prompt and optional list of
+        example context/continuation pairs which precede the test context/continuation pair.
+
+        Args:
+            example (Dict): A Dictionary from the hf dataset
+            example_idx (int): The index of example
+            num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair
+            prompt_string (str): The prompt to prepend to all inputs
+            fewshot_rng (random.Random): Random number generator to use for fewshot sampling
+
+        Returns:
+            Dict: Contains a dictionary with the tokenized data
+        """
+        prompt_and_fewshot = self._generate_few_shot_prompt(
+            num_fewshot, example_idx, prompt_string, fewshot_rng)
+        ctxt = self.construct_context(example,
+                                      prompt_and_fewshot,
+                                      add_answer=False)
+        tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt,
+                                                  example)
+        return tokenized_example
+
+    def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """The function that the dataloader uses to accumulate data into
+        batches.
+
+        Args:
+            data (List): List of tokenized datapoints (dicts returned by self._tokenize_example)
+
+        Returns:
+            Dict: Dictionary for a single batch
+        """
+        batch = copy.deepcopy(self.base_batch)
+        for data_pair in data:
+            for batch_key, data_key in self.batch_mapping.items():
+                batch[batch_key].append(data_pair[data_key])
+            if 'continuation_indices' in data_pair:
+                batch['continuation_indices'].append(
+                    data_pair['continuation_indices'])
+
+        batch = convert_tokens_to_tensors(batch, self.tokenize_labels)
+        batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
+        return batch
+
+    def split_batch(self, batch: Any,
+                    microbatch_size: int) -> List[Dict[str, Any]]:
+        """Handling for certain specialty columns that must be split into
+        batches in different formats.
+
+        Args:
+            batch (Dict): Batch of data
+            microbatch_size (int): Size of microbatches
+
+        Returns:
+            List: List of chunked batches
+        """
+        # Don't split kwargs that don't change
+        # Normally split torch tensors
+        # List split lists of strings
+        chunked = {}
+        for k, v in batch.items():
+            if k in self.static_keys:
+                # Defer broadcasting until we know num_chunks
+                pass
+            elif k in self.list_keys:
+                chunked[k] = _split_list(v, microbatch_size)
+            elif k in self.tensor_keys:
+                chunked[k] = _default_split_batch(v, microbatch_size)
+            else:
+                raise ValueError(f'Unexpected key {k} in batch splitting')
+        num_chunks = len(chunked['input_ids'])
+        for k, v in batch.items():
+            if k in self.static_keys:
+                chunked[k] = [v] * num_chunks
+
+        batched_list = [
+            {k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)
+        ]
+        return batched_list
+
+
+class InContextLearningQATaskDataset(InContextLearningDataset):
+    """A dataset that constructs batches for in-context learning question
+    answering evaluation. QA tasks evaluate a model's ability to answer
+    questions using a consistent format.
 
     The input format is expected to be a jsonl file with the following fields:
     - context: The question
@@ -211,13 +717,23 @@ def __init__(self,
                  do_normalization: bool = True,
                  *args,
                  **kwargs):
+        warnings.warn(
+            ('InContextLearningQATaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningQATaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         if kwargs['tokenizer'].eos_token_id is None:
-            raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`')
+            raise ValueError(
+                '`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`'
+            )
         self.cot_delimiter = cot_delimiter
         self.has_cot = False
         self.max_answer_length = 0
         static_keys = [
-            'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria'
+            'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs',
+            'do_normalization', 'stopping_criteria'
         ]
         tensor_keys = ['input_ids', 'attention_mask']
         list_keys = ['labels']
@@ -248,7 +764,8 @@ def __init__(self,
             'input_ids': self.context_key,
             'labels': 'aliases',
         }
-        self.update_generation_kwargs(kwargs.get('generation_kwargs', {}))
+        if 'generation_kwargs' in kwargs:
+            self.update_generation_kwargs(kwargs['generation_kwargs'])
 
     def read_dataset(
         self,
@@ -257,14 +774,19 @@ def read_dataset(
         hf_loading_vars: Dict,
         hf_parsing_map: Dict,
     ) -> 'HFDataset':
-        dataset = super().read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map)
+        dataset = super().read_dataset(dataset_uri, destination_path,
+                                       hf_loading_vars, hf_parsing_map)
         self.has_cot = 'chain_of_thought' in dataset.features
         dataset = dataset.map(
             lambda examples: {
-                'context': examples['context'],
-                'answer': examples['answer'],
-                'aliases': set([examples['answer']] + examples.get('aliases', [])),
-                'chain_of_thought': examples.get('chain_of_thought', ''),
+                'context':
+                    examples['context'],
+                'answer':
+                    examples['answer'],
+                'aliases':
+                    set([examples['answer']] + examples.get('aliases', [])),
+                'chain_of_thought':
+                    examples.get('chain_of_thought', ''),
             })
         self.max_answer_length = self._get_max_answer_length(dataset)
         # NOTE: This is the only time we use the class variable padding_size.
@@ -285,7 +807,8 @@ def get_answer_from_example(self, example: Dict, in_context=False) -> str:
         else:
             return example[self.answer_key]
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
         """
         Run text through the tokenizer and handle special cases.
         Args:
@@ -296,7 +819,8 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
         Returns:
             Dict: Dictionary with the tokenized data
         """
-        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt,
+                                                     example)
         tokenized_example['aliases'] = list(example.get('aliases', []))
         return tokenized_example
 
@@ -309,16 +833,21 @@ def _get_max_answer_length(self, dataset) -> int:
         """
         max_answer_length = 0
         for example in dataset:
-            all_answers = [example[self.answer_key]] + list(example.get('aliases', []))
+            all_answers = [example[self.answer_key]] + list(
+                example.get('aliases', []))
             for answer in all_answers:
                 if self.has_cot:
-                    response = (f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}')
+                    response = (
+                        f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}'
+                    )
                 else:
                     response = answer
                 tokenized_repsonse = self.tokenizer(response)['input_ids']
                 assert isinstance(tokenized_repsonse, list)
-                max_answer_length = max(max_answer_length, len(tokenized_repsonse))
-        max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0)
+                max_answer_length = max(max_answer_length,
+                                        len(tokenized_repsonse))
+        max_answer_length = max_answer_length + (
+            _MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0)
         return max_answer_length
 
     def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
@@ -327,18 +856,20 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         stopping_criteria = None
         if self.early_stopping_criteria:
             if stop_sequences_criteria is None:  # pyright: ignore [reportUnnecessaryComparison]
-                raise MissingConditionalImportError(extra_deps_group='nlp',
-                                                    conda_package='transformers',
-                                                    conda_channel='conda-forge')
-            stopping_criteria = stop_sequences_criteria(self.tokenizer, self.early_stopping_criteria, batch_size)
+                raise MissingConditionalImportError(
+                    extra_deps_group='nlp',
+                    conda_package='transformers',
+                    conda_channel='conda-forge')
+            stopping_criteria = stop_sequences_criteria(
+                self.tokenizer, self.early_stopping_criteria, batch_size)
         batch['generation_kwargs']['stopping_criteria'] = stopping_criteria
         return batch
 
 
 class InContextLearningLMTaskDataset(InContextLearningDataset):
-    """
-    A dataset that constructs batches for in-context learning language modeling evaluation.
-    Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens.
+    """A dataset that constructs batches for in-context learning language
+    modeling evaluation. Language modeling tasks test a model's ability to
+    properly predict tokens based on preceding tokens.
 
     The input format is expected to be a jsonl file with the following fields:
     - context: Preceding text
@@ -348,9 +879,19 @@ class InContextLearningLMTaskDataset(InContextLearningDataset):
     """
 
     def __init__(self, *args, **kwargs):
+        warnings.warn(
+            ('InContextLearningLMTaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningLMTaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         super().__init__(answer_key='continuation',
                          static_keys=['mode'],
-                         tensor_keys=['input_ids', 'continuation_indices', 'labels', 'attention_mask'],
+                         tensor_keys=[
+                             'input_ids', 'continuation_indices', 'labels',
+                             'attention_mask'
+                         ],
                          base_batch={
                              'input_ids': [],
                              'continuation_indices': [],
@@ -367,8 +908,8 @@ def __init__(self, *args, **kwargs):
 
 
 class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset):
-    """
-    A dataset that construct batches for in-context learning multiple choice evaluation.
+    """A dataset that construct batches for in-context learning multiple choice
+    evaluation.
 
     If each question has N answer choices, we construct N distinct inputs per question. In order to ensure
     consistency across multi-GPU, we set the batch size to be `min(N, batch_size)` so that all N
@@ -399,6 +940,13 @@ def __init__(self,
                  list_of_primitives: Optional[List] = None,
                  *args,
                  **kwargs):
+        warnings.warn(
+            ('InContextLearningMultipleChoiceTaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningMultipleChoiceTaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         self.choices_key = choices_key
         base_batch = {
             'input_ids': [],
@@ -410,8 +958,11 @@ def __init__(self,
         }
         context_key = kwargs.pop('context_key', 'query')
         static_keys = kwargs.pop('static_keys', ['mode', 'generation_kwargs'])
-        tensor_keys = kwargs.pop('tensor_keys', ['input_ids', 'labels', 'attention_mask'])
-        self.list_of_tensors_keys = list_of_tensors_keys or ['continuation_indices']
+        tensor_keys = kwargs.pop('tensor_keys',
+                                 ['input_ids', 'labels', 'attention_mask'])
+        self.list_of_tensors_keys = list_of_tensors_keys or [
+            'continuation_indices'
+        ]
         self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings']
         self.list_of_primitives = list_of_primitives or ['gold_indices']
         super().__init__(context_key=context_key,
@@ -422,7 +973,10 @@ def __init__(self,
                          *args,
                          **kwargs)
         self.num_choices = len(self.dataset[0][self.choices_key])
-        self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'}
+        self.batch_mapping_per_choice = {
+            'input_ids': 'context',
+            'labels': 'context'
+        }
         self.batch_map_per_example = {'gold_indices': 'gold'}
 
     def get_answer_from_example(self, example: Dict, in_context=False) -> str:
@@ -438,7 +992,8 @@ def get_answer_from_example(self, example: Dict, in_context=False) -> str:
         gold_idx = example['gold']
         return choices[gold_idx]
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
         """
         Runs text through the tokenizer and handle special cases.
         Args:
@@ -459,7 +1014,8 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
             # rstrip context because a prompt ending in a space results in degenerate output
             ctxt = ctxt.rstrip()
         # Never add special tokens to context
-        tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids']
+        tokenized_context = self.tokenizer(
+            ctxt, add_special_tokens=False)['input_ids']
         assert isinstance(tokenized_context, list)
         tokenized_context = preamble + tokenized_context
 
@@ -472,12 +1028,15 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
                 choice = f' {choice}' if not choice.startswith(' ') else choice
 
             # Never add special tokens to answer
-            tokenized_answer = self.tokenizer(choice, add_special_tokens=False)['input_ids']
+            tokenized_answer = self.tokenizer(
+                choice, add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_context, list)
             assert isinstance(tokenized_answer, list)
-            trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size)
+            trimmed_context = _trim_context(tokenized_context, tokenized_answer,
+                                            self.padding_size)
             assert isinstance(trimmed_context, list)
-            continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer)
+            continuation_indices = _get_continuation_span(
+                trimmed_context, tokenized_answer)
             padded_context = _make_padded_input(
                 trimmed_context,
                 tokenized_answer,
@@ -488,16 +1047,17 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
 
             tokenized_example[self.context_key].append(padded_context)
             tokenized_example[self.answer_key].append(tokenized_answer)
-            tokenized_example['continuation_indices'].append(continuation_indices)
+            tokenized_example['continuation_indices'].append(
+                continuation_indices)
 
         tokenized_example['gold'] = example['gold']
         return tokenized_example
 
     def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """
-        The function that the dataloader uses to accumulate data into batches.
-        We run each distinct query + answer choice through the model separately and determine which
-        answer has the lowest per-token-perplexity.
+        """The function that the dataloader uses to accumulate data into
+        batches. We run each distinct query + answer choice through the model
+        separately and determine which answer has the lowest per-token-
+        perplexity.
 
         If each question has N possible choices, all N must be grouped together as distinct elements of the batch
         since the batch may consist of multiple questions, the choice_groupings indicates
@@ -515,7 +1075,8 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
             # NOTE: not using batch_mapping
             for i, context_enc in enumerate(data_pair[self.context_key]):
                 batch['input_ids'].append(context_enc)
-                batch['continuation_indices'].append(data_pair['continuation_indices'][i])
+                batch['continuation_indices'].append(
+                    data_pair['continuation_indices'][i])
                 batch['labels'].append(context_enc)
 
             batch['gold_indices'].append(data_pair['gold'])
@@ -529,9 +1090,10 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
     def get_num_samples_in_batch(self, batch) -> int:
         return batch['input_ids'].shape[0] // self.num_choices
 
-    def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
-        """
-        Split batch while ensuring all continuations are in the same microbatch.
+    def split_batch(self, batch: Any,
+                    microbatch_size: int) -> List[Dict[str, Any]]:
+        """Split batch while ensuring all continuations are in the same
+        microbatch.
 
         In ICL Multiple Choice, we duplicate each data point for each possible continuation.
         When splitting a batch, we have logical example, which refer to one possible question,
@@ -553,7 +1115,8 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
             elif type(v) == list:
                 # list of tensors - 'continuation_indices'
                 if k in self.list_of_tensors_keys:
-                    chunked[k] = _split_list(v, microbatch_size * self.num_choices)
+                    chunked[k] = _split_list(v,
+                                             microbatch_size * self.num_choices)
                 # list of tuples - 'choice_groupings'
                 elif k in self.list_of_tuples_keys:
                     chunked[k] = _split_list(v, microbatch_size)
@@ -563,7 +1126,8 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
                 else:
                     raise ValueError(f'Unexpected key {k} in list splitting')
             elif k in self.tensor_keys:
-                chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices)
+                chunked[k] = _default_split_batch(
+                    v, microbatch_size * self.num_choices)
             else:
                 raise ValueError(f'Unexpected key {k} in batch splitting')
         num_chunks = len(chunked['input_ids'])
@@ -572,14 +1136,19 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]:
             if k in self.static_keys:
                 chunked[k] = [v] * num_chunks
 
-        return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)]
+        return [
+            {k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)
+        ]
 
 
-class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset):
-    """A dataset that constructs batches for in-context learning schema evaluation.
-    A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word
-    to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847
-    to determine the model's choice of fill-in word.
+class InContextLearningSchemaTaskDataset(
+        InContextLearningMultipleChoiceTaskDataset):
+    """A dataset that constructs batches for in-context learning schema
+    evaluation. A schema task involves sentences with a fill-in-the-blank where
+    the user needs to choose the correct word to fill in from a set of N
+    options. We use the partial evaluation technique from
+    https://arxiv.org/abs/1806.02847 to determine the model's choice of fill-in
+    word.
 
     The default input format is a jsonl file with the following fields:
     - context_options: List of strings corresponding to possible preceding context options for the continuation
@@ -593,13 +1162,19 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData
     - labels: Identical to the input, used by the model to calculate loss/metrics
     - gold_indices: List of length ``batch_size // N`` indicating for each question, which of the answers is correct (via an integer [0, N-1])
     - choice_groupings: Indicates which indices of the batch correspond to which questions
-
     """
 
     def __init__(self, choices_key='context_options', *args, **kwargs):
         static_keys = ['mode']
         tensor_keys = ['input_ids', 'labels', 'attention_mask']
         list_of_tensors_keys = ['continuation_indices']
+        warnings.warn(
+            ('InContextLearningSchemaTaskDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningSchemaTaskDataset.'
+            ),
+            DeprecationWarning,
+        )
         super().__init__(choices_key=choices_key,
                          context_key=choices_key,
                          static_keys=static_keys,
@@ -616,9 +1191,12 @@ def __init__(self, choices_key='context_options', *args, **kwargs):
             'choice_groupings': [],
         }
 
-    def construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str:
-        """
-        Takes a example and constructs a context with the correct context for the example's continuation.
+    def construct_context(self,
+                          example,
+                          preceding_text: str = '',
+                          add_answer: bool = False) -> str:
+        """Takes a example and constructs a context with the correct context for
+        the example's continuation.
 
         Args:
             example (Dict): The example from which to construct the context
@@ -637,10 +1215,11 @@ def construct_context(self, example, preceding_text: str = '', add_answer: bool
         context = f'{context}{self.continuation_delimiter}{continuation}'
         return context
 
-    def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') -> List[str]:
-        """
-        Takes a example and constructs all contexts. Optionally, appends this to preceeding text (such as a
-        prompt or fewshot examples).
+    def _construct_multiple_contexts(self,
+                                     example: Dict,
+                                     preceding_text: str = '') -> List[str]:
+        """Takes a example and constructs all contexts. Optionally, appends this
+        to preceeding text (such as a prompt or fewshot examples).
 
         Args:
             example (Dict): The example from which to construct the context
@@ -655,7 +1234,10 @@ def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '')
                 cont_del = self.continuation_delimiter.rstrip()
             else:
                 cont_del = self.continuation_delimiter
-            context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options]
+            context_options = [
+                f'{self.example_delimiter}{c}{cont_del}'
+                for c in context_options
+            ]
         return context_options
 
     def _prep_example(
@@ -666,8 +1248,8 @@ def _prep_example(
         prompt_string: str,
         fewshot_rng: random.Random,
     ) -> Dict[str, Any]:
-        """
-        Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples.
+        """Prepares a single example from a HF Dataset into tokenized format
+        with prompt and fewshot examples.
 
         Each task consists of multiple contexts and a single, correct continuation. Will preprend fewshot examples and
         prompt if present.
@@ -682,14 +1264,17 @@ def _prep_example(
         Returns:
             Dict: Contains a dictionary with the tokenized data
         """
-        prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng)
+        prompt_and_fewshot = self._generate_few_shot_prompt(
+            num_fewshot, example_idx, prompt_string, fewshot_rng)
         ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot)
-        tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt,
+                                                  example)
         return tokenized_example
 
-    def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]:
-        """
-        Runs text through the tokenizer and handle special cases.
+    def tokenize_example(self, prompt_and_fewshot: str,
+                         context_options: List[str],
+                         example: Dict) -> Dict[str, Any]:
+        """Runs text through the tokenizer and handle special cases.
 
         Args:
             prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
@@ -704,14 +1289,18 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str],
         assert isinstance(preamble, list)
         preamble = self._fix_eos_on_preamble(preamble)
         encoded_contexts = [
-            preamble +  # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues]
-            self.tokenizer(c, add_special_tokens=False)['input_ids']  # pyright: ignore[reportOperatorIssue, ]
+            preamble
+            +  # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues]
+            self.tokenizer(c, add_special_tokens=False)[
+                'input_ids']  # pyright: ignore[reportOperatorIssue, ]
             for c in context_options
         ]
         continuation = example['continuation']
         if self.prefix_space:
-            continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation)
-        tokenized_continuation = self.tokenizer(continuation, add_special_tokens=False)['input_ids']
+            continuation = (f' {continuation}' if
+                            not continuation.startswith(' ') else continuation)
+        tokenized_continuation = self.tokenizer(
+            continuation, add_special_tokens=False)['input_ids']
 
         tokenized_example[self.context_key] = []
         tokenized_example['continuation_indices'] = []
@@ -719,13 +1308,19 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str],
         for context in encoded_contexts:
             assert isinstance(context, list)
             assert isinstance(tokenized_continuation, list)
-            trimmed_context = _trim_context(context, tokenized_continuation, self.padding_size)
+            trimmed_context = _trim_context(context, tokenized_continuation,
+                                            self.padding_size)
             assert isinstance(trimmed_context, list)
-            continuation_indices = _get_continuation_span(trimmed_context, tokenized_continuation)
-            padded_context = _make_padded_input(trimmed_context, tokenized_continuation, self.padding_size,
-                                                self.pad_tok_id, self.padding_side)
+            continuation_indices = _get_continuation_span(
+                trimmed_context, tokenized_continuation)
+            padded_context = _make_padded_input(trimmed_context,
+                                                tokenized_continuation,
+                                                self.padding_size,
+                                                self.pad_tok_id,
+                                                self.padding_side)
             tokenized_example[self.context_key].append(padded_context)
-            tokenized_example['continuation_indices'].append(continuation_indices)
+            tokenized_example['continuation_indices'].append(
+                continuation_indices)
             tokenized_example[self.answer_key].append(tokenized_continuation)
 
         tokenized_example['gold'] = example['gold']
@@ -733,8 +1328,8 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str],
 
 
 class InContextLearningCodeEvalDataset(InContextLearningDataset):
-    """
-    A dataset that constructs batches for in-context learning code evaluation.
+    """A dataset that constructs batches for in-context learning code
+    evaluation.
 
     The input format is expected to be a jsonl file with the following fields:
 
@@ -782,6 +1377,13 @@ def __init__(
         *args,
         **kwargs,
     ):
+        warnings.warn(
+            ('InContextLearningCodeEvalDataset is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningCodeEvalDataset.'
+            ),
+            DeprecationWarning,
+        )
         if generations_per_sample < pass_at_k:
             raise ValueError(
                 f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.'
@@ -799,8 +1401,13 @@ def __init__(
         # Linting complains if these are not set in init
         self.max_prompt_length = 0
         self.max_answer_length = 0
-        static_keys = ['mode', 'pass_at_k', 'generation_length', 'generation_kwargs']
-        list_keys = ['prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'languages', 'labels']
+        static_keys = [
+            'mode', 'pass_at_k', 'generation_length', 'generation_kwargs'
+        ]
+        list_keys = [
+            'prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs',
+            'languages', 'labels'
+        ]
         tensor_keys = ['input_ids', 'attention_mask']
         super().__init__(
             context_key='prompt',
@@ -819,7 +1426,8 @@ def __init__(
         self.dataset = self.dataset.map(self._trim_padding)
         self.base_batch = {
             'input_ids': [],
-            'mode': 'generate',
+            'mode':
+                'generate',
             'labels': [],
             'prompts': [],
             'tests': [],
@@ -827,8 +1435,11 @@ def __init__(
             'test_inputs': [],
             'test_outputs': [],
             'languages': [],
-            'pass_at_k': pass_at_k,
-            'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length),
+            'pass_at_k':
+                pass_at_k,
+            'generation_length':
+                min(self.max_answer_length,
+                    self.max_seq_len - self.max_prompt_length),
             'generation_kwargs': {
                 'pad_token_id': self.pad_tok_id,
                 'num_beams': 1,  # single beam
@@ -838,11 +1449,12 @@ def __init__(
                 'eos_token_id': self.tokenizer.eos_token_id
             }
         }
-        self.update_generation_kwargs(kwargs.get('generation_kwargs', {}))
+        if 'generation_kwargs' in kwargs:
+            self.update_generation_kwargs(kwargs['generation_kwargs'])
 
     def _set_max_prompt_and_answer_lengths(self):
-        """
-        Iterates through the dataset and finds the maximum prompt length and sequence lengths
+        """Iterates through the dataset and finds the maximum prompt length and
+        sequence lengths.
 
         Returns:
             None
@@ -851,10 +1463,15 @@ def _set_max_prompt_and_answer_lengths(self):
         max_answer_length = 0
         for example in self.dataset:
             assert isinstance(example, Dict)
-            unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id]
+            unpadded_example = [
+                token for token in example[self.context_key]
+                if token != self.pad_tok_id
+            ]
             max_prompt_length = max(max_prompt_length, len(unpadded_example))
 
-            tokenized_answer = self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids']
+            tokenized_answer = self.tokenizer(
+                example['canonical_solution'],
+                add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_answer, list)
             len_tokenized_answer = len(tokenized_answer)
             max_answer_length = max(max_answer_length, len_tokenized_answer)
@@ -863,29 +1480,35 @@ def _set_max_prompt_and_answer_lengths(self):
         self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH
 
     def _trim_padding(self, example: Dict):
-        """
-        Adjusts padding to the maximum prompt length rather than max_seq_len.
-        Needs to be done after the dataset has been processed because we don't know the maximum
-        prompt length until after we've tokenized it.
+        """Adjusts padding to the maximum prompt length rather than max_seq_len.
+        Needs to be done after the dataset has been processed because we don't
+        know the maximum prompt length until after we've tokenized it.
 
         Returns:
             dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key]
         """
         # Remove padding tokens applied during tokenization
-        unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id]
+        unpadded_prompt = [
+            token for token in example[self.context_key]
+            if token != self.pad_tok_id
+        ]
         # Reapply padding only to max_prompt_length
         full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length)
-        padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side)
+        padded_context = _make_padded_input(full_prompt, [],
+                                            self.max_prompt_length,
+                                            self.pad_tok_id, self.padding_side)
 
         example[self.context_key] = padded_context
         return example
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
-        """
-        Adds extra code task details to the example dictionary.
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
+        """Adds extra code task details to the example dictionary.
+
         See InContextLearningDataset for more details
         """
-        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt,
+                                                     example)
         tokenized_example['prompt_text'] = example['prompt']
         tokenized_example['task_id'] = example['task_id']
         tokenized_example['canonical_solution'] = example['canonical_solution']
@@ -919,9 +1542,9 @@ def build_icl_dataloader(
         generation_kwargs: Dict,
         early_stopping_criteria: Optional[List[str]] = None,
         do_normalization: bool = True) -> DataSpec:
-    """
-    Factory method that builds the specific dataset for the specified icl_task_type.
-    See documentation for `get_icl_task_dataloader` for arugment documentation.
+    """Factory method that builds the specific dataset for the specified
+    icl_task_type. See documentation for `get_icl_task_dataloader` for arugment
+    documentation.
 
     When writing a dataset for a new task, here you will need to:
         1. add the dataset to the factory and choose an appropriate string
@@ -929,6 +1552,13 @@ def build_icl_dataloader(
             this might be different)
         3. set the `split_batch` funciton if necessary
     """
+    warnings.warn(
+        ('build_icl_dataloader is deprecated and will be removed in a future '
+         'release. Its functionality has been reimplemented '
+         'in llmfoundry.eval.datasets.in_context_learning_evaluation.build_icl_dataloader.'
+        ),
+        DeprecationWarning,
+    )
     if icl_task_type == 'multiple_choice':
         dataset = InContextLearningMultipleChoiceTaskDataset(
             dataset_uri=dataset_uri,
@@ -1052,10 +1682,12 @@ def build_icl_dataloader(
     )
 
 
-def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: Dict,
+def partition_dataset_by_category(dataset_uri: str, destination_path: str,
+                                  hf_loading_vars: Dict,
                                   hf_parsing_map: Dict) -> Dict[str, str]:
-    """
-    If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file.
+    """If has_categories is enabled, we partition the dataset into a separate
+    dataset for each category value in the data and write each partition to a
+    local file.
 
     Args:
         dataset_uri (str): Location of dataset.
@@ -1068,8 +1700,10 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
         Dict[str, str]: Mapping of category names to partitioned dataset local files names.
     """
     try:
-        from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
-        from datasets import IterableDataset, load_dataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import \
+            Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import (  # pyright: ignore[reportGeneralTypeIssues]
+            IterableDataset, load_dataset)
     except ImportError as e:
         raise MissingConditionalImportError(
             extra_deps_group='nlp',
@@ -1079,26 +1713,35 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
     if dataset_uri.startswith('hf://'):
         dataset_uri = dataset_uri.replace('hf://', '')
         dataset = load_dataset(dataset_uri, **hf_loading_vars)
-        assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset)
+        assert isinstance(dataset, HFDataset) or isinstance(
+            dataset, IterableDataset)
         if hf_parsing_map:
             dataset_parsing_func = lambda example: {
-                k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items()
+                k: ' '.join([str(example[col]) for col in v])
+                for k, v in hf_parsing_map.items()
             }
             assert hasattr(dataset, 'column_names')
-            dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names)
+            dataset = dataset.map(dataset_parsing_func,
+                                  remove_columns=dataset.column_names)
     else:
         with dist.local_rank_zero_download_and_wait(destination_path):
             if dist.get_local_rank() == 0:
                 get_file(dataset_uri, destination_path, overwrite=True)
-        dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False)
-    assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset)
+        dataset = load_dataset('json',
+                               data_files=destination_path,
+                               split='train',
+                               streaming=False)
+    assert isinstance(dataset, HFDataset) or isinstance(dataset,
+                                                        IterableDataset)
     assert hasattr(dataset, 'features')
     assert dataset.features is not None
     if 'category' not in dataset.features.keys():
         raise Exception(f"""Attempted to partition dataset by `category` \
             but it doesn't have a `category` key. \
             Got keys: {str(list(dataset.features.keys()))}""")
-    categories = sorted(set(dataset['category']))  # pyright: ignore[reportIndexIssue, reportGeneralTypeIssues]
+    categories = sorted(
+        set(dataset['category']
+           ))  # pyright: ignore[reportIndexIssue, reportGeneralTypeIssues]
     output_files = {}
     for cat in categories:
         path = destination_path.split('/')
@@ -1107,7 +1750,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
         gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
         if dist.get_local_rank() == 0:
             subset = [
-                l for l in dataset if l['category'] == cat  # pyright: ignore[reportGeneralTypeIssues]
+                l for l in dataset if
+                l['category'] == cat  # pyright: ignore[reportGeneralTypeIssues]
             ]  # pyright: ignore[reportArgumentType, reportCallIssue]
             with open(gathered_paths[0], 'w', encoding='utf8') as f:
                 for l in subset:
@@ -1119,7 +1763,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo
 def get_icl_task_dataloader(
         icl_task_type: str,
         dataset_uri: str,
-        tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast],
+        tokenizer: Union[transformers.PreTrainedTokenizer,
+                         transformers.PreTrainedTokenizerFast],
         batch_size: int,
         max_seq_len: int,
         pad_tok_id: int,
@@ -1139,7 +1784,9 @@ def get_icl_task_dataloader(
         generation_kwargs: Optional[Dict] = None,
         early_stopping_criteria: Optional[List[str]] = None,
         do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]:
-    """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below:
+    """This constructs a dataloader (or dataloaders if has_categories is True)
+    capable of evaluating LLMs on in-context learning language modeling tasks,
+    for example LAMBADA. An example usage is below:
 
         .. testsetup::
 
@@ -1217,6 +1864,13 @@ def get_icl_task_dataloader(
     Returns:
         DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
     """
+    warnings.warn(
+        ('get_icl_task_dataloader is deprecated and will be removed in a future '
+         'release. Its functionality has been reimplemented '
+         'in llmfoundry.eval.datasets.in_context_learning_evaluation.get_icl_task_dataloader.'
+        ),
+        DeprecationWarning,
+    )
     if hf_loading_vars is None:
         hf_loading_vars = {}
     if hf_parsing_map is None:
@@ -1228,7 +1882,10 @@ def get_icl_task_dataloader(
 
     if has_categories:
         result_dls = {}
-        output_files = partition_dataset_by_category(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map)
+        output_files = partition_dataset_by_category(dataset_uri,
+                                                     destination_path,
+                                                     hf_loading_vars,
+                                                     hf_parsing_map)
         categories = sorted(output_files.keys())
         for category in categories:
             partition_uri = output_files[category]
@@ -1279,4 +1936,4 @@ def get_icl_task_dataloader(
             generation_kwargs=generation_kwargs,
             early_stopping_criteria=early_stopping_criteria,
             do_normalization=do_normalization,
-        )
\ No newline at end of file
+        )
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index bef7d2f3c0..065c90306f 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
@@ -12,29 +15,49 @@
 
 import numpy as np
 import torch
+from composer.metrics.nlp import InContextLearningMetric
+from composer.utils.eval_client import (EvalClient, LambdaEvalClient,
+                                        LocalEvalClient,
+                                        MosaicMLLambdaEvalClient)
 from torch import Tensor
 from torch.nn import functional as F
-from composer.metrics.nlp import InContextLearningMetric
-from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient, MosaicMLLambdaEvalClient
 
 log = logging.getLogger(__name__)
 
 __all__ = [
+    'InContextLearningMetric',
     'InContextLearningLMAccuracy',
     'InContextLearningMultipleChoiceAccuracy',
     'InContextLearningQAAccuracy',
     'InContextLearningCodeEvalAccuracy',
-    'BinaryF1Score',
-    'LanguageCrossEntropy',
-    'MaskedAccuracy',
-    'LanguagePerplexity',
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMCExpectedCalibrationError',
 ]
 
 
+class InContextLearningMetric(
+        InContextLearningMetric
+):  # TODO: this is a temporary solution until Max deprecates composer's superclass entirely
+
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        """Abstract interface for computing an in-context learning metrics.
+
+        Args:
+            batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed
+                to compute the metric.
+            output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids`
+            labels (torch.Tensor): The correct outputs.
+
+        Raises:
+            NotImplementedError: Abstract method must be implemented by subclasses
+        """
+        raise NotImplementedError
+
+
 class InContextLearningQAAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) question answering (QA) tasks.
+    r"""Computes accuracy for In-context learning (ICL) question answering (QA)
+    tasks.
 
     ICL QA tasks consist of some number of example question answering tasks (referred to as the 'context'), followed by a test task where the model must
     match one of the possible answer aliases (referred to as the 'continuation').
@@ -60,8 +83,16 @@ class InContextLearningQAAccuracy(InContextLearningMetric):
 
     def __init__(self, dist_sync_on_step: bool = False):
         # state from multiple processes
+        warnings.warn(
+            ('InContextLearningQAAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningQAAccuracy.'),
+            DeprecationWarning,
+        )
         super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('correct',
+                       default=torch.tensor(0.),
+                       dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
     def normalize_answer(self, answer: str):
@@ -77,7 +108,8 @@ def white_space_fix(text: str) -> str:
             return ' '.join(text.split())
 
         def handle_punc(text: str) -> str:
-            exclude = set(string.punctuation + ''.join([u'‘', u'’', u'´', u'`']))
+            exclude = set(string.punctuation +
+                          ''.join([u'‘', u'’', u'´', u'`']))
             return ''.join(ch if ch not in exclude else ' ' for ch in text)
 
         def lower(text: str) -> str:
@@ -86,9 +118,18 @@ def lower(text: str) -> str:
         def replace_underscore(text: str) -> str:
             return text.replace('_', ' ')
 
-        return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(answer))))).strip()
-
-    def update(self, batch: Optional[Dict[str, Any]], outputs: List[str], labels: List[List[str]]):
+        return white_space_fix(
+            remove_articles(handle_punc(lower(
+                replace_underscore(answer))))).strip()
+
+    def update(
+        self,
+        batch: Optional[Dict[str, Any]],
+        outputs: List[str],
+        labels: List[List[str]],
+    ):
+        if batch is None:
+            batch = {}
         cot_delimiter = batch.get('cot_delimiter', '')
         do_normalization = batch.get('do_normalization', True)
         stopping_criteria = batch.get('stopping_criteria', None)
@@ -96,19 +137,24 @@ def update(self, batch: Optional[Dict[str, Any]], outputs: List[str], labels: Li
             final_answer = sample_output
 
             if stopping_criteria is not None and len(stopping_criteria) > 0:
-                final_answer = re.split('|'.join(stopping_criteria), final_answer)[0]
+                final_answer = re.split('|'.join(stopping_criteria),
+                                        final_answer)[0]
 
             if cot_delimiter is not None and len(cot_delimiter) > 0:
                 final_answer = final_answer.split(cot_delimiter)[-1]
 
             if do_normalization:
                 cleaned_final_answer = self.normalize_answer(final_answer)
-                cleaned_sample_labels = {self.normalize_answer(label) for label in sample_labels}
+                cleaned_sample_labels = {
+                    self.normalize_answer(label) for label in sample_labels
+                }
             else:
                 cleaned_final_answer = final_answer
                 cleaned_sample_labels = set(sample_labels)
 
-            if any(cleaned_final_answer.startswith(label) for label in cleaned_sample_labels):
+            if any(
+                    cleaned_final_answer.startswith(label)
+                    for label in cleaned_sample_labels):
                 self.correct += torch.tensor(1.0)
             self.total += torch.tensor(1.0)
 
@@ -119,7 +165,8 @@ def compute(self):
 
 
 class InContextLearningLMAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) language modeling (LM) tasks.
+    r"""Computes accuracy for In-context learning (ICL) language modeling (LM)
+    tasks.
 
     ICL LM tasks consist of some number of example language modeling tasks (referred to as the 'context'), followed by a test task where the model must correctly predict all the tokens
     following tokens in some passage (referred to as the 'continuation').
@@ -143,15 +190,26 @@ class InContextLearningLMAccuracy(InContextLearningMetric):
     full_state_update = False
 
     def __init__(self, dist_sync_on_step: bool = False):
+        warnings.warn(
+            ('InContextLearningLMAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningLMAccuracy.'),
+            DeprecationWarning,
+        )
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('correct',
+                       default=torch.tensor(0.),
+                       dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
-    def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor):
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1)
-            cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1)
+            cont_tok_pred = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1).argmax(dim=-1)
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
 
             self.correct += (cont_tok_pred == cont_tok_targ).all().int()
             self.total += torch.tensor(1.0)
@@ -162,59 +220,6 @@ def compute(self):
         return self.correct / self.total
 
 
-class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) multiple choice (MC) tasks.
-
-    ICL MC tasks consists of a series of questions with some number of possible choices (only one of which can be correct).
-    At inference time each possible choice is given to the model as a separate input and the one for which the model assigns
-    the lowest perplexity to the choice is considered the model's choice. The model is correct if it "chooses" the right answer.
-
-    Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->`
-    Continuation: `green`
-
-    Adds metric state variables:
-        correct (float): The number of instances where the prediction masked the target.
-        total (float): The number of total instances that were predicted.
-
-    Args:
-        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
-            each forward() before returning the value at the step. Default: ``False``.
-    """
-
-    # Make torchmetrics call update only once
-    full_state_update = False
-
-    def __init__(self, dist_sync_on_step: bool = False):
-        # state from multiple processes
-        super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.0), dist_reduce_fx='sum')
-        self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
-
-    def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor):
-        perplexities = []
-        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            # continuation indices refer to indices in the original input's token space
-            cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1)
-            # labels have been shifted left by one index, so the cont_idx needs to be shifted as well.
-            cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1)
-            cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ)
-            perplexity = torch.exp(cross_entropy)
-            perplexities.append(perplexity)
-
-        for (start, end), gold_idx in zip(batch['choice_groupings'], batch['gold_indices']):
-            subset = perplexities[start:end]
-            idx_min = subset.index(min(subset))
-
-            if idx_min == gold_idx:
-                self.correct += torch.tensor(1.0)
-            self.total += torch.tensor(1.0)
-
-    def compute(self):
-        assert isinstance(self.correct, Tensor)
-        assert isinstance(self.total, Tensor)
-        return self.correct.float() / self.total
-
-
 class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
 
@@ -239,9 +244,18 @@ class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     full_state_update = False
 
     def __init__(self, dist_sync_on_step: bool = False):
+        warnings.warn(
+            ('InContextLearningCodeEvalAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningCodeEvalAccuracy.'
+            ),
+            DeprecationWarning,
+        )
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.add_state('correct',
+                       default=torch.tensor(0.),
+                       dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
         self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
@@ -269,8 +283,9 @@ def get_client(self) -> EvalClient:
                 'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
                 'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
         else:
-            raise ValueError('Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
-                             f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
+            raise ValueError(
+                'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
+                f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
 
         return client
 
@@ -286,7 +301,8 @@ def estimator(self, n: int, c: int, k: int) -> float:
             return 1.0
         return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
 
-    def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
+    def update(self, batch: Dict[str, Any], outputs: List[str],
+               labels: List[str]):
         """Updates the pass@k accuracy of code generation.
 
         Given a batch of prompts, test cases, and code generations, evaluates the code generations
@@ -316,16 +332,20 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
         pass_at_k = batch['pass_at_k']
         num_generations = batch['generation_kwargs']['num_return_sequences']
         processed_outputs = [
-            outputs[i * num_generations:(i + 1) * num_generations] for i in range(len(batch['prompts']))
+            outputs[i * num_generations:(i + 1) * num_generations]
+            for i in range(len(batch['prompts']))
         ]
         payloads = []
         for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
-                processed_outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'], batch['entry_points'],
+                processed_outputs, batch['prompts'], batch['test_inputs'],
+                batch['test_outputs'], batch['entry_points'],
                 batch['languages']):
             self.total += torch.tensor(1.0)
             prompt_payload = []
             for code_gen in sample_outputs:
-                code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0]  # remove everything after function ends
+                code_gen = re.split(
+                    r'\n[A-Za-z0-9#`]',
+                    code_gen)[0]  # remove everything after function ends
                 final_code = sample_prompt + code_gen  # combine prompt with the code generation
                 generation_payload = []
                 for test_input, test_output in zip(test_inputs, test_outputs):
@@ -349,7 +369,8 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
                 if correct:
                     num_correct += 1
 
-            pass_at_k_rate = self.estimator(num_generations, num_correct, pass_at_k)
+            pass_at_k_rate = self.estimator(num_generations, num_correct,
+                                            pass_at_k)
             self.correct += torch.tensor(pass_at_k_rate)
 
         client.close()  # pyright: ignore [reportOptionalMemberAccess]
@@ -357,4 +378,214 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
-        return self.correct / self.total
\ No newline at end of file
+        return self.correct / self.total
+
+
+class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
+    r"""Computes accuracy for In-context learning (ICL) multiple choice (MC)
+    tasks.
+
+    ICL MC tasks consists of a series of questions with some number of possible choices (only one of which can be correct).
+    At inference time each possible choice is given to the model as a separate input and the one for which the model assigns
+    the lowest perplexity to the choice is considered the model's choice. The model is correct if it "chooses" the right answer.
+
+    Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->`
+    Continuation: `green`
+
+    Adds metric state variables:
+        correct (float): The number of instances where the prediction masked the target.
+        total (float): The number of total instances that were predicted.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(self, dist_sync_on_step: bool = False):
+        warnings.warn(
+            ('InContextLearningMultipleChoiceAccuracy is deprecated and will be removed in a future '
+             'release. Its functionality has been reimplemented '
+             'in llmfoundry.eval.metrics.nlp.InContextLearningMultipleChoiceAccuracy.'
+            ),
+            DeprecationWarning,
+        )
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state('correct',
+                       default=torch.tensor(0.0),
+                       dist_reduce_fx='sum')
+        self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
+
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        perplexities = []
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            # continuation indices refer to indices in the original input's token space
+            cont_tok_logits = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1)
+            # labels have been shifted left by one index, so the cont_idx needs to be shifted as well.
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ)
+            perplexity = torch.exp(cross_entropy)
+            perplexities.append(perplexity)
+
+        for (start, end), gold_idx in zip(batch['choice_groupings'],
+                                          batch['gold_indices']):
+            subset = perplexities[start:end]
+            idx_min = subset.index(min(subset))
+
+            if idx_min == gold_idx:
+                self.correct += torch.tensor(1.0)
+            self.total += torch.tensor(1.0)
+
+    def compute(self):
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct.float() / self.total
+
+
+class InContextLearningExpectedCalibrationError(InContextLearningMetric):
+    """Generic class for Expected Calibration Error (ECE) (cite:
+    https://arxiv.org/pdf/1706.04599.pdf).
+
+    Expected calibration error is calculated by dividing predictions into buckets based on the model's confidence (a probability value between 0 and 1).
+    We then calculate the accuracy within each bucket and calculate the average gap between confidence and accuracy
+    across buckets, weighted by the number of samples in each bucket.
+
+    Each task must implement its own definition of "confidence" to be computed via the `update` method.
+
+    Adds metric state variables:
+    bucket_totals (float): The number of instances where the prediction masked the target per bucket.
+    bucket_correct (float): The number of total instances that were predicted per bucket.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+        n_buckets (int): Number of distinct buckets to split the confidence distribution into
+    """
+
+    def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
+        warnings.warn(
+            ('InContextLearningExpectedCalibrationError is deprecated and will be removed in a future '
+             'release.'),
+            DeprecationWarning,
+        )
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.n_buckets = n_buckets
+        if n_buckets < 1:
+            raise Exception('`n_buckets`')
+        self.add_state('bucket_totals',
+                       default=torch.zeros(n_buckets),
+                       dist_reduce_fx='sum')
+        self.add_state('bucket_correct',
+                       default=torch.zeros(n_buckets),
+                       dist_reduce_fx='sum')
+
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        pass
+
+    def compute(self):
+        assert isinstance(self.bucket_correct, Tensor)
+        assert isinstance(self.bucket_totals, Tensor)
+
+        result = torch.tensor(0.0, device=self.bucket_correct.device)
+        total_obs = torch.sum(self.bucket_totals)
+        for i in range(self.n_buckets):
+            if self.bucket_totals[i] == 0:
+                continue
+
+            acc_bucket_i = self.bucket_correct[i] / self.bucket_totals[i]
+            upper_bound = (i + 1) / self.n_buckets
+            lower_bound = i / self.n_buckets
+            conf_bucket_i = torch.tensor((upper_bound + lower_bound) / 2,
+                                         device=self.bucket_correct.device)
+            result += (self.bucket_totals[i] /
+                       total_obs) * torch.abs(acc_bucket_i - conf_bucket_i)
+        return result
+
+
+class InContextLearningMCExpectedCalibrationError(
+        InContextLearningExpectedCalibrationError):
+    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+    multiple choice (MC) tasks. (source: https://arxiv.org/abs/2012.00955).
+
+    For MC tasks, the model confidence is defined as the softmax of average per-token probability assigned to the top question choice.
+
+    See `InContextLearningExpectedCalibrationError` for more info.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def update(self, batch: Dict[str, Any], output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        output_logits = torch.softmax(output_logits, dim=2)
+        probabilites = []
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            cont_tok_logits = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1)
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            probability = cont_tok_logits.index_select(
+                dim=1, index=cont_tok_targ).diagonal().mean()
+            probabilites.append(probability)
+
+        for (start, end), gold_idx in zip(batch['choice_groupings'],
+                                          batch['gold_indices']):
+            subset = probabilites[start:end]
+            idx_max = subset.index(max(subset))
+            confidence = torch.tensor(subset).max() / torch.tensor(subset).sum()
+
+            assert confidence >= 0.0 and confidence <= 1.0
+            bucket_idx = int(confidence * self.n_buckets)
+            if bucket_idx == self.n_buckets:
+                bucket_idx -= 1
+
+            if idx_max == gold_idx:
+                self.bucket_correct[
+                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+            self.bucket_totals[
+                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+
+class InContextLearningLMExpectedCalibrationError(
+        InContextLearningExpectedCalibrationError):
+    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+    language modeling (LM) tasks. (cite: https://arxiv.org/pdf/1706.04599.pdf).
+
+    For LM tasks, the model confidence is defined as the minimum probability assigned to all tokens in the continuation.
+
+    See `InContextLearningExpectedCalibrationError` for more info.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def update(self, batch: Dict[str, Any], output_logits: torch.Tensor,
+               labels: torch.Tensor):
+        output_logits = torch.softmax(output_logits, dim=2)
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            cont_tok_logits = output_logits[batch_idx].index_select(
+                dim=0, index=cont_idx - 1)
+            cont_tok_pred = cont_tok_logits.argmax(dim=-1)
+            confidence = cont_tok_logits.max(dim=-1).values.min()
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            assert confidence >= 0.0 and confidence <= 1.0
+            bucket_idx = int(confidence * self.n_buckets)
+            if bucket_idx == self.n_buckets:
+                bucket_idx -= 1
+
+            if (cont_tok_pred == cont_tok_targ).all():
+                self.bucket_correct[
+                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+            self.bucket_totals[
+                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index e51d7c55d0..3ff5e15eed 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -9,12 +9,6 @@
 from typing import TYPE_CHECKING, Any, Dict, Mapping
 
 # required for loading a python model into composer
-import transformers
-from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy,
-                                  InContextLearningLMAccuracy,
-                                  InContextLearningMultipleChoiceAccuracy,
-                                  InContextLearningQAAccuracy,
-                                  )
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models.huggingface import peft_installed
 from composer.utils import dist
@@ -22,6 +16,9 @@
 from transformers import (AutoConfig, AutoModelForCausalLM, PreTrainedModel,
                           PreTrainedTokenizerBase)
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
 from llmfoundry.models.layers.attention import is_flash_v2_installed
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index 30be41e022..19e4dc8e6e 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -6,16 +6,15 @@
 import torch
 from composer.core.types import Batch
 from composer.metrics import InContextLearningMetric
-from llmfoundry.eval.metrics.nlp import (InContextLearningLMAccuracy,
-                                  InContextLearningMultipleChoiceAccuracy,
-                                  InContextLearningQAAccuracy,
-                                  )
-
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models import ComposerModel
 from torchmetrics import Metric
 from transformers import AutoTokenizer
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy,
+    InContextLearningQAAccuracy)
+
 
 class InferenceAPIEvalWrapper(ComposerModel):
 
@@ -28,9 +27,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer):
             LanguagePerplexity(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningQAAccuracy(),
-            InContextLearningLMExpectedCalibrationError(),
-            InContextLearningMCExpectedCalibrationError()
+            InContextLearningQAAccuracy()
         ]
         self.eval_metrics = {
             metric.__class__.__name__: metric for metric in eval_metrics
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 2dbbfb506d..2e3d256cc5 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -16,14 +16,13 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy,
-                              InContextLearningLMAccuracy,
-                              InContextLearningMultipleChoiceAccuracy,
-                              InContextLearningQAAccuracy)
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models import HuggingFaceModel
 from composer.utils import dist
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 from llmfoundry.models.layers.attention import (is_flash_v1_installed,
                                                 is_flash_v2_installed)
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 6a5ab983bc..9b6e9d869a 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -15,8 +15,6 @@
                                 MemoryMonitor, OptimizerMonitor,
                                 RuntimeEstimator, SpeedMonitor)
 from composer.core import Algorithm, Callback, Evaluator
-from llmfoundry.eval.datasets.in_context_learning_evaluation import \
-    get_icl_task_dataloader
 from composer.loggers import (InMemoryLogger, LoggerDestination, MLFlowLogger,
                               TensorboardLogger, WandBLogger)
 from composer.optim import DecoupledAdamW
@@ -35,6 +33,8 @@
                                   LayerFreezing, MonolithicCheckpointSaver,
                                   ScheduledGarbageCollector)
 from llmfoundry.data.dataloader import build_dataloader
+from llmfoundry.eval.datasets.in_context_learning_evaluation import \
+    get_icl_task_dataloader
 from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
                               DecoupledLionW, DecoupledLionW_8bit)
 from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index d1b641ead6..8c495a4316 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,22 +1,20 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: migrate_subclasses_to_foundry # v0.4.0
+  git_branch: migrate_subclasses_to_foundry  # v0.4.0
   # git_commit:  # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false  # Should be true if using a private repo
 
 command: |
   cd llm-foundry/scripts
-  pip uninstall mosaicml -y
-  pip install git+https://github.com/bmosaicml/composer.git@remove_subclasses_from_composer 
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
-run_name: mpt-eval
+name: mpt-eval
 gpu_num: 8
 gpu_type: a100_80gb
-cluster: r1z1 # replace with your cluster here!
+cluster: r1z1  # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index ab762d55bc..ec7632bedd 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -1,34 +1,582 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
+import contextlib
 import os
+import random
 import types
 from pathlib import Path
 
 import pytest
 import torch
-from torch.utils.data import DataLoader
-
 from composer import Evaluator
 from composer.core import DataSpec
+from torch.utils.data import DataLoader
 
 # isort: off
 from llmfoundry.eval.datasets.in_context_learning_evaluation import (
-    InContextLearningCodeEvalDataset,
-    InContextLearningMultipleChoiceTaskDataset,
-    InContextLearningQATaskDataset,
-    InContextLearningSchemaTaskDataset,
-    get_icl_task_dataloader,
-)
+    InContextLearningDataset, InContextLearningCodeEvalDataset,
+    InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset,
+    InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,
+    _tokenizer_needs_prefix_space, _trim_context, _get_continuation_span,
+    _get_fewshot_sample_idxs, _make_padded_input)
 # isort: on
+from composer.datasets.utils import MultiTokenEOSCriteria
 from composer.loggers import InMemoryLogger
-from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-                              InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 from composer.models import HuggingFaceModel
 from composer.trainer import Trainer
 from composer.utils import dist, reproducibility
 
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+
+
+def test_strip_data():
+    data_to_strip = {
+        'strip_data': '  boo!  \n',
+        'has_space': '  wa hoo!',
+        'end_space': 'yoohoo!  '
+    }
+    stripped_data = strip_data(data_to_strip)
+    for k, v in stripped_data.items():
+        assert k in data_to_strip
+        assert not v[0].isspace()
+        assert not v[-1].isspace()
+
+
+@pytest.mark.skip(
+    reason="Currently don't have a tokenizer that satisfies this test")
+def test_tokenizer_needs_prefix_space_when_space_not_needed(
+        tiny_gpt2_tokenizer):
+    assert not _tokenizer_needs_prefix_space(tiny_gpt2_tokenizer)
+
+
+def test_tokenizer_needs_prefix_space_when_space_needed():
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m',
+        use_fast=False)  # type: ignore reportUnboundVariable
+    assert _tokenizer_needs_prefix_space(tokenizer)
+
+
+def test_trim_context():
+    context = [0] * 99 + [1] * 2037
+    continuation = [2] * 10
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context,
+                                    continuation,
+                                    max_seq_len=max_seq_len)
+    assert len(trimmed_context) == 2038
+    assert trimmed_context[0] == 0
+    assert trimmed_context[1] == 1
+
+
+def test_trim_context_no_continuation():
+    context = [0] * 2048
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+    assert len(trimmed_context) == 2048
+    context = [0] * 3000 + [1]
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+    assert len(trimmed_context) == 2048
+    assert trimmed_context[-1] == 1
+
+
+def test_get_continuation_span():
+    context = [0] * 200
+    continuation = [1] * 3
+    cont_span = _get_continuation_span(context, continuation)
+    assert torch.all(torch.eq(cont_span, torch.tensor([200, 201, 202])))
+    continuation = [1]
+    cont_span = _get_continuation_span(context, continuation)
+    assert torch.all(torch.eq(cont_span, torch.tensor([200])))
+
+
+@pytest.mark.parametrize('padding_side', ['left', 'right', 'middle'])
+def test_make_padding(tiny_gpt2_tokenizer, padding_side):
+    context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
+    padding_id = tiny_gpt2_tokenizer.eos_token_id
+
+    error_context = contextlib.nullcontext() if padding_side in {
+        'left', 'right'
+    } else pytest.raises(ValueError)
+
+    with error_context:
+        input_ids = _make_padded_input(context, [],
+                                       2048,
+                                       padding_id,
+                                       padding_side=padding_side)
+
+        if padding_side == 'left':
+            assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id
+            assert input_ids[48:].tolist() == context
+        elif padding_side == 'right':
+            assert input_ids[-1] == tiny_gpt2_tokenizer.eos_token_id
+            assert input_ids[:-48].tolist() == context
+
+
+def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer):
+    continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids']
+    context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, continuation, max_seq_len)
+    continuation_spans = _get_continuation_span(trimmed_context, continuation)
+    padded_input = _make_padded_input(trimmed_context,
+                                      continuation,
+                                      max_seq_len,
+                                      tiny_gpt2_tokenizer.pad_token_id,
+                                      padding_side='right')
+    assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047
+    assert len(padded_input) == 2048
+    assert tiny_gpt2_tokenizer.pad_token_id not in padded_input
+
+
+def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer):
+    continuation = tiny_gpt2_tokenizer(' dog' * 200)['input_ids']
+    context = tiny_gpt2_tokenizer(' cat' * 200)['input_ids']
+    max_seq_len = 2048
+    trimmed_context = _trim_context(context, continuation, max_seq_len)
+    continuation_spans = _get_continuation_span(trimmed_context, continuation)
+    padded_input = _make_padded_input(trimmed_context,
+                                      continuation,
+                                      max_seq_len,
+                                      tiny_gpt2_tokenizer.pad_token_id,
+                                      padding_side='right')
+    assert continuation_spans[0] == 200 and continuation_spans[-1] == 399
+    assert len(padded_input) == 2048
+    assert padded_input[-1] == tiny_gpt2_tokenizer.pad_token_id
+
+
+def test_fewshot_sample_idxs():
+    rng = random.Random(1234)
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+                                            num_fewshot=4,
+                                            example_idx=4,
+                                            rng=rng)
+    assert fewshot_idxs == {0, 1, 2, 3}
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+                                            num_fewshot=5,
+                                            example_idx=4,
+                                            rng=rng)
+    assert fewshot_idxs == {0, 1, 2, 3}
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+                                            num_fewshot=500,
+                                            example_idx=4,
+                                            rng=rng)
+    assert fewshot_idxs == {0, 1, 2, 3}
+
+    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=10,
+                                            num_fewshot=7,
+                                            example_idx=4,
+                                            rng=rng)
+    assert len(fewshot_idxs) == 7 and 4 not in fewshot_idxs
+
+
+def test_fewshot_sample_idxs_randomness():
+    dataset_size = 10000
+    num_fewshot = 5
+
+    rng_1_seed_1234 = random.Random(1234)
+    rng_2_seed_1234 = random.Random(1234)
+    rng_3_seed_11 = random.Random(11)
+
+    rng_1_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+                                              rng_1_seed_1234)
+    rng_2_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+                                              rng_2_seed_1234)
+    rng_3_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+                                              rng_3_seed_11)
+
+    assert rng_1_sample_1 == rng_2_sample_1
+    assert rng_1_sample_1 != rng_3_sample_1
+
+    rng_1_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+                                              rng_1_seed_1234)
+    rng_2_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+                                              rng_2_seed_1234)
+    rng_3_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+                                              rng_3_seed_11)
+
+    assert rng_1_sample_2 == rng_2_sample_2
+    assert rng_1_sample_2 != rng_3_sample_2
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+    gen_kwargs = {'test_arg1': 1, 'test_arg2': 2}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map,
+        generation_kwargs=gen_kwargs)
+    assert dl.base_batch['generation_kwargs'] == {
+        'test_arg1': 1,
+        'test_arg2': 2
+    }
+
+
+def test_stop_sequences_criteria(tiny_gpt2_tokenizer):
+    pytest.importorskip('transformers')
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
+    seq1 = tiny_gpt2_tokenizer('Dogs are furry')['input_ids']
+    seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+    seq1 = [tiny_gpt2_tokenizer.pad_token_id] * (len(seq2) - len(seq1)) + seq1
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert not eos_criteria(input_ids,
+                            None)  # pyright: ignore[reportGeneralTypeIssues]
+
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
+    seq1 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+    seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert eos_criteria(input_ids,
+                        None)  # pyright: ignore[reportGeneralTypeIssues]
+
+
+def test_stop_sequences_criteria_sentencepiece(tiny_llama_tokenizer):
+    pytest.importorskip('datasets')
+
+    tokenizer = tiny_llama_tokenizer
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tokenizer, 2)
+    seq1 = tokenizer(
+        '\n\nDogs'
+    )['input_ids']  # check to make sure starting with the stop sequence doesnt break it
+    seq2 = tokenizer('Dogs are furry\n\n')['input_ids']
+    seq1 = [tokenizer.eos_token_id] * (len(seq2) - len(seq1)) + seq1
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert not eos_criteria(input_ids,
+                            None)  # pyright: ignore[reportGeneralTypeIssues]
+
+    eos_criteria = MultiTokenEOSCriteria('\n\n', tokenizer, 2)
+    seq1 = tokenizer('Dogs are furry\n\n')['input_ids']
+    seq2 = tokenizer('Dogs are furry\n\n')['input_ids']
+    input_ids = torch.LongTensor([seq1, seq2])
+    assert eos_criteria(input_ids,
+                        None)  # pyright: ignore[reportGeneralTypeIssues]
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    assert not 'generation_kwargs' in dl.base_batch
+
+
+def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generation_kwargs=None)
+    assert len(dl.base_batch['generation_kwargs']) == 3
+
+
+def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path):
+    pytest.importorskip('datasets')
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/triviaqa_small.jsonl'
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
+
+    tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
+    gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
+    dl = InContextLearningQATaskDataset(
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        max_seq_len=1024,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        fewshot_random_seed=1234,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=': ',
+        destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+        generation_kwargs={'temperature': 0.9})
+    assert 'generation_kwargs' in dl.base_batch
+    assert dl.base_batch['generation_kwargs']['temperature'] == 0.9
+    assert len(dl.base_batch['generation_kwargs']) == 4
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_construct_context(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell: ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    constructed_context = dl.construct_context({
+        'context': 'quas quas exort',
+        'answer': 'ice wall'
+    })
+    assert constructed_context == 'Orbs: quas quas exort\nSpell: '
+    constructed_context = dl.construct_context(
+        {
+            'context': 'quas quas exort',
+            'answer': 'ice wall'
+        }, add_answer=True)
+    assert constructed_context == 'Orbs: quas quas exort\nSpell: ice wall'
+    constructed_context = dl.construct_context(
+        {
+            'context': 'quas quas exort',
+            'answer': 'ice wall'
+        },
+        preceding_text='The harsh White Waste beckons!',
+        add_answer=True)
+    assert constructed_context == '\nOrbs: quas quas exort\nSpell: ice wall'
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    answer = dl.get_answer_from_example({
+        'context': 'wex exort exort',
+        'answer': 'alacrity'
+    })
+    assert answer == ' alacrity'
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_fix_eos_on_preamble(tmp_path):
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m',
+        use_fast=False)  # type: ignore reportUnboundVariable
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
+    preamble = 'blah blah blah.'
+    tokenized_preamble = tokenizer.encode(preamble)
+    tokenized_preamble += [tokenizer.eos_token_id]
+    fixed_preamble = dl._fix_eos_on_preamble(tokenized_preamble)
+    assert tokenized_preamble[:-1] == fixed_preamble
+    assert fixed_preamble[-1] != tokenizer.eos_token_id
+
+
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell: ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map,
+        tokenize_labels=True)
+    tokenized_example = dl.tokenize_example('What spell does this invoke? ',
+                                            'exort exort wex\nSpell: ',
+                                            {'answer': ' Meatball'})
+    tokenized_input = [
+        2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198,
+        31221, 25, 19145, 1894
+    ]
+    assert tokenized_example['context'][:len(tokenized_input)].tolist(
+    ) == tokenized_input
+    assert tokenized_example['context'][-1] == tokenizer.eos_token_id
+    assert type(tokenized_example['answer'][0]) == int
+    assert len(tokenized_example['context']) == seqlen
+    assert 'continuation_indices' in tokenized_example
+
 
+@pytest.mark.filterwarnings(
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer,
+                                                  tmp_path):
+    tokenizer = tiny_gpt2_tokenizer
+    seqlen = 2048
+    num_fewshot = 0
+    prompt_string = ''
+    hf_loading_vars = {
+        'split': 'test',
+        'name': 'invoker',
+    }
+    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
+
+    dl = InContextLearningDataset(
+        dataset_uri='hf://mosaicml/test_dataset',
+        tokenizer=tokenizer,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        fewshot_random_seed=1,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell: ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map,
+        tokenize_labels=False)
+    tokenized_example = dl.tokenize_example('What spell does this invoke? ',
+                                            'exort exort wex\nSpell: ',
+                                            {'answer': ' Meatball'})
+    tokenized_input = [
+        2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198,
+        31221, 25
+    ]
+    assert tokenized_example['context'][:len(tokenized_input)].tolist(
+    ) == tokenized_input
+    assert tokenized_example['context'][-1] == tokenizer.eos_token_id
+    assert len(tokenized_example['context']) == seqlen
+    assert type(tokenized_example['answer']) == str
 
 
 def test_qa_set_cot_no_cot(tmp_path):
@@ -36,7 +584,8 @@ def test_qa_set_cot_no_cot(tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
@@ -60,7 +609,8 @@ def test_qa_set_cot_has_cot(tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/gsm8k_small.jsonl'
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
@@ -189,7 +739,9 @@ def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
             'chain_of_thought': "Let's think step by step. "
         })
     assert 'aliases' in tokenized_example
-    assert tokenized_example['aliases'] == ['this is the right answer', 'this is the best answer']
+    assert tokenized_example['aliases'] == [
+        'this is the right answer', 'this is the best answer'
+    ]
 
 
 def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
@@ -217,7 +769,9 @@ def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
         generations_per_sample=10,
     )
 
-    assert all(len(data['prompt']) == 148 for data in dl.dataset)  # pyright: ignore [reportGeneralTypeIssues]
+    assert all(
+        len(data['prompt']) == 148
+        for data in dl.dataset)  # pyright: ignore [reportGeneralTypeIssues]
 
 
 def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path):
@@ -271,15 +825,23 @@ def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
     example = {
-        'context': "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ",
+        'context':
+            "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ",
         'choices': ['A', 'B', 'C', 'D'],
-        'gold': 2
+        'gold':
+            2
     }
-    tokenized_example = dl.tokenize_example(prompt_and_fewshot='Answer the following: ',
-                                            ctxt=example['context'],
-                                            example=example)
-    unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']]
-    untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries]
+    tokenized_example = dl.tokenize_example(
+        prompt_and_fewshot='Answer the following: ',
+        ctxt=example['context'],
+        example=example)
+    unpadded_queries = [
+        context[context != tokenizer.eos_token_id]
+        for context in tokenized_example['query']
+    ]
+    untokenized_inputs = [
+        tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries
+    ]
     correct_output = [
         "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: A",
         "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: B",
@@ -308,7 +870,11 @@ def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path):
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
-    example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+    example = {
+        'context_options': ['cont one', 'cont two'],
+        'gold': 0,
+        'continuation': 'this is a continuation'
+    }
     constructed_context = dl.construct_context(example)
     assert constructed_context == 'cont one ### this is a continuation'
     constructed_context = dl.construct_context(example, preceding_text='text')
@@ -335,10 +901,15 @@ def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path):
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
-    example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'}
+    example = {
+        'context_options': ['cont one', 'cont two'],
+        'gold': 0,
+        'continuation': 'this is a continuation'
+    }
     constructed_contexts = dl._construct_multiple_contexts(example)
     assert constructed_contexts == ['cont one', 'cont two']
-    constructed_contexts = dl._construct_multiple_contexts(example, preceding_text='some text')
+    constructed_contexts = dl._construct_multiple_contexts(
+        example, preceding_text='some text')
     assert constructed_contexts == ['\ncont one ###', '\ncont two ###']
 
 
@@ -362,20 +933,34 @@ def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
-    example = {'context_options': ['context one', 'context two'], 'gold': 0, 'continuation': 'this is a continuation'}
-    tokenized_example = dl.tokenize_example(prompt_and_fewshot='prompt ',
-                                            context_options=example['context_options'],
-                                            example=example)
-    assert all(tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer'])
-    unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']]
-    untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs]
+    example = {
+        'context_options': ['context one', 'context two'],
+        'gold': 0,
+        'continuation': 'this is a continuation'
+    }
+    tokenized_example = dl.tokenize_example(
+        prompt_and_fewshot='prompt ',
+        context_options=example['context_options'],
+        example=example)
+    assert all(
+        tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation'
+        for cont in tokenized_example['answer'])
+    unpadded_inputs = [
+        context[context != tokenizer.eos_token_id]
+        for context in tokenized_example['context_options']
+    ]
+    untokenized_inputs = [
+        tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs
+    ]
     assert untokenized_inputs == [
-        'prompt context one this is a continuation', 'prompt context two this is a continuation'
+        'prompt context one this is a continuation',
+        'prompt context two this is a continuation'
     ]
 
 
 @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
-def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer,
+                                          tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -384,18 +969,20 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 8
     seqlen = 64
-    dls = get_icl_task_dataloader('multiple_choice',
-                                  dataset_uri=dataset_uri,
-                                  tokenizer=tokenizer,
-                                  batch_size=batch_size,
-                                  max_seq_len=seqlen,
-                                  pad_tok_id=tokenizer.eos_token_id,
-                                  num_fewshot=2,
-                                  prompt_string='The following are multiple choice questions (with answers).\n',
-                                  example_delimiter='\n',
-                                  continuation_delimiter='Answer: ',
-                                  destination_path=str(tmp_path / 'icl.jsonl'),
-                                  has_categories=True)
+    dls = get_icl_task_dataloader(
+        'multiple_choice',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=2,
+        prompt_string=
+        'The following are multiple choice questions (with answers).\n',
+        example_delimiter='\n',
+        continuation_delimiter='Answer: ',
+        destination_path=str(tmp_path / 'icl.jsonl'),
+        has_categories=True)
     assert isinstance(dls, dict)
 
     assert 'computer_security' in dls
@@ -408,7 +995,8 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
@@ -419,7 +1007,8 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_
 @pytest.mark.parametrize('dataset_uri', [
     'pubmed_sm.jsonl',
 ])
-def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer,
+                                        tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -448,13 +1037,15 @@ def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_pa
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
     assert '  ' not in tokenizer.decode(batch['input_ids'][0][0:max_idx + 1])
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' yes'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' yes'
 
 
 @pytest.mark.parametrize('dataset_uri', [
@@ -489,12 +1080,14 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' glen'
 
 
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
@@ -528,18 +1121,21 @@ def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
 
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' feared violence.'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' feared violence.'
 
 
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
@@ -575,11 +1171,13 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
@@ -592,7 +1190,8 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
+                                          num_fewshot, tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -621,19 +1220,22 @@ def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_f
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' glen'
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
+                                          num_fewshot, tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -665,18 +1267,21 @@ def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_f
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
 
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' Pour it onto a plate'
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
     assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
@@ -715,9 +1320,12 @@ def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
     for i, microbatch in enumerate(microbatches):
         assert dl.get_num_samples_in_batch(microbatch) == 1
         assert 'input_ids' in microbatch
-        assert tuple(microbatch['input_ids'].shape) == (real_microbatch_size, seqlen)
+        assert tuple(microbatch['input_ids'].shape) == (real_microbatch_size,
+                                                        seqlen)
         assert 'attention_mask' in microbatch
-        assert tuple(microbatch['attention_mask'].shape) == (real_microbatch_size, seqlen)
+        assert tuple(
+            microbatch['attention_mask'].shape) == (real_microbatch_size,
+                                                    seqlen)
         assert 'continuation_indices' in microbatch
         assert isinstance(microbatch['continuation_indices'], list) and len(
             microbatch['continuation_indices']) == real_microbatch_size
@@ -725,21 +1333,27 @@ def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
         assert microbatch['mode'] == 'icl_task'
         assert 'gold_indices' in microbatch
         assert isinstance(microbatch['gold_indices'], list) and len(
-            microbatch['gold_indices']) == real_microbatch_size // choices_per_question
+            microbatch['gold_indices']
+        ) == real_microbatch_size // choices_per_question
         assert 'choice_groupings' in microbatch
         assert isinstance(microbatch['choice_groupings'], list) and len(
-            microbatch['choice_groupings']) == real_microbatch_size // choices_per_question
+            microbatch['choice_groupings']
+        ) == real_microbatch_size // choices_per_question
 
         min_idx = min(microbatch['continuation_indices'][0]).item()
         max_idx = max(microbatch['continuation_indices'][0]).item()
         if i == 0:
-            assert tokenizer.decode(microbatch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
-        elif i == 1:
             assert tokenizer.decode(
                 microbatch['input_ids'][0][min_idx:max_idx +
-                                           1]) == ' Weld the metal together to get it to stay firmly in place'
-        assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).startswith('</s>')
-        assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).count('</s>') == 1
+                                           1]) == ' Pour it onto a plate'
+        elif i == 1:
+            assert tokenizer.decode(
+                microbatch['input_ids'][0][min_idx:max_idx + 1]
+            ) == ' Weld the metal together to get it to stay firmly in place'
+        assert tokenizer.decode(
+            microbatch['input_ids'][0][0:min_idx]).startswith('</s>')
+        assert tokenizer.decode(
+            microbatch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
@@ -797,7 +1411,8 @@ def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path):
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0])
 @pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
-def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string):
+def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer,
+                                       tmp_path, num_fewshot, prompt_string):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -819,13 +1434,15 @@ def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_pat
                                     example_delimiter='\n',
                                     question_prelimiter='Q: ',
                                     continuation_delimiter='\nA:',
-                                    destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+                                    destination_path=str(
+                                        tmp_path / f'icl_{num_fewshot}.jsonl'))
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
-def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string):
+def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
+                            num_fewshot, prompt_string):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -847,14 +1464,17 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews
                                  example_delimiter='\n',
                                  question_prelimiter='Q: ',
                                  continuation_delimiter='\nA:',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'))
     assert isinstance(dl, DataSpec)
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
 
-    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['input_ids'].shape) == (batch_size,
+                                               seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen -
+                                                    maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
 
@@ -868,16 +1488,19 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews
     if len(prompt_string) > 0:
         assert all(item.count('I am a prompt') == 1 for item in decoded_batch)
     assert all(
-        set(found) == set(expected)
-        for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']]))
-    assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:')
-    assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:')
+        set(found) == set(expected) for found, expected in zip(
+            batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']]))
+    assert decoded_batch[0].endswith(
+        'Q: Who was the man behind The Chipmunks?\nA:')
+    assert decoded_batch[1].endswith(
+        'Q: What star sign is Jamie Lee Curtis?\nA:')
     assert 'eos_token_id' in batch['generation_kwargs']
 
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
-def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot):
+def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
+                                     num_fewshot):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -888,24 +1511,27 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
     seqlen = 512
     # empirical number from the small test dataset
     maximum_answer_length = 132
-    dl = get_icl_task_dataloader('question_answering',
-                                 dataset_uri=dataset_uri,
-                                 tokenizer=tokenizer,
-                                 batch_size=batch_size,
-                                 max_seq_len=seqlen,
-                                 pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=num_fewshot,
-                                 prompt_string='',
-                                 example_delimiter='\n',
-                                 question_prelimiter='Q: ',
-                                 continuation_delimiter="\nA: Let's think step by step. ",
-                                 cot_delimiter=' #### ',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        question_prelimiter='Q: ',
+        continuation_delimiter="\nA: Let's think step by step. ",
+        cot_delimiter=' #### ',
+        destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'))
     assert isinstance(dl, DataSpec)
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
-    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['input_ids'].shape) == (batch_size,
+                                               seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen -
+                                                    maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == maximum_answer_length
@@ -962,18 +1588,21 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     assert 'gold_indices' in batch
-    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert isinstance(batch['gold_indices'], list) and len(
+        batch['gold_indices']) == batch_size // choices_per_question
     assert 'choice_groupings' in batch
     assert isinstance(batch['choice_groupings'], list) and len(
         batch['choice_groupings']) == batch_size // choices_per_question
 
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' Pour it onto a plate'
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@@ -1048,13 +1677,15 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot,
+                                        prompt_string, generations_per_sample):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 2048
@@ -1070,7 +1701,8 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample)
     assert isinstance(dl, DataSpec)
 
@@ -1081,14 +1713,18 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
     if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
         max_prompt_length = dl.dataloader.dataset.max_prompt_length
     assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size,
+                                                    max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == 129
-    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+    assert any(item[0] != tokenizer.eos_token_id
+               for item in batch['input_ids'])  # longest should be pushed left
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('Code start: \n') == num_fewshot + 1
+        for item in decoded_batch)
 
     if len(prompt_string) > 0:
         assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
@@ -1121,7 +1757,8 @@ def test_code_eval_test_cases(dataset_uri, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 512
@@ -1148,15 +1785,18 @@ def test_code_eval_test_cases(dataset_uri, tmp_path):
     if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
         max_prompt_length = dl.dataloader.dataset.max_prompt_length
     assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size,
+                                                    max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == 129
-    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+    assert any(item[0] != tokenizer.eos_token_id
+               for item in batch['input_ids'])  # longest should be pushed left
 
     mod = types.ModuleType('test_module')
-    for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'],
-                                                              batch['test_outputs'], batch['entry_points']):
+    for prompt, solution, inputs, outputs, entry_point in zip(
+            batch['prompts'], batch['labels'], batch['test_inputs'],
+            batch['test_outputs'], batch['entry_points']):
         exec(prompt + solution, mod.__dict__)
         for test_input, test_output in zip(inputs, outputs):
             result = mod.__dict__[entry_point](*eval(test_input))
@@ -1170,7 +1810,8 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 2
     seqlen = 64
@@ -1196,13 +1837,15 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot,
+                                   prompt_string, generations_per_sample):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 2048
@@ -1218,7 +1861,8 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1233,14 +1877,18 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
     if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
         max_prompt_length = dl.dataloader.dataset.max_prompt_length
     assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size,
+                                                    max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == 122
-    assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids'])  # longest should be pushed left
+    assert any(item[0] != tokenizer.eos_token_id
+               for item in batch['input_ids'])  # longest should be pushed left
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('Code start: \n') == num_fewshot + 1
+        for item in decoded_batch)
 
     if len(prompt_string) > 0:
         assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
@@ -1268,12 +1916,14 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot,
+                          tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 512
@@ -1289,7 +1939,8 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=1,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1323,9 +1974,11 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path):
+def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer,
+                            tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1344,7 +1997,9 @@ def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_p
         destination_path=str(tmp_path / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy'])
+    evaluator = Evaluator(label='lambada',
+                          dataloader=dl,
+                          metric_names=['InContextLearningLMAccuracy'])
 
     transformers = pytest.importorskip('transformers')
     config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M')
@@ -1358,16 +2013,20 @@ def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_p
 
     trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][0][1].item() == 0
+    assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][
+        0][1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model):
+def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+                                tmp_path, tiny_gpt2_model):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1386,7 +2045,10 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
         destination_path=str(tmp_path / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='winograd', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+    evaluator = Evaluator(
+        label='winograd',
+        dataloader=dl,
+        metric_names=['InContextLearningMultipleChoiceAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1397,13 +2059,17 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluator)
-    assert 'metrics/winograd/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/winograd/InContextLearningMultipleChoiceAccuracy'][0][1].item() > 0
+    assert 'metrics/winograd/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/winograd/InContextLearningMultipleChoiceAccuracy'][0][1].item(
+        ) > 0
     num_samples = 0
     with open(dataset_uri) as f:
         for _ in f:
             num_samples += 1
-    assert trainer.state.eval_metrics['winograd']['InContextLearningMultipleChoiceAccuracy'].total == num_samples
+    assert trainer.state.eval_metrics['winograd'][
+        'InContextLearningMultipleChoiceAccuracy'].total == num_samples
 
 
 @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
@@ -1411,10 +2077,12 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_model,
-                                          tiny_gpt2_tokenizer, tmp_path):
+def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot,
+                                          tiny_gpt2_model, tiny_gpt2_tokenizer,
+                                          tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1433,12 +2101,15 @@ def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_mo
                                   prompt_string='',
                                   example_delimiter='\n',
                                   continuation_delimiter=': ',
-                                  destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
+                                  destination_path=str(
+                                      Path(gathered_paths[0]) / 'icl.jsonl'),
                                   has_categories=True)
 
     assert isinstance(dls, dict)
     evaluators = [
-        Evaluator(label='mmlu/' + k, dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+        Evaluator(label='mmlu/' + k,
+                  dataloader=dl,
+                  metric_names=['InContextLearningMultipleChoiceAccuracy'])
         for k, dl in dls.items()
     ]
 
@@ -1451,23 +2122,28 @@ def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_mo
 
     trainer = Trainer(model=model, loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluators)
-    assert 'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy'][0][1].item(
-    ) > 0
-    total = trainer.state.eval_metrics['mmlu/computer_security']['InContextLearningMultipleChoiceAccuracy'].total
+    assert 'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy'][
+            0][1].item() > 0
+    total = trainer.state.eval_metrics['mmlu/computer_security'][
+        'InContextLearningMultipleChoiceAccuracy'].total
     dist.all_reduce(total)  # type: ignore
     assert total.item() == 4  # type: ignore
 
 
-@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
+@pytest.mark.parametrize('dataset_uri',
+                         ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path,
-                            tiny_gpt2_model):
+def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+                            tmp_path, tiny_gpt2_model):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1491,7 +2167,10 @@ def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_p
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='mc', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
+    evaluator = Evaluator(
+        label='mc',
+        dataloader=dl,
+        metric_names=['InContextLearningMultipleChoiceAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1502,27 +2181,33 @@ def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_p
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
     trainer.eval(eval_dataloader=evaluator)
-    assert 'metrics/mc/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/mc/InContextLearningMultipleChoiceAccuracy'][0][1].item() >= 0
+    assert 'metrics/mc/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/mc/InContextLearningMultipleChoiceAccuracy'][0][1].item() >= 0
     num_samples = 0
     with open(dataset_uri) as f:
         for _ in f:
             num_samples += 1
-    total = trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total
+    total = trainer.state.eval_metrics['mc'][
+        'InContextLearningMultipleChoiceAccuracy'].total
     dist.all_reduce(total)  # type: ignore
     assert total.item() == num_samples  # type: ignore
 
 
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
-                                          dataset_uri, tmp_path):
+def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model,
+                                          num_fewshot, dataset_uri, tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_opt_tokenizer
@@ -1544,7 +2229,9 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, nu
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='triviaqa',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
@@ -1555,20 +2242,26 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, nu
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [5])
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
+def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer,
+                                                   tiny_opt_model, num_fewshot,
                                                    dataset_uri, tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_opt_tokenizer
@@ -1591,7 +2284,9 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='gsm8k',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
@@ -1602,19 +2297,24 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][
+        0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
-                            tmp_path):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
+                            tiny_gpt2_model, tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1635,7 +2335,9 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='triviaqa',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1647,19 +2349,25 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [5])
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model,
+def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri,
+                                     tiny_gpt2_tokenizer, tiny_gpt2_model,
                                      tmp_path):
     pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1681,7 +2389,9 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokeniz
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy'])
+    evaluator = Evaluator(label='gsm8k',
+                          dataloader=dl,
+                          metric_names=['InContextLearningQAAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
@@ -1693,19 +2403,25 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokeniz
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0
+    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][
+        0][1].item() == 0
 
 
 def test_code_eval_requires_envvar(monkeypatch):
     monkeypatch.delenv('CODE_EVAL_DEVICE', raising=False)
-    with pytest.raises(ValueError, match='Attempting to use InContextLearningCodeEvalAccuracy but.*'):
+    with pytest.raises(
+            ValueError,
+            match='Attempting to use InContextLearningCodeEvalAccuracy but.*'):
         InContextLearningCodeEvalAccuracy().get_client()
 
 
 def test_code_eval_requires_valid_envvar(monkeypatch):
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'bigchungus')
-    with pytest.raises(ValueError, match='Environment variable `CODE_EVAL_DEVICE` must be on.*'):
+    with pytest.raises(
+            ValueError,
+            match='Environment variable `CODE_EVAL_DEVICE` must be on.*'):
         InContextLearningCodeEvalAccuracy().get_client()
 
 
@@ -1714,12 +2430,16 @@ def test_code_eval_requires_valid_envvar(monkeypatch):
 @pytest.mark.parametrize('generations_per_sample', range(1, 3))
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer, tiny_opt_model, num_fewshot,
-                                 dataset_uri, tmp_path, generations_per_sample):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer,
+                                 tiny_opt_model, num_fewshot, dataset_uri,
+                                 tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_opt_tokenizer
@@ -1757,8 +2477,10 @@ def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer, tiny_opt_model
     torch.use_deterministic_algorithms(False)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     torch.use_deterministic_algorithms(True)
-    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@@ -1766,12 +2488,16 @@ def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer, tiny_opt_model
 @pytest.mark.parametrize('generations_per_sample', range(1, 3))
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_t5_tokenizer,
-                                        tiny_t5_model, tmp_path, generations_per_sample):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri,
+                                        tiny_t5_tokenizer, tiny_t5_model,
+                                        tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_t5_tokenizer
@@ -1793,7 +2519,9 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t
         generations_per_sample=generations_per_sample,
     )
 
-    evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'])
+    evaluator = Evaluator(label='humaneval',
+                          dataloader=dl,
+                          metric_names=['InContextLearningCodeEvalAccuracy'])
     model = HuggingFaceModel(
         model=tiny_t5_model,
         tokenizer=tiny_t5_tokenizer,
@@ -1805,8 +2533,10 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t
     torch.use_deterministic_algorithms(False)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     torch.use_deterministic_algorithms(True)
-    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@@ -1815,12 +2545,16 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t
 @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning')
-def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
-                                   tiny_gpt2_model, tmp_path, generations_per_sample):
+@pytest.mark.filterwarnings(
+    r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
+)
+def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri,
+                                   tiny_gpt2_tokenizer, tiny_gpt2_model,
+                                   tmp_path, generations_per_sample):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
-    in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
@@ -1842,7 +2576,9 @@ def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_g
         generations_per_sample=generations_per_sample,
     )
 
-    evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'])
+    evaluator = Evaluator(label='humaneval',
+                          dataloader=dl,
+                          metric_names=['InContextLearningCodeEvalAccuracy'])
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
         tokenizer=tiny_gpt2_tokenizer,
@@ -1854,8 +2590,10 @@ def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_g
     torch.use_deterministic_algorithms(False)
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     torch.use_deterministic_algorithms(True)
-    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys()
-    assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
+    assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data[
+        'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
@@ -1884,8 +2622,10 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     first_batch = next(dl.dataloader._get_iterator())
     second_batch = next(dl.dataloader._get_iterator())
 
-    first_batch_text = tokenizer.decode(first_batch['input_ids'][0], skip_special_tokens=True)
-    second_batch_text = tokenizer.decode(second_batch['input_ids'][0], skip_special_tokens=True)
+    first_batch_text = tokenizer.decode(first_batch['input_ids'][0],
+                                        skip_special_tokens=True)
+    second_batch_text = tokenizer.decode(second_batch['input_ids'][0],
+                                         skip_special_tokens=True)
 
     first_batch_without_last_word = ' '.join(first_batch_text.split(' ')[:-1])
     second_batch_without_last_word = ' '.join(second_batch_text.split(' ')[:-1])
@@ -1904,29 +2644,37 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     'split': 'test',
     'name': 'juggernaut',
 }])
-@pytest.mark.parametrize('hf_parsing_map', [None, {'context': ['context'], 'continuation': ['continuation']}])
+@pytest.mark.parametrize(
+    'hf_parsing_map',
+    [None, {
+        'context': ['context'],
+        'continuation': ['continuation']
+    }])
 @pytest.mark.filterwarnings(
-    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning')
-def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer,
+                                      tmp_path, num_fewshot, prompt_string,
                                       hf_loading_vars, hf_parsing_map):
     pytest.importorskip('datasets')
 
     tokenizer = tiny_gpt2_tokenizer
     batch_size = 2
     seqlen = 2048
-    dl = get_icl_task_dataloader('language_modeling',
-                                 dataset_uri=dataset_uri,
-                                 tokenizer=tokenizer,
-                                 batch_size=batch_size,
-                                 max_seq_len=seqlen,
-                                 pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=0,
-                                 prompt_string='',
-                                 example_delimiter='\n',
-                                 continuation_delimiter=' ',
-                                 destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
-                                 hf_loading_vars=hf_loading_vars,
-                                 hf_parsing_map=hf_parsing_map)
+    dl = get_icl_task_dataloader(
+        'language_modeling',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=0,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter=' ',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
     assert isinstance(dl, DataSpec)
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
@@ -1936,16 +2684,22 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path
     assert 'attention_mask' in batch
     assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
     assert 'continuation_indices' in batch
-    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert isinstance(batch['continuation_indices'], list) and len(
+        batch['continuation_indices']) == batch_size
     assert 'mode' in batch
     assert batch['mode'] == 'icl_task'
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
-    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' and me.'
 
-    decoded_batch = [tokenizer.decode(row[row != tokenizer.eos_token_id]) for row in batch['input_ids']]
+    decoded_batch = [
+        tokenizer.decode(row[row != tokenizer.eos_token_id])
+        for row in batch['input_ids']
+    ]
     assert decoded_batch[0] == "Looks like it's just you and me."
-    assert decoded_batch[1] == "There's a fine line between bravery and stupidity."
+    assert decoded_batch[
+        1] == "There's a fine line between bravery and stupidity."
 
 
 @pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset'])
@@ -1955,10 +2709,15 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path
     'split': 'test',
     'name': 'invoker',
 }])
-@pytest.mark.parametrize('hf_parsing_map', [{'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}])
+@pytest.mark.parametrize('hf_parsing_map', [{
+    'context': ['quas', 'wex', 'exort'],
+    'answer': ['spell']
+}])
 @pytest.mark.filterwarnings(
-    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning')
-def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string,
+    r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
+)
+def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer,
+                                       tmp_path, num_fewshot, prompt_string,
                                        hf_loading_vars, hf_parsing_map):
     pytest.importorskip('datasets')
 
@@ -1969,38 +2728,46 @@ def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_pat
     # empirical number from the small test dataset
     maximum_answer_length = 4
 
-    dl = get_icl_task_dataloader('question_answering',
-                                 dataset_uri=dataset_uri,
-                                 tokenizer=tokenizer,
-                                 batch_size=batch_size,
-                                 max_seq_len=seqlen,
-                                 pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=num_fewshot,
-                                 prompt_string=prompt_string,
-                                 example_delimiter='\n',
-                                 question_prelimiter='Orbs: ',
-                                 continuation_delimiter='\nSpell:',
-                                 destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
-                                 hf_loading_vars=hf_loading_vars,
-                                 hf_parsing_map=hf_parsing_map)
+    dl = get_icl_task_dataloader(
+        'question_answering',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=seqlen,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string=prompt_string,
+        example_delimiter='\n',
+        question_prelimiter='Orbs: ',
+        continuation_delimiter='\nSpell:',
+        destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'),
+        hf_loading_vars=hf_loading_vars,
+        hf_parsing_map=hf_parsing_map)
     assert isinstance(dl, DataSpec)
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
 
-    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length)
+    assert tuple(batch['input_ids'].shape) == (batch_size,
+                                               seqlen - maximum_answer_length)
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen -
+                                                    maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
     assert batch['generation_length'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch)
-    assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch)
+    assert all(
+        item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch)
 
     if len(prompt_string) > 0:
-        assert all(item.count('What spell does this invoke? ') == 1 for item in decoded_batch)
+        assert all(
+            item.count('What spell does this invoke? ') == 1
+            for item in decoded_batch)
     assert all(
-        set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']]))
+        set(found) == set(expected) for found, expected in zip(
+            batch['labels'], [['defeaning blast'], ['cold snap']]))
     assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:')
     assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:')
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 93c0f91035..84d84933f4 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -1,12 +1,14 @@
-# Copyright 2022 MosaicML Composer authors
+# Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
 
 import torch
 
-from llmfoundry.eval.metrics.nlp import ( InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy,
-                                  InContextLearningQAAccuracy,)
-
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 
 
 def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
@@ -14,10 +16,12 @@ def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
     continuations = [' furry', ' pie', ' long lines', ' snowy']
     pad = tiny_gpt2_tokenizer.pad_token_id
     inputs = [
-        tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids']
+        tiny_gpt2_tokenizer(context)['input_ids'] +
+        tiny_gpt2_tokenizer(continuation)['input_ids']
         for context, continuation in zip(contexts, continuations)
     ]
-    inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs])
+    inputs = torch.tensor(
+        [input + [pad] * (2048 - len(input)) for input in inputs])
 
     cont_idxs = []
     for context, continuation in zip(contexts, continuations):
@@ -25,19 +29,27 @@ def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
         end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids'])
         cont_idxs.append(torch.tensor(list(range(start, end))))
 
-    batch = {'continuation_indices': cont_idxs, 'labels': inputs.roll(-1), 'input_ids': inputs}
-    logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100
+    batch = {
+        'continuation_indices': cont_idxs,
+        'labels': inputs.roll(-1),
+        'input_ids': inputs
+    }
+    logits = torch.nn.functional.one_hot(inputs.roll(-1),
+                                         num_classes=pad + 1).float() * 100
     start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1]
-    logits[1][start:end] = logits[0][start:end].clone()  # make one of the answer's continuations incorrect
+    logits[1][start:end] = logits[0][start:end].clone(
+    )  # make one of the answer's continuations incorrect
     metric = InContextLearningLMAccuracy()
     metric.update(batch, logits, batch['labels'])
 
     assert metric.compute() == 0.75
 
 
-
 def test_in_context_learning_qa_accuracy():
-    outputs = ['Correct but then some more text', 'Incorrect', ' the CORREct with weird casing and spacing']
+    outputs = [
+        'Correct but then some more text', 'Incorrect',
+        ' the CORREct with weird casing and spacing'
+    ]
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']]
     batch = {'cot_delimiter': '', 'labels': labels}
     metric = InContextLearningQAAccuracy()
@@ -49,11 +61,17 @@ def test_in_context_learning_qa_accuracy():
 def test_in_context_learning_qa_cot_accuracy():
     outputs = [
         'chain of thought ### Correct but then some more text\n\nanother chain of thought ### Incorrect answer this time',
-        'Incorrect', 'chain of thought ### the CORREct with weird casing and spacing',
+        'Incorrect',
+        'chain of thought ### the CORREct with weird casing and spacing',
         'incorrect chain of thought delimiter ## Correct but wrong delimiter'
     ]
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct'], ['correct']]
-    batch = {'cot_delimiter': ' ### ', 'labels': labels, 'do_normalization': True, 'stopping_criteria': '\n\n'}
+    batch = {
+        'cot_delimiter': ' ### ',
+        'labels': labels,
+        'do_normalization': True,
+        'stopping_criteria': '\n\n'
+    }
     metric = InContextLearningQAAccuracy()
     metric.update(batch, outputs, labels)
 
@@ -70,9 +88,12 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch):
         '    return n + 1'
     ]  # correct
     labels = []
-    prompts = ['def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n']
+    prompts = [
+        'def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n'
+    ]
     entry_points = ['fib', 'multiply_by_two', 'add_one']
-    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)']]
+    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'],
+                   ['(1,)', '(2,)', '(4,)']]
     test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']]
     languages = ['python', 'python', 'python']
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
@@ -102,18 +123,23 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch):
 
 def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer):
     contexts = [
-        'Q: How do you cook a cake?', 'Q: How do you cook a cake?', 'Q: How old is the earth?',
-        'Q: How old is the earth?'
+        'Q: How do you cook a cake?', 'Q: How do you cook a cake?',
+        'Q: How old is the earth?', 'Q: How old is the earth?'
+    ]
+    continuations = [
+        ' A: turn on the oven', ' A: do a backflip', ' A: 2 minutes',
+        ' A: 4.5 billion years'
     ]
-    continuations = [' A: turn on the oven', ' A: do a backflip', ' A: 2 minutes', ' A: 4.5 billion years']
     gold_indices = [0, 1]
     choice_groupings = [(0, 2), (2, 4)]
     pad = tiny_gpt2_tokenizer.pad_token_id
     inputs = [
-        tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids']
+        tiny_gpt2_tokenizer(context)['input_ids'] +
+        tiny_gpt2_tokenizer(continuation)['input_ids']
         for context, continuation in zip(contexts, continuations)
     ]
-    inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs])
+    inputs = torch.tensor(
+        [input + [pad] * (2048 - len(input)) for input in inputs])
 
     cont_idxs = []
     for context, continuation in zip(contexts, continuations):
@@ -128,7 +154,8 @@ def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer):
         'gold_indices': gold_indices,
         'choice_groupings': choice_groupings
     }
-    logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float()
+    logits = torch.nn.functional.one_hot(inputs.roll(-1),
+                                         num_classes=pad + 1).float()
 
     # for the first two, the correct answer is continuation 0
     # make the answer correct by making continuation 0 more likely for both answers
diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py
index 33cb27ee8a..449fdbf5bc 100644
--- a/tests/fixtures/models.py
+++ b/tests/fixtures/models.py
@@ -1,13 +1,14 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import copy
 from typing import Any, Callable
 
+import pytest
 from omegaconf import DictConfig
 from pytest import fixture
-import pytest
 from transformers import PreTrainedTokenizerBase
-import copy
+
 from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
 from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
@@ -23,6 +24,7 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase):
 def mpt_tokenizer():
     return build_tokenizer('EleutherAI/gpt-neox-20b', {})
 
+
 @fixture
 def build_tiny_mpt(
     mpt_tokenizer: PreTrainedTokenizerBase
@@ -70,7 +72,6 @@ def build(**kwargs: Any) -> ComposerHFCausalLM:
     return build
 
 
-
 def tiny_gpt2_model_helper(config):
     transformers = pytest.importorskip('transformers')
 
@@ -110,7 +111,8 @@ def tiny_gpt2_tokenizer_helper():
 def tiny_llama_tokenizer_helper():
     transformers = pytest.importorskip('transformers')
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b', use_fast=False)
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'huggyllama/llama-7b', use_fast=False)
     return hf_tokenizer
 
 
@@ -124,13 +126,11 @@ def _session_tiny_llama_tokenizer():  # type: ignore
     return tiny_llama_tokenizer_helper()
 
 
-
-
-
 def tiny_opt_tokenizer_helper():
     transformers = pytest.importorskip('transformers')
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m')
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'facebook/opt-125m')
     hf_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     return hf_tokenizer
 
@@ -173,5 +173,3 @@ def tiny_opt_tokenizer(_session_tiny_opt_tokenizer):
 @pytest.fixture
 def tiny_opt_model(_session_tiny_opt_model):
     return copy.deepcopy(_session_tiny_opt_model)
-
-

From ceff0c4c47799448a65a0acd7a2cc0a1cdd56637 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 9 Feb 2024 13:04:47 -0500
Subject: [PATCH 11/59] full migration

---
 llmfoundry/eval/metrics/nlp.py                | 58 +++++++++++--------
 .../models/inference_api_wrapper/interface.py |  5 +-
 mcli/mcli-hf-eval.yaml                        |  8 ++-
 .../callbacks/test_eval_gauntlet_callback.py  |  2 +-
 4 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index 065c90306f..c0d85407e9 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -15,12 +15,12 @@
 
 import numpy as np
 import torch
-from composer.metrics.nlp import InContextLearningMetric
 from composer.utils.eval_client import (EvalClient, LambdaEvalClient,
                                         LocalEvalClient,
                                         MosaicMLLambdaEvalClient)
 from torch import Tensor
 from torch.nn import functional as F
+from torchmetrics import Metric
 
 log = logging.getLogger(__name__)
 
@@ -35,18 +35,25 @@
 ]
 
 
-class InContextLearningMetric(
-        InContextLearningMetric
-):  # TODO: this is a temporary solution until Max deprecates composer's superclass entirely
+class InContextLearningMetric(Metric):
 
-    def update(self, batch: dict, output_logits: torch.Tensor,
-               labels: torch.Tensor):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.needs_batch = True
+
+    def update(self,
+               batch: dict,
+               outputs: Optional[torch.Tensor] = None,
+               labels: Optional[torch.Tensor] = None):
         """Abstract interface for computing an in-context learning metrics.
 
+        The `outputs` argument is deprecated and will be removed in v0.21 while it's functionality will
+        be moved to `outputs`.
+
         Args:
             batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed
                 to compute the metric.
-            output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids`
+            outputs (torch.Tensor): The model outputs evaluated on the batch `input_ids`
             labels (torch.Tensor): The correct outputs.
 
         Raises:
@@ -203,11 +210,11 @@ def __init__(self, dist_sync_on_step: bool = False):
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
-    def update(self, batch: dict, output_logits: torch.Tensor,
-               labels: torch.Tensor):
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            cont_tok_pred = output_logits[batch_idx].index_select(
-                dim=0, index=cont_idx - 1).argmax(dim=-1)
+            cont_tok_pred = outputs[batch_idx].index_select(dim=0,
+                                                            index=cont_idx -
+                                                            1).argmax(dim=-1)
             cont_tok_targ = labels[batch_idx].index_select(dim=0,
                                                            index=cont_idx - 1)
 
@@ -419,13 +426,13 @@ def __init__(self, dist_sync_on_step: bool = False):
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
 
-    def update(self, batch: dict, output_logits: torch.Tensor,
-               labels: torch.Tensor):
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
         perplexities = []
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
             # continuation indices refer to indices in the original input's token space
-            cont_tok_logits = output_logits[batch_idx].index_select(
-                dim=0, index=cont_idx - 1)
+            cont_tok_logits = outputs[batch_idx].index_select(dim=0,
+                                                              index=cont_idx -
+                                                              1)
             # labels have been shifted left by one index, so the cont_idx needs to be shifted as well.
             cont_tok_targ = labels[batch_idx].index_select(dim=0,
                                                            index=cont_idx - 1)
@@ -486,8 +493,7 @@ def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
                        default=torch.zeros(n_buckets),
                        dist_reduce_fx='sum')
 
-    def update(self, batch: dict, output_logits: torch.Tensor,
-               labels: torch.Tensor):
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
         pass
 
     def compute(self):
@@ -523,13 +529,14 @@ class InContextLearningMCExpectedCalibrationError(
     # Make torchmetrics call update only once
     full_state_update = False
 
-    def update(self, batch: Dict[str, Any], output_logits: torch.Tensor,
+    def update(self, batch: Dict[str, Any], outputs: torch.Tensor,
                labels: torch.Tensor):
-        output_logits = torch.softmax(output_logits, dim=2)
+        outputs = torch.softmax(outputs, dim=2)
         probabilites = []
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            cont_tok_logits = output_logits[batch_idx].index_select(
-                dim=0, index=cont_idx - 1)
+            cont_tok_logits = outputs[batch_idx].index_select(dim=0,
+                                                              index=cont_idx -
+                                                              1)
             cont_tok_targ = labels[batch_idx].index_select(dim=0,
                                                            index=cont_idx - 1)
             probability = cont_tok_logits.index_select(
@@ -568,12 +575,13 @@ class InContextLearningLMExpectedCalibrationError(
     # Make torchmetrics call update only once
     full_state_update = False
 
-    def update(self, batch: Dict[str, Any], output_logits: torch.Tensor,
+    def update(self, batch: Dict[str, Any], outputs: torch.Tensor,
                labels: torch.Tensor):
-        output_logits = torch.softmax(output_logits, dim=2)
+        outputs = torch.softmax(outputs, dim=2)
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            cont_tok_logits = output_logits[batch_idx].index_select(
-                dim=0, index=cont_idx - 1)
+            cont_tok_logits = outputs[batch_idx].index_select(dim=0,
+                                                              index=cont_idx -
+                                                              1)
             cont_tok_pred = cont_tok_logits.argmax(dim=-1)
             confidence = cont_tok_logits.max(dim=-1).values.min()
             cont_tok_targ = labels[batch_idx].index_select(dim=0,
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index 19e4dc8e6e..a4b038c650 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -5,15 +5,14 @@
 
 import torch
 from composer.core.types import Batch
-from composer.metrics import InContextLearningMetric
 from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models import ComposerModel
 from torchmetrics import Metric
 from transformers import AutoTokenizer
 
 from llmfoundry.eval.metrics.nlp import (
-    InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy,
-    InContextLearningQAAccuracy)
+    InContextLearningLMAccuracy, InContextLearningMetric,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 
 
 class InferenceAPIEvalWrapper(ComposerModel):
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 8c495a4316..77e506c878 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -8,6 +8,8 @@ integrations:
 
 command: |
   cd llm-foundry/scripts
+  pip uninstall mosaicml -y
+  pip install git+https://github.com/mosaicml/composer.git@refactor_update_metric
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
@@ -28,16 +30,16 @@ parameters:
 
   models:
   -
-    model_name: mosaicml/mpt-7b-instruct
+    model_name: mosaicml/mpt-7b
     # Tokenizer
     tokenizer:
-      name: EleutherAI/gpt-neox-20b
+      name: mosaicml/mpt-7b
       kwargs:
         model_max_length: ${max_seq_len}
 
     model:
       name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+      pretrained_model_name_or_path: mosaicml/mpt-7b
       init_device: mixed
       pretrained: true
       use_auth_token: false
diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py
index 3a1e371ab8..8d9938e3a1 100644
--- a/tests/callbacks/test_eval_gauntlet_callback.py
+++ b/tests/callbacks/test_eval_gauntlet_callback.py
@@ -9,9 +9,9 @@
 import torch
 from composer.core import State
 from composer.loggers import InMemoryLogger, Logger
-from composer.metrics import InContextLearningLMAccuracy
 from transformers import AutoTokenizer
 
+from llmfoundry.eval.metrics.nlp import InContextLearningLMAccuracy
 from llmfoundry.utils.builders import build_icl_data_and_gauntlet
 
 

From 5bb06cc57c5308ad242ecf91d2430af0c1764369 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 12 Feb 2024 13:19:13 -0500
Subject: [PATCH 12/59] precommit

---
 llmfoundry/eval/datasets/__init__.py          |  19 +
 .../in_context_learning_evaluation.py         |  51 +-
 llmfoundry/eval/metrics/__init__.py           |  20 +
 llmfoundry/eval/metrics/nlp.py                | 435 ++++++++++--------
 llmfoundry/models/hf/hf_causal_lm.py          |   7 +-
 .../models/inference_api_wrapper/interface.py |   7 +-
 llmfoundry/models/mpt/modeling_mpt.py         |   7 +-
 llmfoundry/utils/builders.py                  |   3 +-
 mcli/mcli-hf-eval.yaml                        |   3 +-
 .../eval/test_in_context_learning_datasets.py |   9 +-
 tests/eval/test_nlp_metrics.py                |   7 +-
 11 files changed, 294 insertions(+), 274 deletions(-)
 create mode 100644 llmfoundry/eval/datasets/__init__.py
 create mode 100644 llmfoundry/eval/metrics/__init__.py

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
new file mode 100644
index 0000000000..2d22ae8e90
--- /dev/null
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Natively supported datasets."""
+
+from llmfoundry.eval.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset,
+                                                              InContextLearningDataset, InContextLearningLMTaskDataset,
+                                                              InContextLearningMultipleChoiceTaskDataset,
+                                                              InContextLearningQATaskDataset,
+                                                              InContextLearningSchemaTaskDataset)
+
+__all__ = [
+    'InContextLearningDataset',
+    'InContextLearningQATaskDataset',
+    'InContextLearningLMTaskDataset',
+    'InContextLearningCodeEvalDataset',
+    'InContextLearningMultipleChoiceTaskDataset',
+    'InContextLearningSchemaTaskDataset',
+]
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 668dd25145..14d2b29f5d 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -11,7 +11,6 @@
 import json
 import os
 import random
-import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
 
 import torch
@@ -32,6 +31,7 @@
 __all__ = [
     'InContextLearningLMTaskDataset',
     'InContextLearningMultipleChoiceTaskDataset',
+    'InContextLearningSchemaTaskDataset',
     'InContextLearningCodeEvalDataset',
     'InContextLearningQATaskDataset',
     'get_icl_task_dataloader',
@@ -717,13 +717,6 @@ def __init__(self,
                  do_normalization: bool = True,
                  *args,
                  **kwargs):
-        warnings.warn(
-            ('InContextLearningQATaskDataset is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningQATaskDataset.'
-            ),
-            DeprecationWarning,
-        )
         if kwargs['tokenizer'].eos_token_id is None:
             raise ValueError(
                 '`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`'
@@ -879,13 +872,6 @@ class InContextLearningLMTaskDataset(InContextLearningDataset):
     """
 
     def __init__(self, *args, **kwargs):
-        warnings.warn(
-            ('InContextLearningLMTaskDataset is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningLMTaskDataset.'
-            ),
-            DeprecationWarning,
-        )
         super().__init__(answer_key='continuation',
                          static_keys=['mode'],
                          tensor_keys=[
@@ -940,13 +926,6 @@ def __init__(self,
                  list_of_primitives: Optional[List] = None,
                  *args,
                  **kwargs):
-        warnings.warn(
-            ('InContextLearningMultipleChoiceTaskDataset is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningMultipleChoiceTaskDataset.'
-            ),
-            DeprecationWarning,
-        )
         self.choices_key = choices_key
         base_batch = {
             'input_ids': [],
@@ -1168,13 +1147,6 @@ def __init__(self, choices_key='context_options', *args, **kwargs):
         static_keys = ['mode']
         tensor_keys = ['input_ids', 'labels', 'attention_mask']
         list_of_tensors_keys = ['continuation_indices']
-        warnings.warn(
-            ('InContextLearningSchemaTaskDataset is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningSchemaTaskDataset.'
-            ),
-            DeprecationWarning,
-        )
         super().__init__(choices_key=choices_key,
                          context_key=choices_key,
                          static_keys=static_keys,
@@ -1377,13 +1349,6 @@ def __init__(
         *args,
         **kwargs,
     ):
-        warnings.warn(
-            ('InContextLearningCodeEvalDataset is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.datasets.in_context_learning_evaluation.InContextLearningCodeEvalDataset.'
-            ),
-            DeprecationWarning,
-        )
         if generations_per_sample < pass_at_k:
             raise ValueError(
                 f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.'
@@ -1552,13 +1517,6 @@ def build_icl_dataloader(
             this might be different)
         3. set the `split_batch` funciton if necessary
     """
-    warnings.warn(
-        ('build_icl_dataloader is deprecated and will be removed in a future '
-         'release. Its functionality has been reimplemented '
-         'in llmfoundry.eval.datasets.in_context_learning_evaluation.build_icl_dataloader.'
-        ),
-        DeprecationWarning,
-    )
     if icl_task_type == 'multiple_choice':
         dataset = InContextLearningMultipleChoiceTaskDataset(
             dataset_uri=dataset_uri,
@@ -1864,13 +1822,6 @@ def get_icl_task_dataloader(
     Returns:
         DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
     """
-    warnings.warn(
-        ('get_icl_task_dataloader is deprecated and will be removed in a future '
-         'release. Its functionality has been reimplemented '
-         'in llmfoundry.eval.datasets.in_context_learning_evaluation.get_icl_task_dataloader.'
-        ),
-        DeprecationWarning,
-    )
     if hf_loading_vars is None:
         hf_loading_vars = {}
     if hf_parsing_map is None:
diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py
new file mode 100644
index 0000000000..444bdb6aea
--- /dev/null
+++ b/llmfoundry/eval/metrics/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""A collection of common torchmetrics."""
+
+from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+                                  InContextLearningLMExpectedCalibrationError,
+                                  InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
+                                  InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+
+__all__ = [
+    'InContextLearningLMAccuracy',
+    'InContextLearningMultipleChoiceAccuracy',
+    'InContextLearningQAAccuracy',
+    'InContextLearningMCExpectedCalibrationError',
+    'InContextLearningLMExpectedCalibrationError',
+    'InContextLearningMetric',
+    'InContextLearningCodeEvalAccuracy',
+]
+
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index c0d85407e9..aa603db129 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -43,17 +43,18 @@ def __init__(self, *args, **kwargs):
 
     def update(self,
                batch: dict,
-               outputs: Optional[torch.Tensor] = None,
-               labels: Optional[torch.Tensor] = None):
+               output_logits: Optional[torch.Tensor] = None,
+               labels: Optional[torch.Tensor] = None,
+               outputs: Optional[torch.Tensor] = None):
         """Abstract interface for computing an in-context learning metrics.
 
-        The `outputs` argument is deprecated and will be removed in v0.21 while it's functionality will
+        The `output_logits` argument is deprecated and will be removed in v0.21 while it's functionality will
         be moved to `outputs`.
 
         Args:
             batch (dict): Batch must consist minimally of `input_ids` as well as any other structure needed
                 to compute the metric.
-            outputs (torch.Tensor): The model outputs evaluated on the batch `input_ids`
+            output_logits (torch.Tensor): The model outputs evaluated on the batch `input_ids`
             labels (torch.Tensor): The correct outputs.
 
         Raises:
@@ -61,6 +62,30 @@ def update(self,
         """
         raise NotImplementedError
 
+    @staticmethod
+    def rename_args(
+        batch: dict,
+        output_logits: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        outputs: Optional[torch.Tensor] = None
+    ) -> Tuple[dict, torch.Tensor, torch.Tensor]:
+        if outputs is not None and output_logits is not None:
+            raise ValueError('Cannot use both `outputs` and `output_logits`')
+        if output_logits is not None:
+            warnings.warn(
+                ('`output_logits` has been renamed to `outputs` and will be removed in v0.21'
+                ),
+                DeprecationWarning,
+            )
+            outputs = output_logits
+
+        if labels is None:
+            raise ValueError('`labels` cannot be None')
+        if outputs is None:
+            raise ValueError('`outputs` cannot be None')
+
+        return batch, outputs, labels
+
 
 class InContextLearningQAAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) question answering (QA)
@@ -90,12 +115,6 @@ class InContextLearningQAAccuracy(InContextLearningMetric):
 
     def __init__(self, dist_sync_on_step: bool = False):
         # state from multiple processes
-        warnings.warn(
-            ('InContextLearningQAAccuracy is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.metrics.nlp.InContextLearningQAAccuracy.'),
-            DeprecationWarning,
-        )
         super().__init__(dist_sync_on_step=dist_sync_on_step)
         self.add_state('correct',
                        default=torch.tensor(0.),
@@ -129,14 +148,8 @@ def replace_underscore(text: str) -> str:
             remove_articles(handle_punc(lower(
                 replace_underscore(answer))))).strip()
 
-    def update(
-        self,
-        batch: Optional[Dict[str, Any]],
-        outputs: List[str],
-        labels: List[List[str]],
-    ):
-        if batch is None:
-            batch = {}
+    def update(self, outputs: List[str], labels: List[List[str]],
+               batch: Dict[str, Any]):
         cot_delimiter = batch.get('cot_delimiter', '')
         do_normalization = batch.get('do_normalization', True)
         stopping_criteria = batch.get('stopping_criteria', None)
@@ -197,12 +210,6 @@ class InContextLearningLMAccuracy(InContextLearningMetric):
     full_state_update = False
 
     def __init__(self, dist_sync_on_step: bool = False):
-        warnings.warn(
-            ('InContextLearningLMAccuracy is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.metrics.nlp.InContextLearningLMAccuracy.'),
-            DeprecationWarning,
-        )
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
         self.add_state('correct',
@@ -210,7 +217,17 @@ def __init__(self, dist_sync_on_step: bool = False):
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
-    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
+    def update(self,
+               batch: dict,
+               output_logits: Optional[torch.Tensor] = None,
+               labels: Optional[torch.Tensor] = None,
+               outputs: Optional[torch.Tensor] = None):
+        batch, outputs, labels = InContextLearningMetric.rename_args(
+            batch=batch,
+            output_logits=output_logits,
+            labels=labels,
+            outputs=outputs)
+
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
             cont_tok_pred = outputs[batch_idx].index_select(dim=0,
                                                             index=cont_idx -
@@ -227,167 +244,6 @@ def compute(self):
         return self.correct / self.total
 
 
-class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
-
-    ICL code eval tasks consist of some number of example code eval tasks (referred to as the 'context'), followed by a test task where the model must
-    complete the code, where we term the code completion a 'continuation'.
-
-    In each case, the model constructs a given number of continuations (termed pass@K for K continuations), and each continuation is run against a set of test cases. The model is considered
-    correct if at least one of the proposed continuations passes all the test cases.
-
-    Runs on AWS Lambdas by default.
-
-    Adds metric state variables:
-        correct (float): The number of instances where the predictions passed all the test cases.
-        total (float): The number of total instances that were predicted.
-
-    Args:
-        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
-            each forward() before returning the value at the step. Default: ``False``.
-    """
-
-    # Make torchmetrics call update only once
-    full_state_update = False
-
-    def __init__(self, dist_sync_on_step: bool = False):
-        warnings.warn(
-            ('InContextLearningCodeEvalAccuracy is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.metrics.nlp.InContextLearningCodeEvalAccuracy.'
-            ),
-            DeprecationWarning,
-        )
-        # state from multiple processes
-        super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct',
-                       default=torch.tensor(0.),
-                       dist_reduce_fx='sum')
-        self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
-
-        self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
-        if self.eval_device is not None:
-            self.eval_device = self.eval_device.upper()
-
-    def get_client(self) -> EvalClient:
-        """Returns a client for the appropriate remote platform."""
-        client = None
-        if self.eval_device == 'LOCAL':
-            warnings.warn(
-                'Running code eval locally may be insecure. Please set environment variable CODE_EVAL_DEVICE '
-                'to LAMBDA to run on remote. To use Lambdas, spin up your instance that checks code, set the URL as '
-                'CODE_EVAL_URL and the API key as CODE_EVAL_APIKEY.')
-            log.debug('Running code eval locally.')
-            client = LocalEvalClient()
-        elif self.eval_device == 'LAMBDA':
-            client = LambdaEvalClient()
-        elif self.eval_device == 'MOSAICML':
-            client = MosaicMLLambdaEvalClient()
-        elif self.eval_device is None:
-            raise ValueError(
-                'Attempting to use InContextLearningCodeEvalAccuracy but environment '
-                'variable `CODE_EVAL_DEVICE` is not set. Please set it to `CODE_EVAL_DEVICE` '
-                'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
-                'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
-        else:
-            raise ValueError(
-                'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
-                f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
-
-        return client
-
-    def estimator(self, n: int, c: int, k: int) -> float:
-        """Computes the pass@k metric.
-
-        Given the number of generated samples, n, the number of correct samples, c, and the k of interest,
-        this function calculates pass@k as 1 - comb(n - c, k) / comb(n, k) as per the definition of
-        pass@k in the HumanEval paper (https://arxiv.org/abs/2107.03374) and it's associated implementation:
-        https://github.com/openai/human-eval.
-        """
-        if n - c < k:
-            return 1.0
-        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
-
-    def update(self, batch: Dict[str, Any], outputs: List[str],
-               labels: List[str]):
-        """Updates the pass@k accuracy of code generation.
-
-        Given a batch of prompts, test cases, and code generations, evaluates the code generations
-        against the test cases and augments the pass@k accuracy of the batch to the values so far.
-
-        Args:
-            batch (Dict[str, Any]): A batch of data produced by the InContextLearningCodeEvalDataset, with
-            the prompt, test cases, and entry points. This will be a dictionary that must have the following
-            arguments:
-            {
-                'prompts': List[str],
-                'test_inputs': List[List[str]],
-                'test_outputs': List[List[str]],
-                'entry_points': List[str],
-                'languages': List[str],
-                'generation_kwargs': Dict[str, Any]
-            }
-            outputs (List[str]): A list of code generations in the format of HF generate with beam search,
-            which is the a list of strings in groups of beam_size e.g. for beam size 2 and batch size 2, the list
-            will be of the format [prompt 1 gen 1, prompt 1 gen 2, prompt 2 gen 1, prompt 2 gen 2]
-            labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate
-            functionalities. This is not used.
-        """
-        del labels  # never used
-        client = self.get_client()
-
-        pass_at_k = batch['pass_at_k']
-        num_generations = batch['generation_kwargs']['num_return_sequences']
-        processed_outputs = [
-            outputs[i * num_generations:(i + 1) * num_generations]
-            for i in range(len(batch['prompts']))
-        ]
-        payloads = []
-        for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
-                processed_outputs, batch['prompts'], batch['test_inputs'],
-                batch['test_outputs'], batch['entry_points'],
-                batch['languages']):
-            self.total += torch.tensor(1.0)
-            prompt_payload = []
-            for code_gen in sample_outputs:
-                code_gen = re.split(
-                    r'\n[A-Za-z0-9#`]',
-                    code_gen)[0]  # remove everything after function ends
-                final_code = sample_prompt + code_gen  # combine prompt with the code generation
-                generation_payload = []
-                for test_input, test_output in zip(test_inputs, test_outputs):
-                    payload = {
-                        'code': final_code,
-                        'input': test_input,
-                        'output': test_output,
-                        'entry_point': entry_point,
-                        'language': language,
-                    }
-                    generation_payload.append(payload)
-
-                prompt_payload.append(generation_payload)
-            payloads.append(prompt_payload)
-
-        results = client.invoke(payloads)
-        for prompt in results:
-            num_correct = 0
-            for generation in prompt:
-                correct = all(generation)
-                if correct:
-                    num_correct += 1
-
-            pass_at_k_rate = self.estimator(num_generations, num_correct,
-                                            pass_at_k)
-            self.correct += torch.tensor(pass_at_k_rate)
-
-        client.close()  # pyright: ignore [reportOptionalMemberAccess]
-
-    def compute(self):
-        assert isinstance(self.correct, Tensor)
-        assert isinstance(self.total, Tensor)
-        return self.correct / self.total
-
-
 class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) multiple choice (MC)
     tasks.
@@ -412,13 +268,6 @@ class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
     full_state_update = False
 
     def __init__(self, dist_sync_on_step: bool = False):
-        warnings.warn(
-            ('InContextLearningMultipleChoiceAccuracy is deprecated and will be removed in a future '
-             'release. Its functionality has been reimplemented '
-             'in llmfoundry.eval.metrics.nlp.InContextLearningMultipleChoiceAccuracy.'
-            ),
-            DeprecationWarning,
-        )
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
         self.add_state('correct',
@@ -426,7 +275,17 @@ def __init__(self, dist_sync_on_step: bool = False):
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
 
-    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
+    def update(self,
+               batch: dict,
+               output_logits: Optional[torch.Tensor] = None,
+               labels: Optional[torch.Tensor] = None,
+               outputs: Optional[torch.Tensor] = None):
+        batch, outputs, labels = InContextLearningMetric.rename_args(
+            batch=batch,
+            output_logits=output_logits,
+            labels=labels,
+            outputs=outputs)
+
         perplexities = []
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
             # continuation indices refer to indices in the original input's token space
@@ -476,11 +335,6 @@ class InContextLearningExpectedCalibrationError(InContextLearningMetric):
     """
 
     def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
-        warnings.warn(
-            ('InContextLearningExpectedCalibrationError is deprecated and will be removed in a future '
-             'release.'),
-            DeprecationWarning,
-        )
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
         self.n_buckets = n_buckets
@@ -493,7 +347,8 @@ def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
                        default=torch.zeros(n_buckets),
                        dist_reduce_fx='sum')
 
-    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
+    def update(self, batch: dict, output_logits: torch.Tensor,
+               labels: torch.Tensor):
         pass
 
     def compute(self):
@@ -529,8 +384,17 @@ class InContextLearningMCExpectedCalibrationError(
     # Make torchmetrics call update only once
     full_state_update = False
 
-    def update(self, batch: Dict[str, Any], outputs: torch.Tensor,
-               labels: torch.Tensor):
+    def update(self,
+               batch: dict,
+               output_logits: Optional[torch.Tensor] = None,
+               labels: Optional[torch.Tensor] = None,
+               outputs: Optional[torch.Tensor] = None):
+        batch, outputs, labels = InContextLearningMetric.rename_args(
+            batch=batch,
+            output_logits=output_logits,
+            labels=labels,
+            outputs=outputs)
+
         outputs = torch.softmax(outputs, dim=2)
         probabilites = []
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
@@ -575,8 +439,17 @@ class InContextLearningLMExpectedCalibrationError(
     # Make torchmetrics call update only once
     full_state_update = False
 
-    def update(self, batch: Dict[str, Any], outputs: torch.Tensor,
-               labels: torch.Tensor):
+    def update(self,
+               batch: dict,
+               output_logits: Optional[torch.Tensor] = None,
+               labels: Optional[torch.Tensor] = None,
+               outputs: Optional[torch.Tensor] = None):
+        batch, outputs, labels = InContextLearningMetric.rename_args(
+            batch=batch,
+            output_logits=output_logits,
+            labels=labels,
+            outputs=outputs)
+
         outputs = torch.softmax(outputs, dim=2)
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
             cont_tok_logits = outputs[batch_idx].index_select(dim=0,
@@ -597,3 +470,157 @@ def update(self, batch: Dict[str, Any], outputs: torch.Tensor,
 
             self.bucket_totals[
                 bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+
+class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
+    r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
+
+    ICL code eval tasks consist of some number of example code eval tasks (referred to as the 'context'), followed by a test task where the model must
+    complete the code, where we term the code completion a 'continuation'.
+
+    In each case, the model constructs a given number of continuations (termed pass@K for K continuations), and each continuation is run against a set of test cases. The model is considered
+    correct if at least one of the proposed continuations passes all the test cases.
+
+    Runs on AWS Lambdas by default.
+
+    Adds metric state variables:
+        correct (float): The number of instances where the predictions passed all the test cases.
+        total (float): The number of total instances that were predicted.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def __init__(self, dist_sync_on_step: bool = False):
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.add_state('correct',
+                       default=torch.tensor(0.),
+                       dist_reduce_fx='sum')
+        self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+
+        self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
+        if self.eval_device is not None:
+            self.eval_device = self.eval_device.upper()
+
+    def get_client(self) -> EvalClient:
+        """Returns a client for the appropriate remote platform."""
+        client = None
+        if self.eval_device == 'LOCAL':
+            warnings.warn(
+                'Running code eval locally may be insecure. Please set environment variable CODE_EVAL_DEVICE '
+                'to LAMBDA to run on remote. To use Lambdas, spin up your instance that checks code, set the URL as '
+                'CODE_EVAL_URL and the API key as CODE_EVAL_APIKEY.')
+            log.debug('Running code eval locally.')
+            client = LocalEvalClient()
+        elif self.eval_device == 'LAMBDA':
+            client = LambdaEvalClient()
+        elif self.eval_device == 'MOSAICML':
+            client = MosaicMLLambdaEvalClient()
+        elif self.eval_device is None:
+            raise ValueError(
+                'Attempting to use InContextLearningCodeEvalAccuracy but environment '
+                'variable `CODE_EVAL_DEVICE` is not set. Please set it to `CODE_EVAL_DEVICE` '
+                'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
+                'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
+        else:
+            raise ValueError(
+                'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
+                f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
+
+        return client
+
+    def estimator(self, n: int, c: int, k: int) -> float:
+        """Computes the pass@k metric.
+
+        Given the number of generated samples, n, the number of correct samples, c, and the k of interest,
+        this function calculates pass@k as 1 - comb(n - c, k) / comb(n, k) as per the definition of
+        pass@k in the HumanEval paper (https://arxiv.org/abs/2107.03374) and it's associated implementation:
+        https://github.com/openai/human-eval.
+        """
+        if n - c < k:
+            return 1.0
+        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
+
+    def update(self, batch: Dict[str, Any], outputs: List[str],
+               labels: List[str]):
+        """Updates the pass@k accuracy of code generation.
+
+        Given a batch of prompts, test cases, and code generations, evaluates the code generations
+        against the test cases and augments the pass@k accuracy of the batch to the values so far.
+
+        Args:
+            batch (Dict[str, Any]): A batch of data produced by the InContextLearningCodeEvalDataset, with
+            the prompt, test cases, and entry points. This will be a dictionary that must have the following
+            arguments:
+            {
+                'prompts': List[str],
+                'test_inputs': List[List[str]],
+                'test_outputs': List[List[str]],
+                'entry_points': List[str],
+                'languages': List[str],
+                'generation_kwargs': Dict[str, Any]
+            }
+            outputs (List[str]): A list of code generations in the format of HF generate with beam search,
+            which is the a list of strings in groups of beam_size e.g. for beam size 2 and batch size 2, the list
+            will be of the format [prompt 1 gen 1, prompt 1 gen 2, prompt 2 gen 1, prompt 2 gen 2]
+            labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate
+            functionalities. This is not used.
+        """
+        del labels  # never used
+        client = self.get_client()
+
+        pass_at_k = batch['pass_at_k']
+        num_generations = batch['generation_kwargs']['num_return_sequences']
+        processed_outputs = [
+            outputs[i * num_generations:(i + 1) * num_generations]
+            for i in range(len(batch['prompts']))
+        ]
+        payloads = []
+        for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
+                processed_outputs, batch['prompts'], batch['test_inputs'],
+                batch['test_outputs'], batch['entry_points'],
+                batch['languages']):
+            self.total += torch.tensor(1.0)
+            prompt_payload = []
+            for code_gen in sample_outputs:
+                code_gen = re.split(
+                    r'\n[A-Za-z0-9#`]',
+                    code_gen)[0]  # remove everything after function ends
+                final_code = sample_prompt + code_gen  # combine prompt with the code generation
+                generation_payload = []
+                for test_input, test_output in zip(test_inputs, test_outputs):
+                    payload = {
+                        'code': final_code,
+                        'input': test_input,
+                        'output': test_output,
+                        'entry_point': entry_point,
+                        'language': language,
+                    }
+                    generation_payload.append(payload)
+
+                prompt_payload.append(generation_payload)
+            payloads.append(prompt_payload)
+
+        results = client.invoke(payloads)
+        for prompt in results:
+            num_correct = 0
+            for generation in prompt:
+                correct = all(generation)
+                if correct:
+                    num_correct += 1
+
+            pass_at_k_rate = self.estimator(num_generations, num_correct,
+                                            pass_at_k)
+            self.correct += torch.tensor(pass_at_k_rate)
+
+        client.close()  # pyright: ignore [reportOptionalMemberAccess]
+
+    def compute(self):
+        assert isinstance(self.correct, Tensor)
+        assert isinstance(self.total, Tensor)
+        return self.correct / self.total
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 3ff5e15eed..b991d9f572 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -16,9 +16,10 @@
 from transformers import (AutoConfig, AutoModelForCausalLM, PreTrainedModel,
                           PreTrainedTokenizerBase)
 
-from llmfoundry.eval.metrics.nlp import (
-    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningLMAccuracy,
+                                     InContextLearningMultipleChoiceAccuracy,
+                                     InContextLearningQAAccuracy)
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
 from llmfoundry.models.layers.attention import is_flash_v2_installed
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index a4b038c650..90b3560eb8 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -10,9 +10,10 @@
 from torchmetrics import Metric
 from transformers import AutoTokenizer
 
-from llmfoundry.eval.metrics.nlp import (
-    InContextLearningLMAccuracy, InContextLearningMetric,
-    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+from llmfoundry.eval.metrics import (InContextLearningLMAccuracy,
+                                     InContextLearningMetric,
+                                     InContextLearningMultipleChoiceAccuracy,
+                                     InContextLearningQAAccuracy)
 
 
 class InferenceAPIEvalWrapper(ComposerModel):
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 2e3d256cc5..117f12fd68 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -20,9 +20,10 @@
 from composer.models import HuggingFaceModel
 from composer.utils import dist
 
-from llmfoundry.eval.metrics.nlp import (
-    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningLMAccuracy,
+                                     InContextLearningMultipleChoiceAccuracy,
+                                     InContextLearningQAAccuracy)
 from llmfoundry.models.layers.attention import (is_flash_v1_installed,
                                                 is_flash_v2_installed)
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 9b6e9d869a..15fa21e257 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -33,8 +33,7 @@
                                   LayerFreezing, MonolithicCheckpointSaver,
                                   ScheduledGarbageCollector)
 from llmfoundry.data.dataloader import build_dataloader
-from llmfoundry.eval.datasets.in_context_learning_evaluation import \
-    get_icl_task_dataloader
+from llmfoundry.eval.datasets import get_icl_task_dataloader
 from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
                               DecoupledLionW, DecoupledLionW_8bit)
 from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 77e506c878..10d32d6e63 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -8,8 +8,7 @@ integrations:
 
 command: |
   cd llm-foundry/scripts
-  pip uninstall mosaicml -y
-  pip install git+https://github.com/mosaicml/composer.git@refactor_update_metric
+  pip uninstall mosaicml -y; pip install git+https://github.com/mosaicml/composer.git@dev
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index ec7632bedd..4370f298a2 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -17,7 +17,7 @@
 from torch.utils.data import DataLoader
 
 # isort: off
-from llmfoundry.eval.datasets.in_context_learning_evaluation import (
+from llmfoundry.eval.datasets import (
     InContextLearningDataset, InContextLearningCodeEvalDataset,
     InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,
@@ -30,9 +30,10 @@
 from composer.trainer import Trainer
 from composer.utils import dist, reproducibility
 
-from llmfoundry.eval.metrics.nlp import (
-    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningLMAccuracy,
+                                     InContextLearningMultipleChoiceAccuracy,
+                                     InContextLearningQAAccuracy)
 
 
 def test_strip_data():
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 84d84933f4..2b498db87e 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -6,9 +6,10 @@
 
 import torch
 
-from llmfoundry.eval.metrics.nlp import (
-    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningLMAccuracy,
+                                     InContextLearningMultipleChoiceAccuracy,
+                                     InContextLearningQAAccuracy)
 
 
 def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):

From fe838282ba2425a1d59bd3f773ed436c4980df72 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 12 Feb 2024 14:01:25 -0500
Subject: [PATCH 13/59] fix

---
 llmfoundry/eval/datasets/__init__.py          | 13 ++-
 llmfoundry/eval/metrics/__init__.py           | 13 ++-
 llmfoundry/eval/metrics/nlp.py                | 92 ++++---------------
 .../eval/test_in_context_learning_datasets.py |  2 +-
 4 files changed, 35 insertions(+), 85 deletions(-)

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index 2d22ae8e90..bbbf7d8b86 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -1,13 +1,16 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
 """Natively supported datasets."""
 
-from llmfoundry.eval.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset,
-                                                              InContextLearningDataset, InContextLearningLMTaskDataset,
-                                                              InContextLearningMultipleChoiceTaskDataset,
-                                                              InContextLearningQATaskDataset,
-                                                              InContextLearningSchemaTaskDataset)
+from llmfoundry.eval.datasets.in_context_learning_evaluation import (
+    InContextLearningCodeEvalDataset, InContextLearningDataset,
+    InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
+    InContextLearningQATaskDataset, InContextLearningSchemaTaskDataset,
+    get_icl_task_dataloader)
 
 __all__ = [
     'InContextLearningDataset',
diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py
index 444bdb6aea..cd38b6bcd8 100644
--- a/llmfoundry/eval/metrics/__init__.py
+++ b/llmfoundry/eval/metrics/__init__.py
@@ -1,12 +1,16 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
 """A collection of common torchmetrics."""
 
-from llmfoundry.eval.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-                                  InContextLearningLMExpectedCalibrationError,
-                                  InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
-                                  InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+from llmfoundry.eval.metrics.nlp import (
+    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
+    InContextLearningLMExpectedCalibrationError,
+    InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
+    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
 
 __all__ = [
     'InContextLearningLMAccuracy',
@@ -17,4 +21,3 @@
     'InContextLearningMetric',
     'InContextLearningCodeEvalAccuracy',
 ]
-
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index aa603db129..bc507e51ff 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -11,7 +11,7 @@
 import re
 import string
 import warnings
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 
 import numpy as np
 import torch
@@ -41,11 +41,12 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.needs_batch = True
 
-    def update(self,
-               batch: dict,
-               output_logits: Optional[torch.Tensor] = None,
-               labels: Optional[torch.Tensor] = None,
-               outputs: Optional[torch.Tensor] = None):
+    def update(
+        self,
+        batch: dict,
+        outputs: torch.Tensor,
+        labels: torch.Tensor,
+    ):
         """Abstract interface for computing an in-context learning metrics.
 
         The `output_logits` argument is deprecated and will be removed in v0.21 while it's functionality will
@@ -62,30 +63,6 @@ def update(self,
         """
         raise NotImplementedError
 
-    @staticmethod
-    def rename_args(
-        batch: dict,
-        output_logits: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        outputs: Optional[torch.Tensor] = None
-    ) -> Tuple[dict, torch.Tensor, torch.Tensor]:
-        if outputs is not None and output_logits is not None:
-            raise ValueError('Cannot use both `outputs` and `output_logits`')
-        if output_logits is not None:
-            warnings.warn(
-                ('`output_logits` has been renamed to `outputs` and will be removed in v0.21'
-                ),
-                DeprecationWarning,
-            )
-            outputs = output_logits
-
-        if labels is None:
-            raise ValueError('`labels` cannot be None')
-        if outputs is None:
-            raise ValueError('`outputs` cannot be None')
-
-        return batch, outputs, labels
-
 
 class InContextLearningQAAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) question answering (QA)
@@ -148,8 +125,12 @@ def replace_underscore(text: str) -> str:
             remove_articles(handle_punc(lower(
                 replace_underscore(answer))))).strip()
 
-    def update(self, outputs: List[str], labels: List[List[str]],
-               batch: Dict[str, Any]):
+    def update(
+        self,
+        batch: Dict[str, Any],
+        outputs: List[str],
+        labels: List[List[str]],
+    ):
         cot_delimiter = batch.get('cot_delimiter', '')
         do_normalization = batch.get('do_normalization', True)
         stopping_criteria = batch.get('stopping_criteria', None)
@@ -217,16 +198,7 @@ def __init__(self, dist_sync_on_step: bool = False):
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
 
-    def update(self,
-               batch: dict,
-               output_logits: Optional[torch.Tensor] = None,
-               labels: Optional[torch.Tensor] = None,
-               outputs: Optional[torch.Tensor] = None):
-        batch, outputs, labels = InContextLearningMetric.rename_args(
-            batch=batch,
-            output_logits=output_logits,
-            labels=labels,
-            outputs=outputs)
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
             cont_tok_pred = outputs[batch_idx].index_select(dim=0,
@@ -275,16 +247,7 @@ def __init__(self, dist_sync_on_step: bool = False):
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
 
-    def update(self,
-               batch: dict,
-               output_logits: Optional[torch.Tensor] = None,
-               labels: Optional[torch.Tensor] = None,
-               outputs: Optional[torch.Tensor] = None):
-        batch, outputs, labels = InContextLearningMetric.rename_args(
-            batch=batch,
-            output_logits=output_logits,
-            labels=labels,
-            outputs=outputs)
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
         perplexities = []
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
@@ -347,8 +310,7 @@ def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
                        default=torch.zeros(n_buckets),
                        dist_reduce_fx='sum')
 
-    def update(self, batch: dict, output_logits: torch.Tensor,
-               labels: torch.Tensor):
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
         pass
 
     def compute(self):
@@ -384,16 +346,7 @@ class InContextLearningMCExpectedCalibrationError(
     # Make torchmetrics call update only once
     full_state_update = False
 
-    def update(self,
-               batch: dict,
-               output_logits: Optional[torch.Tensor] = None,
-               labels: Optional[torch.Tensor] = None,
-               outputs: Optional[torch.Tensor] = None):
-        batch, outputs, labels = InContextLearningMetric.rename_args(
-            batch=batch,
-            output_logits=output_logits,
-            labels=labels,
-            outputs=outputs)
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
         outputs = torch.softmax(outputs, dim=2)
         probabilites = []
@@ -439,16 +392,7 @@ class InContextLearningLMExpectedCalibrationError(
     # Make torchmetrics call update only once
     full_state_update = False
 
-    def update(self,
-               batch: dict,
-               output_logits: Optional[torch.Tensor] = None,
-               labels: Optional[torch.Tensor] = None,
-               outputs: Optional[torch.Tensor] = None):
-        batch, outputs, labels = InContextLearningMetric.rename_args(
-            batch=batch,
-            output_logits=output_logits,
-            labels=labels,
-            outputs=outputs)
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
         outputs = torch.softmax(outputs, dim=2)
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 4370f298a2..1c1e142c4a 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -17,7 +17,7 @@
 from torch.utils.data import DataLoader
 
 # isort: off
-from llmfoundry.eval.datasets import (
+from llmfoundry.eval.datasets.in_context_learning_evaluation import (
     InContextLearningDataset, InContextLearningCodeEvalDataset,
     InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,

From b54a12b97221c9b2da69300f391806198df5edaa Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 12 Feb 2024 14:43:22 -0500
Subject: [PATCH 14/59] fix pytests

---
 llmfoundry/eval/datasets/__init__.py          |  15 +-
 .../in_context_learning_evaluation.py         | 258 +++--------------
 llmfoundry/eval/datasets/utils.py             | 266 ++++++++++++++++++
 .../eval/test_in_context_learning_datasets.py |  54 ++--
 4 files changed, 342 insertions(+), 251 deletions(-)
 create mode 100644 llmfoundry/eval/datasets/utils.py

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index bbbf7d8b86..0fc7662468 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -11,12 +11,17 @@
     InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
     InContextLearningQATaskDataset, InContextLearningSchemaTaskDataset,
     get_icl_task_dataloader)
+from llmfoundry.eval.datasets.utils import (get_continuation_span,
+                                            get_fewshot_sample_idxs,
+                                            make_padded_input, strip_data,
+                                            tokenizer_needs_prefix_space,
+                                            trim_context)
 
 __all__ = [
-    'InContextLearningDataset',
-    'InContextLearningQATaskDataset',
-    'InContextLearningLMTaskDataset',
-    'InContextLearningCodeEvalDataset',
+    'InContextLearningDataset', 'InContextLearningQATaskDataset',
+    'InContextLearningLMTaskDataset', 'InContextLearningCodeEvalDataset',
     'InContextLearningMultipleChoiceTaskDataset',
-    'InContextLearningSchemaTaskDataset',
+    'InContextLearningSchemaTaskDataset', 'get_icl_task_dataloader',
+    'strip_data', 'tokenizer_needs_prefix_space', 'trim_context',
+    'get_continuation_span', 'get_fewshot_sample_idxs', 'make_padded_input'
 ]
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 14d2b29f5d..e1138cf7c1 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -11,15 +11,21 @@
 import json
 import os
 import random
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
-import torch
 from composer.core import DataSpec
 from composer.core.data_spec import _default_split_batch, _split_list
 from composer.datasets.utils import stop_sequences_criteria
 from composer.utils import MissingConditionalImportError, dist, get_file
 from torch.utils.data import DataLoader, Dataset
 
+from llmfoundry.eval.datasets.utils import (convert_tokens_to_tensors,
+                                            get_continuation_span,
+                                            get_fewshot_sample_idxs,
+                                            make_padded_input, strip_data,
+                                            tokenizer_needs_prefix_space,
+                                            trim_context)
+
 if TYPE_CHECKING:
     import transformers
     from datasets import \
@@ -38,192 +44,6 @@
 ]
 
 
-def strip_data(example: Dict) -> Dict:
-    """Remove white space from the begging and end of string values in a
-    dictionary.
-
-    Args:
-        example: Dictionary to be stripped
-
-    Returns:
-        dict: The same dictionary with .strip() applied to any value in the dict that is a string
-    """
-    return {
-        k: v.strip() if isinstance(v, str) else v for k, v in example.items()
-    }
-
-
-def _tokenizer_needs_prefix_space(
-        tokenizer: transformers.PreTrainedTokenizerBase) -> bool:
-    """Test for whether a prefix space is needed before the continuation.
-    Sentencepiece tokenization should not have a prefix space, but gpt2 style
-    BPE should.
-
-    Args:
-        tokenizer: Tokenizer to test
-
-    Returns:
-        bool: Whether or not the tokenizer needs a prefix space
-    """
-    test_tokens = tokenizer(' a', add_special_tokens=False)['input_ids']
-    assert isinstance(test_tokens, list)
-    return len(test_tokens) == 1
-
-
-def _trim_context(context_enc: List, continuation_enc: List,
-                  max_seq_len: int) -> List:
-    """Trims a list of tokens down to `max_seq_len` if the length of the list
-    plus the continuation is more than `max_seq_len`. It will always trim tokens
-    from the left, i.e. tokens at the beginning of the context will be removed.
-
-    Args:
-        context_enc (list): List of tokens in the context
-        continuation_enc (lsit): List of tokens in the continuation
-        max_seq_len (int): Maximum length the model can ingest
-
-    Returns:
-        list: The encoded context trimmed from the left
-    """
-    if len(continuation_enc) + len(context_enc) > max_seq_len:
-        context_max_subseq_len = max_seq_len - len(continuation_enc)
-
-        if context_max_subseq_len < 0:
-            # can't support continuations which are longer than the max seq len
-            raise Exception(
-                f'Dataset included continuation longer than the max seq len')
-
-        # clip from the end
-        context_enc = context_enc[-(context_max_subseq_len):]
-    return context_enc
-
-
-def _get_continuation_span(context_enc: List,
-                           continuation_enc: List) -> torch.Tensor:
-    """Gets the list of indices of the continuation tokens for language modeling
-    or generation tasks.
-
-    Args:
-        context_enc (list): List of context tokens
-        continuation_enc (list): List of continuation tokens
-
-    Returns:
-        torch.tensor: A tensor containing indices corresponding to continuation tokens
-    """
-    return torch.tensor(
-        range(len(context_enc),
-              len(context_enc) + len(continuation_enc)))
-
-
-def _make_padded_input(context_enc: List,
-                       continuation_enc: List,
-                       max_seq_len: int,
-                       pad_tok_id: int,
-                       padding_side: str = 'right') -> torch.Tensor:
-    """Takes an encoded context and continuation and clips the beginning of the
-    context if they're too long. Adds the padding token to the specified side.
-
-    Args:
-        context_enc (List): The encoded input to the model
-        continuation_enc (List): The encoded desired output for the example
-        max_seq_list (int): Maximum length sequences can be
-        pad_tok_id (int): The token id we pad with
-        padding_side (str): Which side to pad the context on. Can be 'right' or 'left
-
-    Returns:
-        input (torch.tensor): The padded and encoded context
-        continuation_span (torch.tensor): The _inclusive_ range of indices corresponding to the continuation
-    """
-
-    inp = torch.tensor(
-        (context_enc + continuation_enc),
-        dtype=torch.long,
-    )
-    (inp_len,) = inp.shape
-
-    # Sometimes tokenizers that have neither a pad_tok_id or eos_tok_id will pass None in as the padding
-    # token and cause errors
-    if not isinstance(pad_tok_id, int):
-        raise ValueError(
-            f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead'
-        )
-    # pad length from seq to padding_length
-    if padding_side == 'right':
-        inp = torch.cat(
-            [
-                inp,  # [seq]
-                torch.LongTensor((max_seq_len - inp_len) * [pad_tok_id]),
-            ],
-            dim=0,
-        )
-    elif padding_side == 'left':
-        inp = torch.cat(
-            [
-                torch.LongTensor((max_seq_len - inp_len) * [pad_tok_id]),
-                inp,  # [seq]
-            ],
-            dim=0,
-        )
-    else:
-        raise ValueError(
-            f"Unknown padding_side {padding_side}. padding_side must be either 'left' or 'right'"
-        )
-
-    return inp
-
-
-def convert_tokens_to_tensors(batch: Dict,
-                              tokenize_labels: bool) -> Dict[str, Any]:
-    """HF Datasets converts tensors into lists when we store them, and we don't
-    want to use `type='torch'` because some content in the dataset, like
-    generation args or single ints, should not be converted.
-
-    Here, we convert those lists of tokens back into tensors in order to feed them into the model.
-
-    Args:
-        batch (dict): A dictionary of batched inputs
-        tokenize_labels (bool): Whether or not the labels are tokenized (and need to be stacked)
-
-    Returns:
-        dict: The batch with torch tensors in the corresponding keys instead of lists of lists
-    """
-    batch['input_ids'] = torch.stack(list(map(torch.tensor,
-                                              batch['input_ids'])))
-    if tokenize_labels:
-        batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels'])))
-        batch['continuation_indices'] = list(
-            map(torch.tensor, batch['continuation_indices']))
-    return batch
-
-
-def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
-                             example_idx: int, rng: random.Random) -> Set[int]:
-    """
-    Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset,
-    then we will have fewer than num_fewshot examples in context.
-    Args:
-        dataset_size (int): Length of the dataset
-        num_fewshot (int): Number of examples to prepend
-        example_idx (int): Current example's index (excluded from fewshot choices)
-        rng (random.Random): RNG for repeatable sample selection
-
-    Returns:
-        list: Indices of the examples chosen for fewshot selection
-    """
-    num_fewshot = min(dataset_size - 1, num_fewshot)
-    fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot))
-
-    if example_idx in fewshot_idxs:
-        fewshot_idxs.remove(example_idx)
-        if len(fewshot_idxs) >= dataset_size - 1:
-            return fewshot_idxs
-
-        replacement_sample = rng.choice(range(0, dataset_size))
-        while replacement_sample in fewshot_idxs or replacement_sample == example_idx:
-            replacement_sample = rng.choice(range(0, dataset_size))
-        fewshot_idxs.add(replacement_sample)
-    return fewshot_idxs
-
-
 class InContextLearningDataset(Dataset):
     """A base dataset that constructs batches for in-context learning task
     evaluations. The dataset format is expected to be a local jsonl file, a
@@ -321,7 +141,7 @@ def __init__(
             ) from e
 
         self.tokenizer = tokenizer
-        self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer)
+        self.prefix_space = tokenizer_needs_prefix_space(self.tokenizer)
 
         self.max_seq_len = max_seq_len
         self.pad_tok_id = pad_tok_id
@@ -460,7 +280,7 @@ def _generate_few_shot_prompt(
         few_shot_text = preamble
 
         if num_fewshot > 0:
-            fewshot_idxs = _get_fewshot_sample_idxs(
+            fewshot_idxs = get_fewshot_sample_idxs(
                 len(self.dataset),
                 num_fewshot,
                 example_idx,
@@ -570,32 +390,32 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
                 self.get_answer_from_example(example),
                 add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_answer, list)
-            trimmed_context = _trim_context(tokenized_context, tokenized_answer,
-                                            self.padding_size)
+            trimmed_context = trim_context(tokenized_context, tokenized_answer,
+                                           self.padding_size)
             assert isinstance(trimmed_context, list)
-            continuation_indices = _get_continuation_span(
+            continuation_indices = get_continuation_span(
                 trimmed_context, tokenized_answer)
-            padded_context = _make_padded_input(trimmed_context,
-                                                tokenized_answer,
-                                                self.padding_size,
-                                                self.pad_tok_id,
-                                                self.padding_side)
+            padded_context = make_padded_input(trimmed_context,
+                                               tokenized_answer,
+                                               self.padding_size,
+                                               self.pad_tok_id,
+                                               self.padding_side)
 
             tokenized_example[self.context_key] = padded_context
             tokenized_example[self.answer_key] = tokenized_answer
             tokenized_example['continuation_indices'] = continuation_indices
         else:
             assert isinstance(tokenized_context, list)
-            trimmed_context = _trim_context(
+            trimmed_context = trim_context(
                 tokenized_context,
                 [],
                 self.padding_size,
             )
             assert isinstance(trimmed_context, list)
-            padded_context = _make_padded_input(trimmed_context, [],
-                                                self.padding_size,
-                                                self.pad_tok_id,
-                                                self.padding_side)
+            padded_context = make_padded_input(trimmed_context, [],
+                                               self.padding_size,
+                                               self.pad_tok_id,
+                                               self.padding_side)
 
             tokenized_example[self.context_key] = padded_context
             tokenized_example[self.answer_key] = self.get_answer_from_example(
@@ -1011,12 +831,12 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
                 choice, add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_context, list)
             assert isinstance(tokenized_answer, list)
-            trimmed_context = _trim_context(tokenized_context, tokenized_answer,
-                                            self.padding_size)
+            trimmed_context = trim_context(tokenized_context, tokenized_answer,
+                                           self.padding_size)
             assert isinstance(trimmed_context, list)
-            continuation_indices = _get_continuation_span(
+            continuation_indices = get_continuation_span(
                 trimmed_context, tokenized_answer)
-            padded_context = _make_padded_input(
+            padded_context = make_padded_input(
                 trimmed_context,
                 tokenized_answer,
                 self.padding_size,
@@ -1280,16 +1100,16 @@ def tokenize_example(self, prompt_and_fewshot: str,
         for context in encoded_contexts:
             assert isinstance(context, list)
             assert isinstance(tokenized_continuation, list)
-            trimmed_context = _trim_context(context, tokenized_continuation,
-                                            self.padding_size)
+            trimmed_context = trim_context(context, tokenized_continuation,
+                                           self.padding_size)
             assert isinstance(trimmed_context, list)
-            continuation_indices = _get_continuation_span(
+            continuation_indices = get_continuation_span(
                 trimmed_context, tokenized_continuation)
-            padded_context = _make_padded_input(trimmed_context,
-                                                tokenized_continuation,
-                                                self.padding_size,
-                                                self.pad_tok_id,
-                                                self.padding_side)
+            padded_context = make_padded_input(trimmed_context,
+                                               tokenized_continuation,
+                                               self.padding_size,
+                                               self.pad_tok_id,
+                                               self.padding_side)
             tokenized_example[self.context_key].append(padded_context)
             tokenized_example['continuation_indices'].append(
                 continuation_indices)
@@ -1458,10 +1278,10 @@ def _trim_padding(self, example: Dict):
             if token != self.pad_tok_id
         ]
         # Reapply padding only to max_prompt_length
-        full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length)
-        padded_context = _make_padded_input(full_prompt, [],
-                                            self.max_prompt_length,
-                                            self.pad_tok_id, self.padding_side)
+        full_prompt = trim_context(unpadded_prompt, [], self.max_prompt_length)
+        padded_context = make_padded_input(full_prompt, [],
+                                           self.max_prompt_length,
+                                           self.pad_tok_id, self.padding_side)
 
         example[self.context_key] = padded_context
         return example
diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py
new file mode 100644
index 0000000000..55f269ff8e
--- /dev/null
+++ b/llmfoundry/eval/datasets/utils.py
@@ -0,0 +1,266 @@
+# Copyright 2022 MosaicML Composer authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utility and helper functions for datasets."""
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Dict, List, Optional, Set
+import random
+import torch
+__all__ = [
+    'add_vision_dataset_transform',
+    'MultiTokenEOSCriteria',
+]
+
+log = logging.getLogger(__name__)
+
+
+if TYPE_CHECKING:
+    import transformers
+
+def strip_data(example: Dict) -> Dict:
+    """Remove white space from the begging and end of string values in a
+    dictionary.
+
+    Args:
+        example: Dictionary to be stripped
+
+    Returns:
+        dict: The same dictionary with .strip() applied to any value in the dict that is a string
+    """
+    return {
+        k: v.strip() if isinstance(v, str) else v for k, v in example.items()
+    }
+
+
+def tokenizer_needs_prefix_space(
+        tokenizer: transformers.PreTrainedTokenizerBase) -> bool:
+    """Test for whether a prefix space is needed before the continuation.
+    Sentencepiece tokenization should not have a prefix space, but gpt2 style
+    BPE should.
+
+    Args:
+        tokenizer: Tokenizer to test
+
+    Returns:
+        bool: Whether or not the tokenizer needs a prefix space
+    """
+    test_tokens = tokenizer(' a', add_special_tokens=False)['input_ids']
+    assert isinstance(test_tokens, list)
+    return len(test_tokens) == 1
+
+
+def trim_context(context_enc: List, continuation_enc: List,
+                  max_seq_len: int) -> List:
+    """Trims a list of tokens down to `max_seq_len` if the length of the list
+    plus the continuation is more than `max_seq_len`. It will always trim tokens
+    from the left, i.e. tokens at the beginning of the context will be removed.
+
+    Args:
+        context_enc (list): List of tokens in the context
+        continuation_enc (lsit): List of tokens in the continuation
+        max_seq_len (int): Maximum length the model can ingest
+
+    Returns:
+        list: The encoded context trimmed from the left
+    """
+    if len(continuation_enc) + len(context_enc) > max_seq_len:
+        context_max_subseq_len = max_seq_len - len(continuation_enc)
+
+        if context_max_subseq_len < 0:
+            # can't support continuations which are longer than the max seq len
+            raise Exception(
+                f'Dataset included continuation longer than the max seq len')
+
+        # clip from the end
+        context_enc = context_enc[-(context_max_subseq_len):]
+    return context_enc
+
+
+def get_continuation_span(context_enc: List,
+                           continuation_enc: List) -> torch.Tensor:
+    """Gets the list of indices of the continuation tokens for language modeling
+    or generation tasks.
+
+    Args:
+        context_enc (list): List of context tokens
+        continuation_enc (list): List of continuation tokens
+
+    Returns:
+        torch.tensor: A tensor containing indices corresponding to continuation tokens
+    """
+    return torch.tensor(
+        range(len(context_enc),
+              len(context_enc) + len(continuation_enc)))
+
+
+def make_padded_input(context_enc: List,
+                       continuation_enc: List,
+                       max_seq_len: int,
+                       pad_tok_id: int,
+                       padding_side: str = 'right') -> torch.Tensor:
+    """Takes an encoded context and continuation and clips the beginning of the
+    context if they're too long. Adds the padding token to the specified side.
+
+    Args:
+        context_enc (List): The encoded input to the model
+        continuation_enc (List): The encoded desired output for the example
+        max_seq_list (int): Maximum length sequences can be
+        pad_tok_id (int): The token id we pad with
+        padding_side (str): Which side to pad the context on. Can be 'right' or 'left
+
+    Returns:
+        input (torch.tensor): The padded and encoded context
+        continuation_span (torch.tensor): The _inclusive_ range of indices corresponding to the continuation
+    """
+
+    inp = torch.tensor(
+        (context_enc + continuation_enc),
+        dtype=torch.long,
+    )
+    (inp_len,) = inp.shape
+
+    # Sometimes tokenizers that have neither a pad_tok_id or eos_tok_id will pass None in as the padding
+    # token and cause errors
+    if not isinstance(pad_tok_id, int):
+        raise ValueError(
+            f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead'
+        )
+    # pad length from seq to padding_length
+    if padding_side == 'right':
+        inp = torch.cat(
+            [
+                inp,  # [seq]
+                torch.LongTensor((max_seq_len - inp_len) * [pad_tok_id]),
+            ],
+            dim=0,
+        )
+    elif padding_side == 'left':
+        inp = torch.cat(
+            [
+                torch.LongTensor((max_seq_len - inp_len) * [pad_tok_id]),
+                inp,  # [seq]
+            ],
+            dim=0,
+        )
+    else:
+        raise ValueError(
+            f"Unknown padding_side {padding_side}. padding_side must be either 'left' or 'right'"
+        )
+
+    return inp
+
+
+def convert_tokens_to_tensors(batch: Dict,
+                              tokenize_labels: bool) -> Dict[str, Any]:
+    """HF Datasets converts tensors into lists when we store them, and we don't
+    want to use `type='torch'` because some content in the dataset, like
+    generation args or single ints, should not be converted.
+
+    Here, we convert those lists of tokens back into tensors in order to feed them into the model.
+
+    Args:
+        batch (dict): A dictionary of batched inputs
+        tokenize_labels (bool): Whether or not the labels are tokenized (and need to be stacked)
+
+    Returns:
+        dict: The batch with torch tensors in the corresponding keys instead of lists of lists
+    """
+    batch['input_ids'] = torch.stack(list(map(torch.tensor,
+                                              batch['input_ids'])))
+    if tokenize_labels:
+        batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels'])))
+        batch['continuation_indices'] = list(
+            map(torch.tensor, batch['continuation_indices']))
+    return batch
+
+
+def get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
+                             example_idx: int, rng: random.Random) -> Set[int]:
+    """
+    Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset,
+    then we will have fewer than num_fewshot examples in context.
+    Args:
+        dataset_size (int): Length of the dataset
+        num_fewshot (int): Number of examples to prepend
+        example_idx (int): Current example's index (excluded from fewshot choices)
+        rng (random.Random): RNG for repeatable sample selection
+
+    Returns:
+        list: Indices of the examples chosen for fewshot selection
+    """
+    num_fewshot = min(dataset_size - 1, num_fewshot)
+    fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot))
+
+    if example_idx in fewshot_idxs:
+        fewshot_idxs.remove(example_idx)
+        if len(fewshot_idxs) >= dataset_size - 1:
+            return fewshot_idxs
+
+        replacement_sample = rng.choice(range(0, dataset_size))
+        while replacement_sample in fewshot_idxs or replacement_sample == example_idx:
+            replacement_sample = rng.choice(range(0, dataset_size))
+        fewshot_idxs.add(replacement_sample)
+    return fewshot_idxs
+
+
+
+try:
+    import transformers
+
+    class MultiTokenEOSCriteria(transformers.StoppingCriteria):
+        """Criteria to stop on the specified multi-token sequence.
+        Slightly modified from: https://github.com/EleutherAI/lm-evaluation-harness/blob/78545d42f2ca95c6fe0ed220d456eeb94f4485e9/lm_eval/utils.py#L614-L649
+        """
+
+        def __init__(
+            self,
+            stop_sequence: str,
+            tokenizer: transformers.PreTrainedTokenizerBase,
+            batch_size: int,
+        ) -> None:
+            self.done_tracker = [False] * batch_size
+            self.stop_sequence = stop_sequence
+            self.stop_sequence_ids = tokenizer.encode(stop_sequence, add_special_tokens=False)
+
+            # sentence piece tokenizers add a superflous underline token before string-initial \n
+            # that throws off our calculation of the stop sequence length
+            # so we remove any token ids that produce empty strings
+            self.stop_sequence_ids = [id for id in self.stop_sequence_ids if tokenizer.decode(id) != '']
+
+            # we look back for 1 more token than it takes to encode our stop sequence
+            # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
+            # and we don't want to mistakenly not stop a generation because our
+            # (string) stop sequence was output in a different tokenization
+
+            self.stop_sequence_id_len = len(self.stop_sequence_ids) + 1
+            self.tokenizer = tokenizer
+
+        def __call__(self, input_ids: torch.LongTensor, scores: Optional[torch.FloatTensor] = None, **kwargs) -> bool:
+            # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
+            lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:]
+            lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
+            for i, done in enumerate(self.done_tracker):
+                if i >= len(lookback_tokens_batch):
+                    # The last batch of a dataset may be smaller than `batch_size`
+                    # Automatically set those indices in the done_tracker to True
+                    # since those indices don't show up in the current batch
+                    self.done_tracker[i] = True
+                    break
+                elif not done:
+                    self.done_tracker[i] = self.stop_sequence in lookback_tokens_batch[i]
+            return False not in self.done_tracker
+
+    def stop_sequences_criteria(
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        stop_sequences: List[str],
+        batch_size: int,
+    ) -> transformers.StoppingCriteriaList:
+        return transformers.StoppingCriteriaList([
+            *[MultiTokenEOSCriteria(sequence, tokenizer, batch_size) for sequence in stop_sequences],
+        ])
+
+except ImportError as e:
+    stop_sequences_criteria = None  # pyright: ignore [reportGeneralTypeIssues]
+    MultiTokenEOSCriteria = None  # pyright: ignore [reportGeneralTypeIssues]
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 1c1e142c4a..adacda5de1 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -17,12 +17,12 @@
 from torch.utils.data import DataLoader
 
 # isort: off
-from llmfoundry.eval.datasets.in_context_learning_evaluation import (
+from llmfoundry.eval.datasets import (
     InContextLearningDataset, InContextLearningCodeEvalDataset,
     InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,
-    _tokenizer_needs_prefix_space, _trim_context, _get_continuation_span,
-    _get_fewshot_sample_idxs, _make_padded_input)
+    tokenizer_needs_prefix_space, trim_context, get_continuation_span,
+    get_fewshot_sample_idxs, make_padded_input)
 # isort: on
 from composer.datasets.utils import MultiTokenEOSCriteria
 from composer.loggers import InMemoryLogger
@@ -53,7 +53,7 @@ def test_strip_data():
     reason="Currently don't have a tokenizer that satisfies this test")
 def test_tokenizer_needs_prefix_space_when_space_not_needed(
         tiny_gpt2_tokenizer):
-    assert not _tokenizer_needs_prefix_space(tiny_gpt2_tokenizer)
+    assert not tokenizer_needs_prefix_space(tiny_gpt2_tokenizer)
 
 
 def test_tokenizer_needs_prefix_space_when_space_needed():
@@ -61,14 +61,14 @@ def test_tokenizer_needs_prefix_space_when_space_needed():
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m',
         use_fast=False)  # type: ignore reportUnboundVariable
-    assert _tokenizer_needs_prefix_space(tokenizer)
+    assert tokenizer_needs_prefix_space(tokenizer)
 
 
 def test_trim_context():
     context = [0] * 99 + [1] * 2037
     continuation = [2] * 10
     max_seq_len = 2048
-    trimmed_context = _trim_context(context,
+    trimmed_context = trim_context(context,
                                     continuation,
                                     max_seq_len=max_seq_len)
     assert len(trimmed_context) == 2038
@@ -79,11 +79,11 @@ def test_trim_context():
 def test_trim_context_no_continuation():
     context = [0] * 2048
     max_seq_len = 2048
-    trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+    trimmed_context = trim_context(context, [], max_seq_len=max_seq_len)
     assert len(trimmed_context) == 2048
     context = [0] * 3000 + [1]
     max_seq_len = 2048
-    trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len)
+    trimmed_context = trim_context(context, [], max_seq_len=max_seq_len)
     assert len(trimmed_context) == 2048
     assert trimmed_context[-1] == 1
 
@@ -91,10 +91,10 @@ def test_trim_context_no_continuation():
 def test_get_continuation_span():
     context = [0] * 200
     continuation = [1] * 3
-    cont_span = _get_continuation_span(context, continuation)
+    cont_span = get_continuation_span(context, continuation)
     assert torch.all(torch.eq(cont_span, torch.tensor([200, 201, 202])))
     continuation = [1]
-    cont_span = _get_continuation_span(context, continuation)
+    cont_span = get_continuation_span(context, continuation)
     assert torch.all(torch.eq(cont_span, torch.tensor([200])))
 
 
@@ -108,7 +108,7 @@ def test_make_padding(tiny_gpt2_tokenizer, padding_side):
     } else pytest.raises(ValueError)
 
     with error_context:
-        input_ids = _make_padded_input(context, [],
+        input_ids = make_padded_input(context, [],
                                        2048,
                                        padding_id,
                                        padding_side=padding_side)
@@ -125,9 +125,9 @@ def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer):
     continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids']
     context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
     max_seq_len = 2048
-    trimmed_context = _trim_context(context, continuation, max_seq_len)
-    continuation_spans = _get_continuation_span(trimmed_context, continuation)
-    padded_input = _make_padded_input(trimmed_context,
+    trimmed_context = trim_context(context, continuation, max_seq_len)
+    continuation_spans = get_continuation_span(trimmed_context, continuation)
+    padded_input = make_padded_input(trimmed_context,
                                       continuation,
                                       max_seq_len,
                                       tiny_gpt2_tokenizer.pad_token_id,
@@ -141,9 +141,9 @@ def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer):
     continuation = tiny_gpt2_tokenizer(' dog' * 200)['input_ids']
     context = tiny_gpt2_tokenizer(' cat' * 200)['input_ids']
     max_seq_len = 2048
-    trimmed_context = _trim_context(context, continuation, max_seq_len)
-    continuation_spans = _get_continuation_span(trimmed_context, continuation)
-    padded_input = _make_padded_input(trimmed_context,
+    trimmed_context = trim_context(context, continuation, max_seq_len)
+    continuation_spans = get_continuation_span(trimmed_context, continuation)
+    padded_input = make_padded_input(trimmed_context,
                                       continuation,
                                       max_seq_len,
                                       tiny_gpt2_tokenizer.pad_token_id,
@@ -156,25 +156,25 @@ def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer):
 def test_fewshot_sample_idxs():
     rng = random.Random(1234)
 
-    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+    fewshot_idxs = get_fewshot_sample_idxs(dataset_size=5,
                                             num_fewshot=4,
                                             example_idx=4,
                                             rng=rng)
     assert fewshot_idxs == {0, 1, 2, 3}
 
-    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+    fewshot_idxs = get_fewshot_sample_idxs(dataset_size=5,
                                             num_fewshot=5,
                                             example_idx=4,
                                             rng=rng)
     assert fewshot_idxs == {0, 1, 2, 3}
 
-    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5,
+    fewshot_idxs = get_fewshot_sample_idxs(dataset_size=5,
                                             num_fewshot=500,
                                             example_idx=4,
                                             rng=rng)
     assert fewshot_idxs == {0, 1, 2, 3}
 
-    fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=10,
+    fewshot_idxs = get_fewshot_sample_idxs(dataset_size=10,
                                             num_fewshot=7,
                                             example_idx=4,
                                             rng=rng)
@@ -189,21 +189,21 @@ def test_fewshot_sample_idxs_randomness():
     rng_2_seed_1234 = random.Random(1234)
     rng_3_seed_11 = random.Random(11)
 
-    rng_1_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+    rng_1_sample_1 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
                                               rng_1_seed_1234)
-    rng_2_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+    rng_2_sample_1 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
                                               rng_2_seed_1234)
-    rng_3_sample_1 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
+    rng_3_sample_1 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
                                               rng_3_seed_11)
 
     assert rng_1_sample_1 == rng_2_sample_1
     assert rng_1_sample_1 != rng_3_sample_1
 
-    rng_1_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+    rng_1_sample_2 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
                                               rng_1_seed_1234)
-    rng_2_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+    rng_2_sample_2 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
                                               rng_2_seed_1234)
-    rng_3_sample_2 = _get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
+    rng_3_sample_2 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
                                               rng_3_seed_11)
 
     assert rng_1_sample_2 == rng_2_sample_2

From 71e83914112d9a872ea06166602c651bce38d237 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 16 Feb 2024 11:38:35 -0500
Subject: [PATCH 15/59] refactor QA

---
 llmfoundry/eval/datasets/utils.py             | 49 ++++++++++-----
 .../eval/test_in_context_learning_datasets.py | 62 +++++++++----------
 2 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py
index 55f269ff8e..e881abc9d7 100644
--- a/llmfoundry/eval/datasets/utils.py
+++ b/llmfoundry/eval/datasets/utils.py
@@ -1,3 +1,6 @@
+# Copyright 2024 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
@@ -5,9 +8,11 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Dict, List, Optional, Set
 import random
+from typing import TYPE_CHECKING, Dict, List, Optional, Set
+
 import torch
+
 __all__ = [
     'add_vision_dataset_transform',
     'MultiTokenEOSCriteria',
@@ -15,10 +20,10 @@
 
 log = logging.getLogger(__name__)
 
-
 if TYPE_CHECKING:
     import transformers
 
+
 def strip_data(example: Dict) -> Dict:
     """Remove white space from the begging and end of string values in a
     dictionary.
@@ -52,7 +57,7 @@ def tokenizer_needs_prefix_space(
 
 
 def trim_context(context_enc: List, continuation_enc: List,
-                  max_seq_len: int) -> List:
+                 max_seq_len: int) -> List:
     """Trims a list of tokens down to `max_seq_len` if the length of the list
     plus the continuation is more than `max_seq_len`. It will always trim tokens
     from the left, i.e. tokens at the beginning of the context will be removed.
@@ -79,7 +84,7 @@ def trim_context(context_enc: List, continuation_enc: List,
 
 
 def get_continuation_span(context_enc: List,
-                           continuation_enc: List) -> torch.Tensor:
+                          continuation_enc: List) -> torch.Tensor:
     """Gets the list of indices of the continuation tokens for language modeling
     or generation tasks.
 
@@ -96,10 +101,10 @@ def get_continuation_span(context_enc: List,
 
 
 def make_padded_input(context_enc: List,
-                       continuation_enc: List,
-                       max_seq_len: int,
-                       pad_tok_id: int,
-                       padding_side: str = 'right') -> torch.Tensor:
+                      continuation_enc: List,
+                      max_seq_len: int,
+                      pad_tok_id: int,
+                      padding_side: str = 'right') -> torch.Tensor:
     """Takes an encoded context and continuation and clips the beginning of the
     context if they're too long. Adds the padding token to the specified side.
 
@@ -177,7 +182,7 @@ def convert_tokens_to_tensors(batch: Dict,
 
 
 def get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
-                             example_idx: int, rng: random.Random) -> Set[int]:
+                            example_idx: int, rng: random.Random) -> Set[int]:
     """
     Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset,
     then we will have fewer than num_fewshot examples in context.
@@ -205,12 +210,12 @@ def get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
     return fewshot_idxs
 
 
-
 try:
     import transformers
 
     class MultiTokenEOSCriteria(transformers.StoppingCriteria):
         """Criteria to stop on the specified multi-token sequence.
+
         Slightly modified from: https://github.com/EleutherAI/lm-evaluation-harness/blob/78545d42f2ca95c6fe0ed220d456eeb94f4485e9/lm_eval/utils.py#L614-L649
         """
 
@@ -222,12 +227,16 @@ def __init__(
         ) -> None:
             self.done_tracker = [False] * batch_size
             self.stop_sequence = stop_sequence
-            self.stop_sequence_ids = tokenizer.encode(stop_sequence, add_special_tokens=False)
+            self.stop_sequence_ids = tokenizer.encode(stop_sequence,
+                                                      add_special_tokens=False)
 
             # sentence piece tokenizers add a superflous underline token before string-initial \n
             # that throws off our calculation of the stop sequence length
             # so we remove any token ids that produce empty strings
-            self.stop_sequence_ids = [id for id in self.stop_sequence_ids if tokenizer.decode(id) != '']
+            self.stop_sequence_ids = [
+                id for id in self.stop_sequence_ids
+                if tokenizer.decode(id) != ''
+            ]
 
             # we look back for 1 more token than it takes to encode our stop sequence
             # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
@@ -237,10 +246,14 @@ def __init__(
             self.stop_sequence_id_len = len(self.stop_sequence_ids) + 1
             self.tokenizer = tokenizer
 
-        def __call__(self, input_ids: torch.LongTensor, scores: Optional[torch.FloatTensor] = None, **kwargs) -> bool:
+        def __call__(self,
+                     input_ids: torch.LongTensor,
+                     scores: Optional[torch.FloatTensor] = None,
+                     **kwargs) -> bool:
             # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
             lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:]
-            lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
+            lookback_tokens_batch = self.tokenizer.batch_decode(
+                lookback_ids_batch)
             for i, done in enumerate(self.done_tracker):
                 if i >= len(lookback_tokens_batch):
                     # The last batch of a dataset may be smaller than `batch_size`
@@ -249,7 +262,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: Optional[torch.FloatTens
                     self.done_tracker[i] = True
                     break
                 elif not done:
-                    self.done_tracker[i] = self.stop_sequence in lookback_tokens_batch[i]
+                    self.done_tracker[
+                        i] = self.stop_sequence in lookback_tokens_batch[i]
             return False not in self.done_tracker
 
     def stop_sequences_criteria(
@@ -258,7 +272,10 @@ def stop_sequences_criteria(
         batch_size: int,
     ) -> transformers.StoppingCriteriaList:
         return transformers.StoppingCriteriaList([
-            *[MultiTokenEOSCriteria(sequence, tokenizer, batch_size) for sequence in stop_sequences],
+            *[
+                MultiTokenEOSCriteria(sequence, tokenizer, batch_size)
+                for sequence in stop_sequences
+            ],
         ])
 
 except ImportError as e:
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index adacda5de1..e4fbe17bc6 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -69,8 +69,8 @@ def test_trim_context():
     continuation = [2] * 10
     max_seq_len = 2048
     trimmed_context = trim_context(context,
-                                    continuation,
-                                    max_seq_len=max_seq_len)
+                                   continuation,
+                                   max_seq_len=max_seq_len)
     assert len(trimmed_context) == 2038
     assert trimmed_context[0] == 0
     assert trimmed_context[1] == 1
@@ -109,9 +109,9 @@ def test_make_padding(tiny_gpt2_tokenizer, padding_side):
 
     with error_context:
         input_ids = make_padded_input(context, [],
-                                       2048,
-                                       padding_id,
-                                       padding_side=padding_side)
+                                      2048,
+                                      padding_id,
+                                      padding_side=padding_side)
 
         if padding_side == 'left':
             assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id
@@ -128,10 +128,10 @@ def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer):
     trimmed_context = trim_context(context, continuation, max_seq_len)
     continuation_spans = get_continuation_span(trimmed_context, continuation)
     padded_input = make_padded_input(trimmed_context,
-                                      continuation,
-                                      max_seq_len,
-                                      tiny_gpt2_tokenizer.pad_token_id,
-                                      padding_side='right')
+                                     continuation,
+                                     max_seq_len,
+                                     tiny_gpt2_tokenizer.pad_token_id,
+                                     padding_side='right')
     assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047
     assert len(padded_input) == 2048
     assert tiny_gpt2_tokenizer.pad_token_id not in padded_input
@@ -144,10 +144,10 @@ def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer):
     trimmed_context = trim_context(context, continuation, max_seq_len)
     continuation_spans = get_continuation_span(trimmed_context, continuation)
     padded_input = make_padded_input(trimmed_context,
-                                      continuation,
-                                      max_seq_len,
-                                      tiny_gpt2_tokenizer.pad_token_id,
-                                      padding_side='right')
+                                     continuation,
+                                     max_seq_len,
+                                     tiny_gpt2_tokenizer.pad_token_id,
+                                     padding_side='right')
     assert continuation_spans[0] == 200 and continuation_spans[-1] == 399
     assert len(padded_input) == 2048
     assert padded_input[-1] == tiny_gpt2_tokenizer.pad_token_id
@@ -157,27 +157,27 @@ def test_fewshot_sample_idxs():
     rng = random.Random(1234)
 
     fewshot_idxs = get_fewshot_sample_idxs(dataset_size=5,
-                                            num_fewshot=4,
-                                            example_idx=4,
-                                            rng=rng)
+                                           num_fewshot=4,
+                                           example_idx=4,
+                                           rng=rng)
     assert fewshot_idxs == {0, 1, 2, 3}
 
     fewshot_idxs = get_fewshot_sample_idxs(dataset_size=5,
-                                            num_fewshot=5,
-                                            example_idx=4,
-                                            rng=rng)
+                                           num_fewshot=5,
+                                           example_idx=4,
+                                           rng=rng)
     assert fewshot_idxs == {0, 1, 2, 3}
 
     fewshot_idxs = get_fewshot_sample_idxs(dataset_size=5,
-                                            num_fewshot=500,
-                                            example_idx=4,
-                                            rng=rng)
+                                           num_fewshot=500,
+                                           example_idx=4,
+                                           rng=rng)
     assert fewshot_idxs == {0, 1, 2, 3}
 
     fewshot_idxs = get_fewshot_sample_idxs(dataset_size=10,
-                                            num_fewshot=7,
-                                            example_idx=4,
-                                            rng=rng)
+                                           num_fewshot=7,
+                                           example_idx=4,
+                                           rng=rng)
     assert len(fewshot_idxs) == 7 and 4 not in fewshot_idxs
 
 
@@ -190,21 +190,21 @@ def test_fewshot_sample_idxs_randomness():
     rng_3_seed_11 = random.Random(11)
 
     rng_1_sample_1 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
-                                              rng_1_seed_1234)
+                                             rng_1_seed_1234)
     rng_2_sample_1 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
-                                              rng_2_seed_1234)
+                                             rng_2_seed_1234)
     rng_3_sample_1 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 1,
-                                              rng_3_seed_11)
+                                             rng_3_seed_11)
 
     assert rng_1_sample_1 == rng_2_sample_1
     assert rng_1_sample_1 != rng_3_sample_1
 
     rng_1_sample_2 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
-                                              rng_1_seed_1234)
+                                             rng_1_seed_1234)
     rng_2_sample_2 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
-                                              rng_2_seed_1234)
+                                             rng_2_seed_1234)
     rng_3_sample_2 = get_fewshot_sample_idxs(dataset_size, num_fewshot, 2,
-                                              rng_3_seed_11)
+                                             rng_3_seed_11)
 
     assert rng_1_sample_2 == rng_2_sample_2
     assert rng_1_sample_2 != rng_3_sample_2

From 414153ee9bbf233a544da5511871af347096dc52 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 22 Feb 2024 17:48:57 -0500
Subject: [PATCH 16/59] update

---
 llmfoundry/eval/datasets/__init__.py          |   4 +-
 .../in_context_learning_evaluation.py         | 131 ++++----
 llmfoundry/eval/metrics/__init__.py           |   4 +-
 llmfoundry/eval/metrics/nlp.py                | 127 ++++----
 llmfoundry/models/hf/hf_causal_lm.py          |   4 +-
 .../models/inference_api_wrapper/interface.py |   4 +-
 llmfoundry/models/mpt/modeling_mpt.py         |   4 +-
 llmfoundry/utils/builders.py                  |  15 +-
 scripts/eval/README.md                        |  12 +-
 scripts/eval/yamls/coding_tasks.yaml          |  16 +-
 scripts/eval/yamls/tasks_v0.1.yaml            |  10 +-
 scripts/eval/yamls/tasks_v0.2.yaml            |  10 +-
 scripts/eval/yamls/tasks_v0.3.yaml            |   6 +-
 tests/data/test_tasks.yaml                    |   2 +-
 .../eval/test_in_context_learning_datasets.py | 298 ++++++++----------
 tests/eval/test_nlp_metrics.py                |  43 ++-
 16 files changed, 358 insertions(+), 332 deletions(-)

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index 0fc7662468..d9d6686331 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -9,7 +9,7 @@
 from llmfoundry.eval.datasets.in_context_learning_evaluation import (
     InContextLearningCodeEvalDataset, InContextLearningDataset,
     InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
-    InContextLearningQATaskDataset, InContextLearningSchemaTaskDataset,
+    InContextLearningGenerationWithAnswersTaskDataset, InContextLearningSchemaTaskDataset,
     get_icl_task_dataloader)
 from llmfoundry.eval.datasets.utils import (get_continuation_span,
                                             get_fewshot_sample_idxs,
@@ -18,7 +18,7 @@
                                             trim_context)
 
 __all__ = [
-    'InContextLearningDataset', 'InContextLearningQATaskDataset',
+    'InContextLearningDataset', 'InContextLearningGenerationWithAnswersTaskDataset',
     'InContextLearningLMTaskDataset', 'InContextLearningCodeEvalDataset',
     'InContextLearningMultipleChoiceTaskDataset',
     'InContextLearningSchemaTaskDataset', 'get_icl_task_dataloader',
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index e1138cf7c1..d6b4f4c578 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -39,7 +39,7 @@
     'InContextLearningMultipleChoiceTaskDataset',
     'InContextLearningSchemaTaskDataset',
     'InContextLearningCodeEvalDataset',
-    'InContextLearningQATaskDataset',
+    'InContextLearningGenerationWithAnswersTaskDataset',
     'get_icl_task_dataloader',
 ]
 
@@ -60,7 +60,7 @@ class InContextLearningDataset(Dataset):
     - construct_context(): Takes a single example dictionary and formulates the context as a string for that eval question.
     - get_answer_from_example(): Takes a single example dictionary and formulates the correct, ground truth answer as a string.
     - tokenize_example(): Tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream.
-    - read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset.read_dataset())
+    - read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningGenerationWithAnswersTaskDataset.read_dataset())
 
     Additionally, base_batch and batch_mapping must be defined.
 
@@ -515,10 +515,10 @@ def split_batch(self, batch: Any,
         return batched_list
 
 
-class InContextLearningQATaskDataset(InContextLearningDataset):
-    """A dataset that constructs batches for in-context learning question
-    answering evaluation. QA tasks evaluate a model's ability to answer
-    questions using a consistent format.
+class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset):
+    """A dataset that constructs batches for in-context learning generation tasks with
+    answers. Generation tasks with evaluate a model's ability to generate responses and
+    score them against a set of gold-standard answers.
 
     The input format is expected to be a jsonl file with the following fields:
     - context: The question
@@ -539,7 +539,7 @@ def __init__(self,
                  **kwargs):
         if kwargs['tokenizer'].eos_token_id is None:
             raise ValueError(
-                '`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`'
+                '`InContextLearningGenerationWithAnswersTaskDataset` tokenizer must have non-null `eos_token_id`'
             )
         self.cot_delimiter = cot_delimiter
         self.has_cot = False
@@ -1120,8 +1120,8 @@ def tokenize_example(self, prompt_and_fewshot: str,
 
 
 class InContextLearningCodeEvalDataset(InContextLearningDataset):
-    """A dataset that constructs batches for in-context learning code
-    evaluation.
+    """
+    A dataset that constructs batches for in-context learning code evaluation.
 
     The input format is expected to be a jsonl file with the following fields:
 
@@ -1153,7 +1153,6 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset):
 
         - pad_token_id: ID for padding token, derived automatically
         - num_beams: How many beams to search for generations, set to 1
-        - num_return_sequences: Value passed for 'generations_per_sample', how many generations per prompt
         - do_sample: Determines whether model is sampling or greedily decoding. Always set to True
         - use_cache: Whether or not to use past key values to speed up sampling. Always set to True
 
@@ -1165,11 +1164,13 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset):
     def __init__(
         self,
         generations_per_sample: int,
-        pass_at_k: int = 1,
+        pass_at_k: Union[int, list[int]] = 1,
         *args,
         **kwargs,
     ):
-        if generations_per_sample < pass_at_k:
+        if isinstance(pass_at_k, int):
+            pass_at_k = [pass_at_k]
+        if generations_per_sample < max(pass_at_k):
             raise ValueError(
                 f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.'
             )
@@ -1181,17 +1182,29 @@ def __init__(
             'entry_points': 'entry_point',
             'test_inputs': 'test_inputs',
             'test_outputs': 'test_outputs',
-            'languages': 'language'
+            'languages': 'language',
+            'sample_id': 'sample_id',
         }
         # Linting complains if these are not set in init
         self.max_prompt_length = 0
         self.max_answer_length = 0
         static_keys = [
-            'mode', 'pass_at_k', 'generation_length', 'generation_kwargs'
+            'mode',
+            'pass_at_k',
+            'generation_length',
+            'generation_kwargs',
+            'generations_per_sample',
+            'dataset_size',
         ]
         list_keys = [
-            'prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs',
-            'languages', 'labels'
+            'prompts',
+            'tests',
+            'entry_points',
+            'test_inputs',
+            'test_outputs',
+            'languages',
+            'labels',
+            'sample_id',
         ]
         tensor_keys = ['input_ids', 'attention_mask']
         super().__init__(
@@ -1208,11 +1221,12 @@ def __init__(
             **kwargs,
         )
         self._set_max_prompt_and_answer_lengths()
+        dataset_size = len(self.dataset)
         self.dataset = self.dataset.map(self._trim_padding)
+        self.dataset = self.repeat_dataset(self.dataset, generations_per_sample)
         self.base_batch = {
             'input_ids': [],
-            'mode':
-                'generate',
+            'mode': 'generate',
             'labels': [],
             'prompts': [],
             'tests': [],
@@ -1220,26 +1234,41 @@ def __init__(
             'test_inputs': [],
             'test_outputs': [],
             'languages': [],
-            'pass_at_k':
-                pass_at_k,
-            'generation_length':
-                min(self.max_answer_length,
-                    self.max_seq_len - self.max_prompt_length),
+            'pass_at_k': pass_at_k,
+            'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length),
             'generation_kwargs': {
                 'pad_token_id': self.pad_tok_id,
                 'num_beams': 1,  # single beam
-                'num_return_sequences': generations_per_sample,
                 'do_sample': True,
+                'temperature': 0.2,  # good default for code
                 'use_cache': True,
-                'eos_token_id': self.tokenizer.eos_token_id
-            }
+                'eos_token_id': self.tokenizer.eos_token_id,
+            },
+            'sample_id': [],
+            'pass_at_k': list(pass_at_k),
+            'generations_per_sample': generations_per_sample,
+            'dataset_size': dataset_size,
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])
 
+    def repeat_dataset(self, dataset: HFDataset, repetitions: int) -> HFDataset:
+
+        def _repeat_dataset():
+            for i, sample in enumerate(dataset):
+                for _ in range(repetitions):
+                    assert isinstance(sample, dict)
+                    yield {'sample_id': i, **sample}
+
+        from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+
+        repeated_dataset = HFDataset.from_generator(_repeat_dataset)
+        assert isinstance(repeated_dataset, HFDataset)
+        return repeated_dataset
+
     def _set_max_prompt_and_answer_lengths(self):
-        """Iterates through the dataset and finds the maximum prompt length and
-        sequence lengths.
+        """
+        Iterates through the dataset and finds the maximum prompt length and sequence lengths
 
         Returns:
             None
@@ -1248,15 +1277,10 @@ def _set_max_prompt_and_answer_lengths(self):
         max_answer_length = 0
         for example in self.dataset:
             assert isinstance(example, Dict)
-            unpadded_example = [
-                token for token in example[self.context_key]
-                if token != self.pad_tok_id
-            ]
+            unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id]
             max_prompt_length = max(max_prompt_length, len(unpadded_example))
 
-            tokenized_answer = self.tokenizer(
-                example['canonical_solution'],
-                add_special_tokens=False)['input_ids']
+            tokenized_answer = self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_answer, list)
             len_tokenized_answer = len(tokenized_answer)
             max_answer_length = max(max_answer_length, len_tokenized_answer)
@@ -1265,35 +1289,29 @@ def _set_max_prompt_and_answer_lengths(self):
         self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH
 
     def _trim_padding(self, example: Dict):
-        """Adjusts padding to the maximum prompt length rather than max_seq_len.
-        Needs to be done after the dataset has been processed because we don't
-        know the maximum prompt length until after we've tokenized it.
+        """
+        Adjusts padding to the maximum prompt length rather than max_seq_len.
+        Needs to be done after the dataset has been processed because we don't know the maximum
+        prompt length until after we've tokenized it.
 
         Returns:
             dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key]
         """
         # Remove padding tokens applied during tokenization
-        unpadded_prompt = [
-            token for token in example[self.context_key]
-            if token != self.pad_tok_id
-        ]
+        unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id]
         # Reapply padding only to max_prompt_length
         full_prompt = trim_context(unpadded_prompt, [], self.max_prompt_length)
-        padded_context = make_padded_input(full_prompt, [],
-                                           self.max_prompt_length,
-                                           self.pad_tok_id, self.padding_side)
+        padded_context = make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side)
 
         example[self.context_key] = padded_context
         return example
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
-                         example: Dict) -> Dict[str, Any]:
-        """Adds extra code task details to the example dictionary.
-
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
+        """
+        Adds extra code task details to the example dictionary.
         See InContextLearningDataset for more details
         """
-        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt,
-                                                     example)
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
         tokenized_example['prompt_text'] = example['prompt']
         tokenized_example['task_id'] = example['task_id']
         tokenized_example['canonical_solution'] = example['canonical_solution']
@@ -1305,6 +1323,7 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
         return tokenized_example
 
 
+
 def build_icl_dataloader(
         icl_task_type: str,
         dataset_uri: str,
@@ -1390,8 +1409,8 @@ def build_icl_dataloader(
             generation_kwargs=generation_kwargs,
         )
         effective_batchsize = batch_size
-    elif icl_task_type == 'question_answering':
-        dataset = InContextLearningQATaskDataset(
+    elif icl_task_type == 'generation_task_with_answers':
+        dataset = InContextLearningGenerationWithAnswersTaskDataset(
             dataset_uri=dataset_uri,
             tokenizer=tokenizer,
             max_seq_len=max_seq_len,
@@ -1441,7 +1460,7 @@ def build_icl_dataloader(
             dataset,
         (
             InContextLearningMultipleChoiceTaskDataset,
-            InContextLearningQATaskDataset,
+            InContextLearningGenerationWithAnswersTaskDataset,
             InContextLearningCodeEvalDataset,
         ),
     ):
@@ -1609,7 +1628,7 @@ def get_icl_task_dataloader(
                 )
 
     Args:
-        icl_task_type (str): Name of icl_task type. One of ['multiple_choice', 'schema', 'language_modeling', 'question_answering', 'code_evaluation']
+        icl_task_type (str): Name of icl_task type. One of ['multiple_choice', 'schema', 'language_modeling', 'generation_task_with_answers', 'code_evaluation']
         dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``.
             Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
             A local dataset must consist of rows of JSON data points with task dependant fields.
@@ -1637,7 +1656,7 @@ def get_icl_task_dataloader(
                                                   for more details)
         early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time.
             Used in QA tasks with CoT
-        do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningQAAccuracy. Only used in QA tasks.
+        do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningGenerationWithAnswersTaskDataset. Only used in QA tasks.
 
     Returns:
         DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py
index cd38b6bcd8..76c301fefa 100644
--- a/llmfoundry/eval/metrics/__init__.py
+++ b/llmfoundry/eval/metrics/__init__.py
@@ -10,12 +10,12 @@
     InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
     InContextLearningLMExpectedCalibrationError,
     InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
-    InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy)
+    InContextLearningMultipleChoiceAccuracy, InContextLearningGenerationAccuracy)
 
 __all__ = [
     'InContextLearningLMAccuracy',
     'InContextLearningMultipleChoiceAccuracy',
-    'InContextLearningQAAccuracy',
+    'InContextLearningGenerationAccuracy',
     'InContextLearningMCExpectedCalibrationError',
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMetric',
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index bc507e51ff..3032ba6e71 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -12,7 +12,7 @@
 import string
 import warnings
 from typing import Any, Dict, List
-
+from composer.utils import dist
 import numpy as np
 import torch
 from composer.utils.eval_client import (EvalClient, LambdaEvalClient,
@@ -28,7 +28,7 @@
     'InContextLearningMetric',
     'InContextLearningLMAccuracy',
     'InContextLearningMultipleChoiceAccuracy',
-    'InContextLearningQAAccuracy',
+    'InContextLearningGenerationWithAnswersTaskDataset',
     'InContextLearningCodeEvalAccuracy',
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMCExpectedCalibrationError',
@@ -64,7 +64,7 @@ def update(
         raise NotImplementedError
 
 
-class InContextLearningQAAccuracy(InContextLearningMetric):
+class InContextLearningGenerationAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) question answering (QA)
     tasks.
 
@@ -415,7 +415,6 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
             self.bucket_totals[
                 bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
 
-
 class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
 
@@ -442,10 +441,8 @@ class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     def __init__(self, dist_sync_on_step: bool = False):
         # state from multiple processes
         super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.add_state('correct',
-                       default=torch.tensor(0.),
-                       dist_reduce_fx='sum')
-        self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+
+        self._initialized = False
 
         self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
         if self.eval_device is not None:
@@ -472,9 +469,8 @@ def get_client(self) -> EvalClient:
                 'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
                 'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
         else:
-            raise ValueError(
-                'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
-                f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
+            raise ValueError('Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
+                             f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
 
         return client
 
@@ -490,8 +486,19 @@ def estimator(self, n: int, c: int, k: int) -> float:
             return 1.0
         return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
 
-    def update(self, batch: Dict[str, Any], outputs: List[str],
-               labels: List[str]):
+    def _initialize_state(self, batch: dict[str, Any]):
+        device = batch['input_ids'].device
+        self.dataset_size = batch['dataset_size']
+        self.pass_at_k = batch['pass_at_k']
+        self.num_generations = batch['generations_per_sample']
+
+        # We need to defer the accumulator initialization because it depends on dataset size
+        self.add_state('correct', default=torch.zeros(self.dataset_size, device=device), dist_reduce_fx='sum')
+        self.add_state('total', default=torch.zeros(self.dataset_size, device=device), dist_reduce_fx='sum')
+        dist.barrier()
+        self._initialized = True
+
+    def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
         """Updates the pass@k accuracy of code generation.
 
         Given a batch of prompts, test cases, and code generations, evaluates the code generations
@@ -515,56 +522,62 @@ def update(self, batch: Dict[str, Any], outputs: List[str],
             labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate
             functionalities. This is not used.
         """
+        if not self._initialized:
+            self._initialize_state(batch)
+
         del labels  # never used
         client = self.get_client()
 
-        pass_at_k = batch['pass_at_k']
-        num_generations = batch['generation_kwargs']['num_return_sequences']
-        processed_outputs = [
-            outputs[i * num_generations:(i + 1) * num_generations]
-            for i in range(len(batch['prompts']))
-        ]
-        payloads = []
-        for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
-                processed_outputs, batch['prompts'], batch['test_inputs'],
-                batch['test_outputs'], batch['entry_points'],
-                batch['languages']):
-            self.total += torch.tensor(1.0)
-            prompt_payload = []
-            for code_gen in sample_outputs:
-                code_gen = re.split(
-                    r'\n[A-Za-z0-9#`]',
-                    code_gen)[0]  # remove everything after function ends
-                final_code = sample_prompt + code_gen  # combine prompt with the code generation
-                generation_payload = []
-                for test_input, test_output in zip(test_inputs, test_outputs):
-                    payload = {
-                        'code': final_code,
-                        'input': test_input,
-                        'output': test_output,
-                        'entry_point': entry_point,
-                        'language': language,
-                    }
-                    generation_payload.append(payload)
-
-                prompt_payload.append(generation_payload)
-            payloads.append(prompt_payload)
-
-        results = client.invoke(payloads)
-        for prompt in results:
-            num_correct = 0
-            for generation in prompt:
-                correct = all(generation)
-                if correct:
-                    num_correct += 1
-
-            pass_at_k_rate = self.estimator(num_generations, num_correct,
-                                            pass_at_k)
-            self.correct += torch.tensor(pass_at_k_rate)
+        for sample_id, code_gen, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
+                batch['sample_id'], outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'],
+                batch['entry_points'], batch['languages']):
+
+            idx = sample_id
+            self.total[idx] += 1.0
+
+            code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0]  # remove everything after function ends
+            final_code = sample_prompt + code_gen  # combine prompt with the code generation
+
+            test_results = []
+            for test_input, test_output in zip(test_inputs, test_outputs):
+                payload = {
+                    'code': final_code,
+                    'input': test_input,
+                    'output': test_output,
+                    'entry_point': entry_point,
+                    'language': language,
+                }
+
+                result = client.invoke([[[payload]]])[0][0][0]
+                test_results.append(result)
+
+            if all(test_results):
+                self.correct[idx] += 1.0
 
         client.close()  # pyright: ignore [reportOptionalMemberAccess]
 
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
-        return self.correct / self.total
+        complete = self.total == self.num_generations  # so that eval subset batches can be used
+
+        if complete.sum() < (self.total != 0).sum():
+            warnings.warn('Some samples in the dataset have less than the expected number of generations. '
+                          'This is expected if you are using a subset of the dataset for evaluation.')
+
+        if (self.correct > self.total).any().item():
+            raise ValueError(
+                'Internal error some samples have more correct than  total generations. This should not happen.')
+
+        results = {}
+        n = self.num_generations
+
+        for k in self.pass_at_k:
+            pass_at_k = sum([self.estimator(n, int(c.item()), k) for c in self.correct[complete]
+                            ]) / complete.sum().item()
+            results[f'pass@{k}'] = torch.tensor(pass_at_k)
+
+        if len(results) == 1:  # backwards compatibility
+            return list(results.values())[0]
+
+        return results
\ No newline at end of file
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index b991d9f572..fc401662f2 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -19,7 +19,7 @@
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
                                      InContextLearningLMAccuracy,
                                      InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningQAAccuracy)
+                                     InContextLearningGenerationAccuracy)
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
 from llmfoundry.models.layers.attention import is_flash_v2_installed
@@ -112,7 +112,7 @@ def __init__(self, om_model_config: DictConfig,
             LanguagePerplexity(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningQAAccuracy(),
+            InContextLearningGenerationAccuracy(),
             InContextLearningCodeEvalAccuracy()
         ]
         if not om_model_config.get('use_train_metrics', True):
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index 90b3560eb8..a5caaa89ad 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -13,7 +13,7 @@
 from llmfoundry.eval.metrics import (InContextLearningLMAccuracy,
                                      InContextLearningMetric,
                                      InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningQAAccuracy)
+                                     InContextLearningGenerationAccuracy)
 
 
 class InferenceAPIEvalWrapper(ComposerModel):
@@ -27,7 +27,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer):
             LanguagePerplexity(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningQAAccuracy()
+            InContextLearningGenerationAccuracy()
         ]
         self.eval_metrics = {
             metric.__class__.__name__: metric for metric in eval_metrics
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 117f12fd68..a80336bbbf 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -23,7 +23,7 @@
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
                                      InContextLearningLMAccuracy,
                                      InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningQAAccuracy)
+                                     InContextLearningGenerationAccuracy)
 from llmfoundry.models.layers.attention import (is_flash_v1_installed,
                                                 is_flash_v2_installed)
 
@@ -1028,7 +1028,7 @@ def __init__(
             LanguagePerplexity(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningQAAccuracy(),
+            InContextLearningGenerationAccuracy(),
             InContextLearningCodeEvalAccuracy(),
         ]
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 15fa21e257..b9f3edd95c 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -462,8 +462,8 @@ def _validate_cfg(icl_cfg: DictConfig):
                 icl_cfg.metric_names = [
                     'InContextLearningMultipleChoiceAccuracy'
                 ]
-            elif icl_cfg.icl_task_type == 'question_answering':
-                icl_cfg.metric_names = ['InContextLearningQAAccuracy']
+            elif icl_cfg.icl_task_type == 'generation_task_with_answers':
+                icl_cfg.metric_names = ['InContextLearningGenerationAccuracy']
             elif icl_cfg.icl_task_type == 'code_evaluation':
                 icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
             else:
@@ -483,8 +483,13 @@ def _validate_cfg(icl_cfg: DictConfig):
             icl_cfg.batch_size = default_batch_size
         if 'pass_at_k' not in icl_cfg:
             icl_cfg.pass_at_k = 1
-        if 'num_beams' not in icl_cfg:
-            icl_cfg.num_beams = 20
+        if 'generations_per_sample' not in icl_cfg:
+            icl_cfg.generations_per_sample = 20
+        
+        if 'num_beams' in icl_cfg:
+            raise ValueError(
+                'num_beams is no longer supported as a top level icl_task parameter.'  + \
+                'Please use generation_kwargs.num_beams instead.')
 
     for icl_cfg in icl_tasks_list:
         assert isinstance(icl_cfg, DictConfig)
@@ -523,7 +528,7 @@ def _validate_cfg(icl_cfg: DictConfig):
                 question_prelimiter=icl_cfg.get('question_prelimiter', ''),
                 destination_path=destination_path,
                 pass_at_k=icl_cfg.pass_at_k,
-                generations_per_sample=icl_cfg.num_beams,
+                generations_per_sample=icl_cfg.generations_per_sample,
                 has_categories=icl_cfg.get('has_categories', False),
                 cot_delimiter=icl_cfg.get('cot_delimiter', ''),
                 early_stopping_criteria=early_stopping_criteria,
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index f7d92e1c99..9c65f6a3ef 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -155,7 +155,7 @@ This document explains the ICL formats compatible with [Composer](https://github
 
 Composer currently supports five ICL formats:
 
-1. [InContextLearningQATaskDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L103)
+1. [InContextLearningGenerationWithAnswersTaskDataset](TODO)
 2. [InContextLearningLMTaskDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L293)
 3. [InContextLearningMultipleChoiceTaskDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L444)
 4. [InContextLearningSchemaTaskDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L676)
@@ -163,9 +163,9 @@ Composer currently supports five ICL formats:
 
 ----
 
-### InContextLearningQATaskDataset
+### InContextLearningGenerationWithAnswersTaskDataset
 
-The ICL question answering (QA) task supports free response question answering evaluation using the model’s generate function. A QA dataset consists of a list of JSONs containing a question (under the key `context`), a correct answer (under the key `answer`), and a list of alternative spellings of the answer that would be considered permissible (under the key `aliases`). The QA task works with the NLP metric: [InContextLearningQAAccuracy](https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.metrics.InContextLearningQAAccuracy.html) which assigns a model's output to be "correct" if, conditioned on the context, the model's generate method produces a string that is a normalized prefix for either the `answer` or any of the `aliases`.
+The ICL generation with answers task supports free response generation evaluation using the model’s generate function. A generation dataset consists of a list of JSONs containing a prompt (under the key `context`), a correct answer (under the key `answer`), and a list of alternative answers that would be considered permissible (under the key `aliases`). The generation task works with the NLP metric: [InContextLearningGenerationAccuracy](TODO) which assigns a model's output to be "correct" if, conditioned on the context, the model's generate method produces a string that is a normalized prefix for either the `answer` or any of the `aliases`.
 
 Required keys for each datum:
 * `context`: str
@@ -178,7 +178,7 @@ An example datum is below:
 {"context": "What star sign is Jamie Lee Curtis?", "answer": "Scorpio", "aliases": ["Scorpio", "Skorpio"]}
 ```
 
-The QA task expects a **prompt string**, a **continuation delimiter** to separate questions from answers, an **example delimiter** to separate few shot examples from one another, and a **question prelimiter** to put before each question. If using the following settings, with 2 examples in context, the above datum may be rendered to the model as:
+The generation task expects a **prompt string**, a **continuation delimiter** to separate questions from answers, an **example delimiter** to separate few shot examples from one another, and a **question prelimiter** to put before each question. If using the following settings, with 2 examples in context, the above datum may be rendered to the model as:
 
 ```jsx
 prompt_string: "Answer the following trivia question:\n", example_delimiter: "\n", continuation_delimiter: " Answer: ", question_prelimiter: "Question: "
@@ -203,9 +203,9 @@ Below is a complete YAML section that works with the TriviaQA dataset in [`scrip
     - 5
     - 10
     batch_size: 4
-    icl_task_type: question_answering
+    icl_task_type: generation_task_with_answers
     metric_names:
-    - InContextLearningQAAccuracy
+    - InContextLearningGenerationAccuracy
     prompt_string: '' # this goes at the beginning of each input
     example_delimiter: "\n" # this goes between fewshot examples
     continuation_delimiter: ' ' # this separates questions from answers
diff --git a/scripts/eval/yamls/coding_tasks.yaml b/scripts/eval/yamls/coding_tasks.yaml
index 48131a0eae..78f2a213bc 100644
--- a/scripts/eval/yamls/coding_tasks.yaml
+++ b/scripts/eval/yamls/coding_tasks.yaml
@@ -4,7 +4,7 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/human_eval.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
 -
@@ -12,7 +12,7 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
 -
@@ -20,7 +20,7 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
 -
@@ -28,7 +28,7 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
 -
@@ -36,7 +36,7 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
 -
@@ -44,7 +44,7 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
 -
@@ -52,7 +52,7 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
 -
@@ -60,6 +60,6 @@ icl_tasks:
   dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0]
   pass_at_k: 1
-  num_beams: 5
+  generations_per_sample: 5
   batch_size: 1
   icl_task_type: code_evaluation
diff --git a/scripts/eval/yamls/tasks_v0.1.yaml b/scripts/eval/yamls/tasks_v0.1.yaml
index 44f031ae3a..6546b13dd7 100644
--- a/scripts/eval/yamls/tasks_v0.1.yaml
+++ b/scripts/eval/yamls/tasks_v0.1.yaml
@@ -10,12 +10,12 @@ icl_tasks:
   label: triviaqa_sm_sub
   dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
 -
   label: gsm8k
   dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: " #### "
   continuation_delimiter: "\nA: Let's think step by step. "
   question_prelimiter: "Q: "
@@ -23,21 +23,21 @@ icl_tasks:
   label: agi_eval_sat_math
   dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: " #### "
   continuation_delimiter: "\nA: Let's think step by step. "
 -
   label: aqua
   dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: " #### "
   continuation_delimiter: "\nA: Let's think step by step. "
 -
   label: svamp
   dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   continuation_delimiter: "\nUsing the formula below:\n"
   cot_delimiter: " #### "
   question_prelimiter: "Q: "
diff --git a/scripts/eval/yamls/tasks_v0.2.yaml b/scripts/eval/yamls/tasks_v0.2.yaml
index e23b4df1a5..ae39d87fbd 100644
--- a/scripts/eval/yamls/tasks_v0.2.yaml
+++ b/scripts/eval/yamls/tasks_v0.2.yaml
@@ -10,12 +10,12 @@ icl_tasks:
   label: triviaqa_sm_sub
   dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
 -
   label: gsm8k
   dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl
   num_fewshot: [8, 5]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: " #### "
   continuation_delimiter: "\nA: Let's think step by step. "
   question_prelimiter: "Q: "
@@ -23,21 +23,21 @@ icl_tasks:
   label: agi_eval_sat_math
   dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: " #### "
   continuation_delimiter: "\nA: Let's think step by step. "
 -
   label: aqua
   dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: " #### "
   continuation_delimiter: "\nA: Let's think step by step. "
 -
   label: svamp
   dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl
   num_fewshot: [5]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   continuation_delimiter: "\nUsing the formula below:\n"
   cot_delimiter: " #### "
   question_prelimiter: "Q: "
diff --git a/scripts/eval/yamls/tasks_v0.3.yaml b/scripts/eval/yamls/tasks_v0.3.yaml
index e02178710e..396ceaaf85 100644
--- a/scripts/eval/yamls/tasks_v0.3.yaml
+++ b/scripts/eval/yamls/tasks_v0.3.yaml
@@ -3,7 +3,7 @@ icl_tasks:
   label: gsm8k
   dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k_prepended_8shot.jsonl
   num_fewshot: [0]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: "The answer is "
   continuation_delimiter: "\n\nA:"
   question_prelimiter: ""
@@ -15,13 +15,13 @@ icl_tasks:
   label: triviaqa_sm_sub
   dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl
   num_fewshot: [3]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   do_normalization: true
 -
   label: svamp
   dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl
   num_fewshot: [5]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
   cot_delimiter: "The answer is "
   continuation_delimiter: "\n\nA:"
   question_prelimiter: "Question: "
diff --git a/tests/data/test_tasks.yaml b/tests/data/test_tasks.yaml
index cec7984320..cf02ffcbbb 100644
--- a/tests/data/test_tasks.yaml
+++ b/tests/data/test_tasks.yaml
@@ -20,4 +20,4 @@ icl_tasks:
   label: triviaqa
   dataset_uri: scripts/eval/local_data/world_knowledge/triviaqa_small.jsonl  # ADD YOUR OWN DATASET URI
   num_fewshot: [0, 1]
-  icl_task_type: question_answering
+  icl_task_type: generation_task_with_answers
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index e4fbe17bc6..90fab0810c 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -19,7 +19,7 @@
 # isort: off
 from llmfoundry.eval.datasets import (
     InContextLearningDataset, InContextLearningCodeEvalDataset,
-    InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset,
+    InContextLearningMultipleChoiceTaskDataset, InContextLearningGenerationWithAnswersTaskDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,
     tokenizer_needs_prefix_space, trim_context, get_continuation_span,
     get_fewshot_sample_idxs, make_padded_input)
@@ -33,7 +33,7 @@
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
                                      InContextLearningLMAccuracy,
                                      InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningQAAccuracy)
+                                     InContextLearningGenerationAccuracy)
 
 
 def test_strip_data():
@@ -327,7 +327,7 @@ def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -352,7 +352,7 @@ def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -590,7 +590,7 @@ def test_qa_set_cot_no_cot(tmp_path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -615,7 +615,7 @@ def test_qa_set_cot_has_cot(tmp_path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -637,7 +637,7 @@ def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -661,7 +661,7 @@ def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tiny_gpt2_tokenizer,
         max_seq_len=1024,
@@ -689,7 +689,7 @@ def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tiny_gpt2_tokenizer,
         max_seq_len=1024,
@@ -718,7 +718,7 @@ def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningQATaskDataset(
+    dl = InContextLearningGenerationWithAnswersTaskDataset(
         dataset_uri=dataset_uri,
         tokenizer=tiny_gpt2_tokenizer,
         max_seq_len=1024,
@@ -1367,7 +1367,7 @@ def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path):
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)  # for dist
     dl = get_icl_task_dataloader(
-        icl_task_type='question_answering',
+        icl_task_type='generation_task_with_answers',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=8,
@@ -1424,7 +1424,7 @@ def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer,
     seqlen = 512
     tiny_gpt2_tokenizer.eos_token_id = None
     with pytest.raises(ValueError):
-        _ = get_icl_task_dataloader('question_answering',
+        _ = get_icl_task_dataloader('generation_task_with_answers',
                                     dataset_uri,
                                     tokenizer,
                                     batch_size,
@@ -1454,7 +1454,7 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
     seqlen = 512
     # empirical number from the small test dataset
     maximum_answer_length = 7
-    dl = get_icl_task_dataloader('question_answering',
+    dl = get_icl_task_dataloader('generation_task_with_answers',
                                  dataset_uri=dataset_uri,
                                  tokenizer=tokenizer,
                                  batch_size=batch_size,
@@ -1513,7 +1513,7 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
     # empirical number from the small test dataset
     maximum_answer_length = 132
     dl = get_icl_task_dataloader(
-        'question_answering',
+        'generation_task_with_answers',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=batch_size,
@@ -1621,7 +1621,7 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
         'code_evaluation',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
-        batch_size=8,
+        batch_size=5,
         max_seq_len=1024,
         pad_tok_id=tokenizer.eos_token_id,
         num_fewshot=2,
@@ -1629,28 +1629,16 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
         example_delimiter='\n',
         continuation_delimiter='',
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
-        generations_per_sample=4,
+        generations_per_sample=3,
     )
 
     assert isinstance(dl, DataSpec)  # pyright
+    batches = list(dl.dataloader)
 
-    batch = next(iter(dl.dataloader))
-    split_batch = dl.split_batch(batch, 3)
-
-    assert len(split_batch) == 2
-    split1 = split_batch[0]
-    split2 = split_batch[1]
-
-    assert split1['input_ids'].shape[0] == 3
-    assert split2['input_ids'].shape[0] == 1
+    for k in ('input_ids', 'attention_mask'):
+        assert [b[k].shape[0] for b in batches] == [5, 5, 2]
 
-    assert split1['attention_mask'].shape[0] == 3
-    assert split2['attention_mask'].shape[0] == 1
-
-    assert isinstance(split1['mode'], str)
-    assert isinstance(split2['mode'], str)
-
-    list_split = {
+    list_keys = {
         'labels': str,
         'prompts': str,
         'tests': str,
@@ -1659,36 +1647,30 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
         'test_outputs': list,
         'languages': str,
     }
-    for k, v in list_split.items():
-        assert len(split1[k]) == 3
-        assert len(split2[k]) == 1
-        assert all(isinstance(val, v) for val in split1[k] + split2[k])
 
-    assert isinstance(split1['pass_at_k'], int)
-    assert isinstance(split2['pass_at_k'], int)
-
-    assert isinstance(split1['generation_length'], int)
-    assert isinstance(split2['generation_length'], int)
-
-    assert isinstance(split1['generation_kwargs'], dict)
-    assert isinstance(split2['generation_kwargs'], dict)
+    for batch, size in zip(batches, [5, 5, 2]):
+        for field, type_ in list_keys.items():
+            assert len(batch[field]) == size
+            assert all(isinstance(val, type_) for val in batch[field])
 
+    static_keys = {'pass_at_k': (int, list), 'generation_length': int, 'generation_kwargs': dict}
+    for batch in batches:
+        for field, type_ in static_keys.items():
+            assert isinstance(batch[field], type_)
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot,
-                                        prompt_string, generations_per_sample):
+def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample,
+                                        tiny_llama_tokenizer):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
-    transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
+    tokenizer = tiny_llama_tokenizer
     dataset_uri = f'{local_data}/{dataset_uri}'
-    batch_size = 4
+    batch_size = 5
     seqlen = 2048
 
     dl = get_icl_task_dataloader('code_evaluation',
@@ -1702,55 +1684,58 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot,
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(
-                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample)
     assert isinstance(dl, DataSpec)
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
-    batch = next(dl.dataloader._get_iterator())
+    batches = list(dl.dataloader)
+    dataset_size = len(open(dataset_uri, 'r').read().strip().split('\n'))
+    dataset_size *= generations_per_sample
 
     max_prompt_length = 0
-    if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
-        max_prompt_length = dl.dataloader.dataset.max_prompt_length
-    assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size,
-                                                    max_prompt_length)
-    assert batch['mode'] == 'generate'
-    # the maximum generation length from the small test data
-    assert batch['generation_length'] == 129
-    assert any(item[0] != tokenizer.eos_token_id
-               for item in batch['input_ids'])  # longest should be pushed left
 
-    decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(
-        item.count('Code start: \n') == num_fewshot + 1
-        for item in decoded_batch)
-
-    if len(prompt_string) > 0:
-        assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
-
-    assert batch['labels'] == [
+    has_left_padding = []
+    for i, batch in enumerate(batches):
+        if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
+            max_prompt_length = dl.dataloader.dataset.max_prompt_length
+        N = len(batches)
+        bs = batch_size if i < N - 1 else dataset_size - (N - 1) * batch_size
+        assert tuple(batch['input_ids'].shape) == (bs, max_prompt_length)
+        assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
+        assert batch['mode'] == 'generate'
+        # the maximum generation length from the small test data
+        assert batch['generation_length'] == 129
+        has_left_padding.extend([item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
+    assert not all(has_left_padding)  # longest should be pushed left
+
+    decoded_batches = [tokenizer.batch_decode(batch['input_ids']) for batch in batches]
+    for decoded_batch in decoded_batches:
+        assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+
+        if len(prompt_string) > 0:
+            assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
+
+    labels = [
         '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
         "    result = []\n    current_string = []\n    current_depth = 0\n\n    for c in paren_string:\n        if c == '(':\n            current_depth += 1\n            current_string.append(c)\n        elif c == ')':\n            current_depth -= 1\n            current_string.append(c)\n\n            if current_depth == 0:\n                result.append(''.join(current_string))\n                current_string.clear()\n\n    return result\n",
         '    return number % 1.0\n',
         '    balance = 0\n\n    for op in operations:\n        balance += op\n        if balance < 0:\n            return True\n\n    return False\n',
     ]
 
-    assert decoded_batch[0].endswith(
-        "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
-    )
-    assert decoded_batch[1].endswith(
-        "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into separate strings and return the list of those.\n    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n    Ignore any spaces in the input string.\n    >>> separate_paren_groups('( ) (( )) (( )( ))')\n    ['()', '(())', '(()())']\n    \"\"\"\n"
-    )
-    assert decoded_batch[2].endswith(
-        "Code start: \n\n\ndef truncate_number(number: float) -> float:\n    \"\"\" Given a positive floating point number, it can be decomposed into\n    and integer part (largest integer smaller than given number) and decimals\n    (leftover part always smaller than 1).\n\n    Return the decimal part of the number.\n    >>> truncate_number(3.5)\n    0.5\n    \"\"\"\n"
-    )
-    assert decoded_batch[3].endswith(
+    # assert decoded_batch[0].endswith(
+    samples = [
+        "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n",
+        "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into separate strings and return the list of those.\n    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n    Ignore any spaces in the input string.\n    >>> separate_paren_groups('( ) (( )) (( )( ))')\n    ['()', '(())', '(()())']\n    \"\"\"\n",
+        "Code start: \n\n\ndef truncate_number(number: float) -> float:\n    \"\"\" Given a positive floating point number, it can be decomposed into\n    and integer part (largest integer smaller than given number) and decimals\n    (leftover part always smaller than 1).\n\n    Return the decimal part of the number.\n    >>> truncate_number(3.5)\n    0.5\n    \"\"\"\n",
         "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n    \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n    at that point function should return True. Otherwise it should return False.\n    >>> below_zero([1, 2, 3])\n    False\n    >>> below_zero([1, 2, -4, 5])\n    True\n    \"\"\"\n"
-    )
-
-
+    ]
+    for i in range(4):
+        for j in range(generations_per_sample):
+            k = i * generations_per_sample + j
+            b, n = divmod(k, batch_size)
+            assert batches[b]['labels'][n] == labels[i]
+            assert decoded_batches[b][n].endswith(samples[i])
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 def test_code_eval_test_cases(dataset_uri, tmp_path):
     pytest.importorskip('datasets')
@@ -1838,15 +1823,13 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot,
-                                   prompt_string, generations_per_sample):
+def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 2048
@@ -1862,8 +1845,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot,
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(
-                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1872,59 +1854,60 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot,
     assert isinstance(dl, DataSpec)
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
-    batch = next(dl.dataloader._get_iterator())
-
-    max_prompt_length = 0
-    if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
-        max_prompt_length = dl.dataloader.dataset.max_prompt_length
-    assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length)
-    assert tuple(batch['attention_mask'].shape) == (batch_size,
-                                                    max_prompt_length)
-    assert batch['mode'] == 'generate'
-    # the maximum generation length from the small test data
-    assert batch['generation_length'] == 122
-    assert any(item[0] != tokenizer.eos_token_id
-               for item in batch['input_ids'])  # longest should be pushed left
-
-    decoded_batch = tokenizer.batch_decode(batch['input_ids'])
-    assert all(
-        item.count('Code start: \n') == num_fewshot + 1
-        for item in decoded_batch)
-
-    if len(prompt_string) > 0:
-        assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
-
-    assert batch['labels'] == [
+    batches = list(dl.dataloader)
+    dataset_size = len(open(dataset_uri, 'r').read().strip().split('\n'))
+    dataset_size *= generations_per_sample
+
+    has_left_padding = []
+    for i, batch in enumerate(batches):
+        max_prompt_length = 0
+        if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset):
+            max_prompt_length = dl.dataloader.dataset.max_prompt_length
+        N = len(batches)
+        bs = batch_size if i < N - 1 else dataset_size - (N - 1) * batch_size
+        assert tuple(batch['input_ids'].shape) == (bs, max_prompt_length)
+        assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
+        assert batch['mode'] == 'generate'
+        # the maximum generation length from the small test data
+        assert batch['generation_length'] == 122
+        has_left_padding.extend([item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
+    assert not all(has_left_padding)  # longest should be pushed left
+
+    decoded_batches = [tokenizer.batch_decode(batch['input_ids']) for batch in batches]
+    for decoded_batch in decoded_batches:
+        assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+
+        if len(prompt_string) > 0:
+            assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
+
+    labels = [
         '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
         "    result = []\n    current_string = []\n    current_depth = 0\n\n    for c in paren_string:\n        if c == '(':\n            current_depth += 1\n            current_string.append(c)\n        elif c == ')':\n            current_depth -= 1\n            current_string.append(c)\n\n            if current_depth == 0:\n                result.append(''.join(current_string))\n                current_string.clear()\n\n    return result\n",
         '    return number % 1.0\n',
         '    balance = 0\n\n    for op in operations:\n        balance += op\n        if balance < 0:\n            return True\n\n    return False\n',
     ]
 
-    assert decoded_batch[0].endswith(
-        "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
-    )
-    assert decoded_batch[1].endswith(
-        "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into separate strings and return the list of those.\n    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n    Ignore any spaces in the input string.\n    >>> separate_paren_groups('( ) (( )) (( )( ))')\n    ['()', '(())', '(()())']\n    \"\"\"\n"
-    )
-    assert decoded_batch[2].endswith(
-        "Code start: \n\n\ndef truncate_number(number: float) -> float:\n    \"\"\" Given a positive floating point number, it can be decomposed into\n    and integer part (largest integer smaller than given number) and decimals\n    (leftover part always smaller than 1).\n\n    Return the decimal part of the number.\n    >>> truncate_number(3.5)\n    0.5\n    \"\"\"\n"
-    )
-    assert decoded_batch[3].endswith(
+    # assert decoded_batch[0].endswith(
+    samples = [
+        "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n",
+        "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into separate strings and return the list of those.\n    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n    Ignore any spaces in the input string.\n    >>> separate_paren_groups('( ) (( )) (( )( ))')\n    ['()', '(())', '(()())']\n    \"\"\"\n",
+        "Code start: \n\n\ndef truncate_number(number: float) -> float:\n    \"\"\" Given a positive floating point number, it can be decomposed into\n    and integer part (largest integer smaller than given number) and decimals\n    (leftover part always smaller than 1).\n\n    Return the decimal part of the number.\n    >>> truncate_number(3.5)\n    0.5\n    \"\"\"\n",
         "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n    \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n    at that point function should return True. Otherwise it should return False.\n    >>> below_zero([1, 2, 3])\n    False\n    >>> below_zero([1, 2, -4, 5])\n    True\n    \"\"\"\n"
-    )
-
-
+    ]
+    for i in range(4):
+        for j in range(generations_per_sample):
+            k = i * generations_per_sample + j
+            b, n = divmod(k, batch_size)
+            assert batches[b]['labels'][n] == labels[i]
+            assert decoded_batches[b][n].endswith(samples[i])
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot,
-                          tmp_path):
+def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 512
@@ -1940,8 +1923,7 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot,
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(
-                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=1,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1965,12 +1947,10 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot,
         assert microbatch['generation_kwargs']['top_k'] == 40
         assert microbatch['generation_kwargs']['pad_token_id'] == 0
         assert microbatch['generation_kwargs']['num_beams'] == 1
-        assert microbatch['generation_kwargs']['num_return_sequences'] == 1
         assert microbatch['generation_kwargs']['do_sample'] == True
         assert microbatch['generation_kwargs']['use_cache'] == True
         assert microbatch['generation_kwargs']['eos_token_id'] == 0
 
-
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.gpu
@@ -2217,7 +2197,7 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model,
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
     dl = get_icl_task_dataloader(
-        'question_answering',
+        'generation_task_with_answers',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=batch_size,
@@ -2232,21 +2212,21 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model,
 
     evaluator = Evaluator(label='triviaqa',
                           dataloader=dl,
-                          metric_names=['InContextLearningQAAccuracy'])
+                          metric_names=['InContextLearningGenerationAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
-        eval_metrics=[InContextLearningQAAccuracy()],
+        eval_metrics=[InContextLearningGenerationAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/triviaqa/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [5])
@@ -2271,7 +2251,7 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer,
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
     dl = get_icl_task_dataloader(
-        'question_answering',
+        'generation_task_with_answers',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=batch_size,
@@ -2287,20 +2267,20 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer,
 
     evaluator = Evaluator(label='gsm8k',
                           dataloader=dl,
-                          metric_names=['InContextLearningQAAccuracy'])
+                          metric_names=['InContextLearningGenerationAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
-        eval_metrics=[InContextLearningQAAccuracy()],
+        eval_metrics=[InContextLearningGenerationAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/gsm8k/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningGenerationAccuracy'][
         0][1].item() == 0
 
 
@@ -2323,7 +2303,7 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
     dl = get_icl_task_dataloader(
-        'question_answering',
+        'generation_task_with_answers',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=batch_size,
@@ -2338,22 +2318,22 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
 
     evaluator = Evaluator(label='triviaqa',
                           dataloader=dl,
-                          metric_names=['InContextLearningQAAccuracy'])
+                          metric_names=['InContextLearningGenerationAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
         tokenizer=tiny_gpt2_tokenizer,
-        eval_metrics=[InContextLearningQAAccuracy()],
+        eval_metrics=[InContextLearningGenerationAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/triviaqa/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0
+        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
@@ -2376,7 +2356,7 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri,
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
     dl = get_icl_task_dataloader(
-        'question_answering',
+        'generation_task_with_answers',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=batch_size,
@@ -2392,21 +2372,21 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri,
 
     evaluator = Evaluator(label='gsm8k',
                           dataloader=dl,
-                          metric_names=['InContextLearningQAAccuracy'])
+                          metric_names=['InContextLearningGenerationAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
         tokenizer=tiny_gpt2_tokenizer,
-        eval_metrics=[InContextLearningQAAccuracy()],
+        eval_metrics=[InContextLearningGenerationAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/gsm8k/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][
+    assert in_memory_logger.data['metrics/gsm8k/InContextLearningGenerationAccuracy'][
         0][1].item() == 0
 
 
@@ -2476,7 +2456,7 @@ def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer,
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
     torch.use_deterministic_algorithms(False)
-    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    trainer.eval(eval_dataloader=evaluator)
     torch.use_deterministic_algorithms(True)
     assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
     )
@@ -2532,7 +2512,7 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri,
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
     torch.use_deterministic_algorithms(False)
-    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    trainer.eval(eval_dataloader=evaluator)
     torch.use_deterministic_algorithms(True)
     assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
     )
@@ -2589,7 +2569,7 @@ def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri,
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
     torch.use_deterministic_algorithms(False)
-    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    trainer.eval(eval_dataloader=evaluator)
     torch.use_deterministic_algorithms(True)
     assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys(
     )
@@ -2730,7 +2710,7 @@ def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer,
     maximum_answer_length = 4
 
     dl = get_icl_task_dataloader(
-        'question_answering',
+        'generation_task_with_answers',
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=batch_size,
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 2b498db87e..01e1eb84c6 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -5,11 +5,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import torch
-
+import pytest
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
                                      InContextLearningLMAccuracy,
                                      InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningQAAccuracy)
+                                     InContextLearningGenerationAccuracy)
 
 
 def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
@@ -53,7 +53,7 @@ def test_in_context_learning_qa_accuracy():
     ]
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']]
     batch = {'cot_delimiter': '', 'labels': labels}
-    metric = InContextLearningQAAccuracy()
+    metric = InContextLearningGenerationAccuracy()
     metric.update(batch, outputs, labels)
 
     assert metric.compute() == (2 / 3)
@@ -73,7 +73,7 @@ def test_in_context_learning_qa_cot_accuracy():
         'do_normalization': True,
         'stopping_criteria': '\n\n'
     }
-    metric = InContextLearningQAAccuracy()
+    metric = InContextLearningGenerationAccuracy()
     metric.update(batch, outputs, labels)
 
     assert metric.compute() == (2 / 4)
@@ -89,27 +89,37 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch):
         '    return n + 1'
     ]  # correct
     labels = []
-    prompts = [
-        'def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n'
-    ]
+    prompts = ['def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n']
     entry_points = ['fib', 'multiply_by_two', 'add_one']
-    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'],
-                   ['(1,)', '(2,)', '(4,)']]
+    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)']]
     test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']]
+    sample_ids = [0, 1, 2]
     languages = ['python', 'python', 'python']
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
+    generations_per_sample = 2
+
+    def repeat(values):
+        return [val for val in values for _ in range(generations_per_sample)]
+
+    transformers = pytest.importorskip('transformers')
+    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer.pad_token = tokenizer.eos_token
+    input_ids = tokenizer.batch_encode_plus(repeat(prompts), return_tensors='pt', padding=True)['input_ids']
     batch = {
         # This tests deterministic beam search rather than sampling
+        'input_ids': input_ids,
         'generation_kwargs': {
             'num_beams': 1,
-            'num_return_sequences': 2
         },
-        'prompts': prompts,
-        'pass_at_k': 1,
-        'entry_points': entry_points,
-        'test_inputs': test_inputs,
-        'test_outputs': test_outputs,
-        'languages': languages,
+        'prompts': repeat(prompts),
+        'pass_at_k': [1],
+        'entry_points': repeat(entry_points),
+        'test_inputs': repeat(test_inputs),
+        'test_outputs': repeat(test_outputs),
+        'languages': repeat(languages),
+        'dataset_size': len(prompts),
+        'generations_per_sample': generations_per_sample,
+        'sample_id': repeat(sample_ids),
     }
     metric = InContextLearningCodeEvalAccuracy()
     metric.update(batch, outputs, labels)
@@ -121,7 +131,6 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch):
     # mean: 0.5
     assert metric.compute() == 0.5
 
-
 def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer):
     contexts = [
         'Q: How do you cook a cake?', 'Q: How do you cook a cake?',

From a3f5a313e324f493bbccb2005a83c2cd9c4e0125 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 23 Feb 2024 11:18:14 -0500
Subject: [PATCH 17/59] restore

---
 mcli/mcli-1b-eval.yaml                |  4 ++--
 mcli/mcli-1b-max-seq-len-8k.yaml      |  6 +++---
 mcli/mcli-1b.yaml                     |  6 +++---
 mcli/mcli-benchmark-mpt.yaml          |  6 +++---
 mcli/mcli-convert-composer-to-hf.yaml |  4 ++--
 mcli/mcli-hf-eval.yaml                | 17 ++++++++---------
 mcli/mcli-hf-generate.yaml            |  6 +++---
 mcli/mcli-llama2-finetune.yaml        |  6 +++---
 mcli/mcli-openai-eval.yaml            |  4 ++--
 mcli/mcli-pretokenize-oci-upload.yaml |  4 ++--
 10 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml
index 568d35ef17..9ae77af6ca 100644
--- a/mcli/mcli-1b-eval.yaml
+++ b/mcli/mcli-1b-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit: # OR use your commit hash
   pip_install: -e .[gpu]
   ssh_clone: false  # Should be true if using a private repo
@@ -9,7 +9,7 @@ integrations:
 command: |
   cd llm-foundry/scripts/
   composer eval/eval.py /mnt/config/parameters.yaml
-image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 name: mpt-1b-eval
 
 compute:
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
index 75c5f26d63..e413c3bf81 100644
--- a/mcli/mcli-1b-max-seq-len-8k.yaml
+++ b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -1,9 +1,9 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit:  # OR use your commit hash
-  pip_install: -e .[gpu]
+  pip_install: -e .[gpu-flash2]
   ssh_clone: false  # Should be true if using a private repo
 
 # We are fetching, converting, and training on the 'val' split
@@ -17,7 +17,7 @@ command: |
     --out_root ./my-copy-c4 --splits train_small val_small \
     --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py /mnt/config/parameters.yaml
-image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 name: mpt-1b-ctx-8k-gpus-8
 
 compute:
diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml
index a6fb565085..3713d29cc9 100644
--- a/mcli/mcli-1b.yaml
+++ b/mcli/mcli-1b.yaml
@@ -1,9 +1,9 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit:  # OR use your commit hash
-  pip_install: -e .[gpu]
+  pip_install: -e .[gpu-flash2]
   ssh_clone: false  # Should be true if using a private repo
 
 # We are fetching, converting, and training on the 'val' split
@@ -21,7 +21,7 @@ command: |
     eval_loader.dataset.split=val_small \
     max_duration=100ba \
     eval_interval=0
-image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 name: mpt-1b-gpus-8
 
 compute:
diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml
index 8fd18b9d29..cb8adcac00 100644
--- a/mcli/mcli-benchmark-mpt.yaml
+++ b/mcli/mcli-benchmark-mpt.yaml
@@ -6,14 +6,14 @@ compute:
   # cluster: TODO # Name of the cluster to use for this run
   # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments
 
-image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit: # OR use your commit hash
-  pip_install: ".[gpu]"
+  pip_install: ".[gpu-flash2]"
 
 command: |
   cd llm-foundry/scripts/inference/benchmarking
diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml
index 5804fc8d7a..8ef894bf85 100644
--- a/mcli/mcli-convert-composer-to-hf.yaml
+++ b/mcli/mcli-convert-composer-to-hf.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit:  # OR use your commit hash
   pip_install: -e .
   ssh_clone: false  # Should be true if using a private repo
@@ -13,7 +13,7 @@ command: |
     --hf_output_path s3://bucket/folder/hf/ \
     --output_precision bf16 \
 
-image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 name: convert-composer-hf
 
 compute:
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 10d32d6e63..6800319df2 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,21 +1,20 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: migrate_subclasses_to_foundry  # v0.4.0
+  git_branch: v0.5.0
   # git_commit:  # OR use your commit hash
-  pip_install: -e ".[gpu]"
+  pip_install: -e ".[gpu-flash2]"
   ssh_clone: false  # Should be true if using a private repo
 
 command: |
   cd llm-foundry/scripts
-  pip uninstall mosaicml -y; pip install git+https://github.com/mosaicml/composer.git@dev
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
-name: mpt-eval
+run_name: mpt-eval
 gpu_num: 8
-gpu_type: a100_80gb
-cluster: r1z1  # replace with your cluster here!
+# gpu_type:
+# cluster:  # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
@@ -29,16 +28,16 @@ parameters:
 
   models:
   -
-    model_name: mosaicml/mpt-7b
+    model_name: mosaicml/mpt-7b-instruct
     # Tokenizer
     tokenizer:
-      name: mosaicml/mpt-7b
+      name: EleutherAI/gpt-neox-20b
       kwargs:
         model_max_length: ${max_seq_len}
 
     model:
       name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b
+      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
       init_device: mixed
       pretrained: true
       use_auth_token: false
diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml
index 566f073e7b..6880564a06 100644
--- a/mcli/mcli-hf-generate.yaml
+++ b/mcli/mcli-hf-generate.yaml
@@ -1,9 +1,9 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit: # OR use your commit hash
-  pip_install: -e .[gpu]
+  pip_install: -e .[gpu-flash2]
   ssh_clone: false  # Should be true if using a private repo
 
 command: |
@@ -35,7 +35,7 @@ command: |
       "Here's a quick recipe for baking chocolate chip cookies: Start by" \
       "The best 5 cities to visit in Europe are"
 
-image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 name: hf-generate
 
 compute:
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index bf71ff890e..36de709aed 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -1,15 +1,15 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit: # OR use your commit hash
-  pip_install: -e .[gpu]
+  pip_install: -e .[gpu-flash2]
   ssh_clone: false  # Should be true if using a private repo
 
 command: |
   cd llm-foundry/scripts
   composer train/train.py /mnt/config/parameters.yaml
-image: mosaicml/llm-foundry:1.13.1_cu117-latest
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 name: llama2-finetune
 
 compute:
diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml
index 99dd42b6e2..38844a76cf 100644
--- a/mcli/mcli-openai-eval.yaml
+++ b/mcli/mcli-openai-eval.yaml
@@ -1,9 +1,9 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit:  # OR use your commit hash
-  pip_install: -e ".[gpu,openai]"
+  pip_install: -e ".[gpu-flash2,openai]"
   ssh_clone: false  # Should be true if using a private repo
 
 command: |
diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml
index e26ad00722..4a4781cea3 100644
--- a/mcli/mcli-pretokenize-oci-upload.yaml
+++ b/mcli/mcli-pretokenize-oci-upload.yaml
@@ -1,5 +1,5 @@
 name: c4-2k-pre-tokenized
-image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 compute:
   gpus: 8  # Number of GPUs to use
 
@@ -14,7 +14,7 @@ integrations:
   - oci-cli==3.23.2
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.4.0
+  git_branch: v0.5.0
   # git_commit: # OR use your commit hash
   pip_install: "."
   ssh_clone: false  # Should be true if using a private repo

From 4a1cd7971f3ca24de58a1a3898f0803c0b7c1fda Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Fri, 23 Feb 2024 11:56:58 -0500
Subject: [PATCH 18/59] add

---
 .gitignore                                   | 1 +
 tests/eval/local_data/gsm8k_small.jsonl      | 4 ++++
 tests/eval/local_data/hellaswag_small.jsonl  | 4 ++++
 tests/eval/local_data/human_eval_small.jsonl | 4 ++++
 tests/eval/local_data/lambada_small.jsonl    | 4 ++++
 tests/eval/local_data/mmlu_small.jsonl       | 4 ++++
 tests/eval/local_data/piqa_small.jsonl       | 4 ++++
 tests/eval/local_data/pubmed_sm.jsonl        | 4 ++++
 tests/eval/local_data/triviaqa_small.jsonl   | 4 ++++
 tests/eval/local_data/winograd_small.jsonl   | 4 ++++
 10 files changed, 37 insertions(+)
 create mode 100644 tests/eval/local_data/gsm8k_small.jsonl
 create mode 100644 tests/eval/local_data/hellaswag_small.jsonl
 create mode 100644 tests/eval/local_data/human_eval_small.jsonl
 create mode 100644 tests/eval/local_data/lambada_small.jsonl
 create mode 100644 tests/eval/local_data/mmlu_small.jsonl
 create mode 100644 tests/eval/local_data/piqa_small.jsonl
 create mode 100644 tests/eval/local_data/pubmed_sm.jsonl
 create mode 100644 tests/eval/local_data/triviaqa_small.jsonl
 create mode 100644 tests/eval/local_data/winograd_small.jsonl

diff --git a/.gitignore b/.gitignore
index d041a25c22..1dd80a8b6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 my-copy-c4*/
 my-copy-arxiv*/
 *.jsonl*
+!tests/eval/local_data/*.jsonl
 
 # WandB
 wandb/
diff --git a/tests/eval/local_data/gsm8k_small.jsonl b/tests/eval/local_data/gsm8k_small.jsonl
new file mode 100644
index 0000000000..522966c902
--- /dev/null
+++ b/tests/eval/local_data/gsm8k_small.jsonl
@@ -0,0 +1,4 @@
+{"context": "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", "chain_of_thought": "Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.", "answer": "18"}
+{"context": "A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?", "chain_of_thought": "It takes 2/2=<<2/2=1>>1 bolt of white fiber\nSo the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric", "answer": "3"}
+{"context": "Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?", "chain_of_thought": "The cost of the house and repairs came out to 80,000+50,000=$<<80000+50000=130000>>130,000\nHe increased the value of the house by 80,000*1.5=<<80000*1.5=120000>>120,000\nSo the new value of the house is 120,000+80,000=$<<120000+80000=200000>>200,000\nSo he made a profit of 200,000-130,000=$<<200000-130000=70000>>70,000", "answer": "70000"}
+{"context": "James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?", "chain_of_thought": "He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*60=<<9*60=540>>540 meters", "answer": "540"}
diff --git a/tests/eval/local_data/hellaswag_small.jsonl b/tests/eval/local_data/hellaswag_small.jsonl
new file mode 100644
index 0000000000..d2e37771c9
--- /dev/null
+++ b/tests/eval/local_data/hellaswag_small.jsonl
@@ -0,0 +1,4 @@
+{"query": "Removing ice from car: Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. Then", "choices": [", the man adds wax to the windshield and cuts it.", ", a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.", ", the man puts on a christmas coat, knitted with netting.", ", the man continues removing the snow on his car."], "gold": 3}
+{"query": "Baking cookies: A female chef in white uniform shows a stack of baking pans in a large kitchen presenting them. The pans", "choices": ["contain egg yolks and baking soda.", "are then sprinkled with brown sugar.", "are placed in a strainer on the counter.", "are filled with pastries and loaded into the oven."], "gold": 3}
+{"query": "Baking cookies: A female chef in white uniform shows a stack of baking pans in a large kitchen presenting them. The pans are filled with pastries and loaded into the oven. A knife", "choices": ["is seen moving on a board and cutting out its contents.", "hits the peeled cheesecake, followed by sliced custard and still cooked ice cream.", "etches a shape into the inside of the baked pans.", "is used to cut cylinder shaped dough into rounds."], "gold": 3}
+{"query": "Baking cookies: A tray of potatoes is loaded into the oven and removed. A large tray of cake is flipped over and placed on counter. A large tray of meat", "choices": ["is placed onto a baked potato.", ", ls, and pickles are placed in the oven.", "is poured into a midden.", "is prepared then it is removed from the oven by a helper when done."], "gold": 3}
diff --git a/tests/eval/local_data/human_eval_small.jsonl b/tests/eval/local_data/human_eval_small.jsonl
new file mode 100644
index 0000000000..850d46e031
--- /dev/null
+++ b/tests/eval/local_data/human_eval_small.jsonl
@@ -0,0 +1,4 @@
+{"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": "    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n", "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n", "test_inputs": ["([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3)", "([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.95)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.8)", "([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1)", "([1.1, 2.2, 3.1, 4.1, 5.1], 1.0)", "([1.1, 2.2, 3.1, 4.1, 5.1], 0.5)"], "test_outputs": ["True", "False", "True", "False", "True", "True", "False"], "language": "python"}
+{"task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n    \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n    separate those group into separate strings and return the list of those.\n    Separate groups are balanced (each open brace is properly closed) and not nested within each other\n    Ignore any spaces in the input string.\n    >>> separate_paren_groups('( ) (( )) (( )( ))')\n    ['()', '(())', '(()())']\n    \"\"\"\n", "entry_point": "separate_paren_groups", "canonical_solution": "    result = []\n    current_string = []\n    current_depth = 0\n\n    for c in paren_string:\n        if c == '(':\n            current_depth += 1\n            current_string.append(c)\n        elif c == ')':\n            current_depth -= 1\n            current_string.append(c)\n\n            if current_depth == 0:\n                result.append(''.join(current_string))\n                current_string.clear()\n\n    return result\n", "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate('(()()) ((())) () ((())()())') == [\n        '(()())', '((()))', '()', '((())()())'\n    ]\n    assert candidate('() (()) ((())) (((())))') == [\n        '()', '(())', '((()))', '(((())))'\n    ]\n    assert candidate('(()(())((())))') == [\n        '(()(())((())))'\n    ]\n    assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)", "('( ) (( )) (( )( ))',)"], "test_outputs": ["['(()())', '((()))', '()', '((())()())']", "['()', '(())', '((()))', '(((())))']", "['(()(())((())))']", "['()', '(())', '(()())']"], "language": "python"}
+{"task_id": "HumanEval/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n    \"\"\" Given a positive floating point number, it can be decomposed into\n    and integer part (largest integer smaller than given number) and decimals\n    (leftover part always smaller than 1).\n\n    Return the decimal part of the number.\n    >>> truncate_number(3.5)\n    0.5\n    \"\"\"\n", "entry_point": "truncate_number", "canonical_solution": "    return number % 1.0\n", "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate(3.5) == 0.5\n    assert abs(candidate(1.33) - 0.33) < 1e-6\n    assert abs(candidate(123.456) - 0.456) < 1e-6\n", "test_inputs": ["(3.5,)", "(1.33,)", "(123.456,)"], "test_outputs": ["0.5", "0.33000000000000007", "0.45600000000000307"], "language": "python"}
+{"task_id": "HumanEval/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n    \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n    at that point function should return True. Otherwise it should return False.\n    >>> below_zero([1, 2, 3])\n    False\n    >>> below_zero([1, 2, -4, 5])\n    True\n    \"\"\"\n", "entry_point": "below_zero", "canonical_solution": "    balance = 0\n\n    for op in operations:\n        balance += op\n        if balance < 0:\n            return True\n\n    return False\n", "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([]) == False\n    assert candidate([1, 2, -3, 1, 2, -3]) == False\n    assert candidate([1, 2, -4, 5, 6]) == True\n    assert candidate([1, -1, 2, -2, 5, -5, 4, -4]) == False\n    assert candidate([1, -1, 2, -2, 5, -5, 4, -5]) == True\n    assert candidate([1, -2, 2, -2, 5, -5, 4, -4]) == True\n", "test_inputs": ["([],)", "([1, 2, -3, 1, 2, -3],)", "([1, 2, -4, 5, 6],)", "([1, -1, 2, -2, 5, -5, 4, -4],)", "([1, -1, 2, -2, 5, -5, 4, -5],)", "([1, -2, 2, -2, 5, -5, 4, -4],)"], "test_outputs": ["False", "False", "True", "False", "True", "True"], "language": "python"}
diff --git a/tests/eval/local_data/lambada_small.jsonl b/tests/eval/local_data/lambada_small.jsonl
new file mode 100644
index 0000000000..5a0dc238ae
--- /dev/null
+++ b/tests/eval/local_data/lambada_small.jsonl
@@ -0,0 +1,4 @@
+{"context": "With Tristran's next step he was standing beside a lake, and the candlelight shone brightly on the water; and then he was walking through the mountains, through lonely crags, where the candlelight was reflected in the eyes of the creatures of the high snows; and then he was walking through the clouds, which, while not entirely substantial, still supported his weight in comfort; and then, holding tightly to his candle, he was underground, and the candlelight glinted back at him from the wet cave walls; now he was in the mountains once more; and then he was on a road through wild forest, and he glimpsed a chariot being pulled by two goats, being driven by a woman in a red dress who looked, for the glimpse he got of her, the way Boadicea was drawn in his history books; and another step and he was in a leafy glen, and he could hear the chuckle of water as it splashed and sang its way into a small brook.\n\nHe took another step, but he was still in the", "continuation": "glen"}
+{"context": "Todd replied: No I thought you looked familiar but I can’t recall.                                                                    The stranger told Todd: I’m Enoch; we met in your dream. Todd looked back again, this time he realized it really was Enoch; Todd stopped on the side of the road, leaned back and tried to see if he was dreaming.                                                                  When Enoch said: No Todd you’re not", "continuation": "dreaming"}
+{"context": "The Librarian thumbed through the bundle of pages, stopping on the final sheet and began reading, “It is our conclusion that much of the work that is currently done in the Library can be out-sourced to contractors, particularly non-skill specific work such as shelving, stacking...”\nLucy gulped and Gillian began to open her mouth to protest again, but the Librarian carried on regardless, his voice becoming louder in order to drown out any potentially dissenting voices, “... blah, blah, blah.  It is our recommendation that a downsizing of the non-essential and part-time members of staff would bring instant economy of scale benefits and would allow for the implementation of a new middle management structure.”\n“You mean sacrifice the troops to pay for the generals,” said", "continuation": "Gillian"}
+{"context": "He was small, even for a dwarf, and his poor taste in sorcerous robes contrasted awkwardly with D’jebee’s elegant attire; her long, diaphanous gown and his chemical-stained, star-spangled robe clashed almost as much as her vacuous expression alongside his own visage, alive as it was with cunning and a twisted intelligence.\n\nD’jebee sighed with boredom.\n\n‘What is it, my love?’ Poldanyelz oozed with ersatz concern.\n\n‘I’m bored,’ D’jebee complained undiplomatically. ‘No one ever comes here. I never see anyone except you.’\n\nA shuffling from the main arch alerted her to the inaccuracy of her", "continuation": "statement"}
diff --git a/tests/eval/local_data/mmlu_small.jsonl b/tests/eval/local_data/mmlu_small.jsonl
new file mode 100644
index 0000000000..90eb402607
--- /dev/null
+++ b/tests/eval/local_data/mmlu_small.jsonl
@@ -0,0 +1,4 @@
+{"query": "Question: How is IP address spoofing detected?\n(A) Installing and configuring a IDS that can read the IP header (B) Comparing the TTL values of the actual and spoofed addresses (C) Implementing a firewall to the network (D) Identify all TCP sessions that are initiated but does not complete successfully\n", "gold": 1, "choices": ["A", "B", "C", "D"], "category": "computer_security"}
+{"query": "Question: Which of the following is not an example of presentation layer issues?\n(A) Poor handling of unexpected input can lead to the execution of arbitrary instructions (B) Unintentional or ill-directed use of superficially supplied input (C) Cryptographic flaws in the system may get exploited to evade privacy (D) Weak or non-existent authentication mechanisms\n", "gold": 3, "choices": ["A", "B", "C", "D"], "category": "computer_security"}
+{"query": "Question: Suppose Unix did not provide a way of passing file descriptors between processes, but still allowed inheriting file descriptors from a parent on fork and exec. What aspects of the OKWS design would break without file descriptor passing?\n1. It would be impossible for services to send messages to oklogd.\n2. It would be impossible for services to get a TCP connection to a database proxy.\n(A) True, True (B) False, False (C) True, False (D) False, True\n", "gold": 1, "choices": ["A", "B", "C", "D"], "category": "computer_security"}
+{"query": "Question: Why would a ping sweep be used?\n(A) To identify live systems (B) To locate live systems (C) To identify open ports (D) To locate firewalls\n", "gold": 0, "choices": ["A", "B", "C", "D"], "category": "computer_security"}
diff --git a/tests/eval/local_data/piqa_small.jsonl b/tests/eval/local_data/piqa_small.jsonl
new file mode 100644
index 0000000000..07b1b27509
--- /dev/null
+++ b/tests/eval/local_data/piqa_small.jsonl
@@ -0,0 +1,4 @@
+{"choices": ["Pour it onto a plate", "Pour it into a jar"], "gold": 1, "query": "When boiling butter, when it's ready, you can"}
+{"choices": ["Weld the metal together to get it to stay firmly in place", "Nail the metal together to get it to stay firmly in place"], "gold": 0, "query": "To permanently attach metal legs to a chair, you can"}
+{"choices": ["leave a space before starting the writing", "press the spacebar"], "gold": 0, "query": "how do you indent something?"}
+{"choices": ["move it up and down and side to side quickly.", "stir it very quickly."], "gold": 0, "query": "how do you shake something?"}
diff --git a/tests/eval/local_data/pubmed_sm.jsonl b/tests/eval/local_data/pubmed_sm.jsonl
new file mode 100644
index 0000000000..c39bab0b04
--- /dev/null
+++ b/tests/eval/local_data/pubmed_sm.jsonl
@@ -0,0 +1,4 @@
+{"context": "Context: PURPOSE. To assess whether eligibility to an adjuvant chemotherapy protocol in itself represents a good prognostic factor after radical cystectomy for bladder cancer.\nPATIENTS AND METHODS. Between April 1984 and May 1989, our institution entered 35 patients with invasive bladder cancer into the Swiss Group for Clinical and Epidemiological Cancer Research (SAKK) study 09/84. They were randomly assigned to either observation or three postoperative courses of cisplatin monotherapy after cystectomy. This study had a negative result. The outcome of these 35 patients (protocol group) was compared with an age- and tumor-stage-matched cohort (matched group; n = 35) who also underwent cystectomy during the same period, but were not entered into the SAKK study, as well as the remaining 57 patients treated during the study period for the same indication (remaining group).\nRESULTS. Median overall survival decreased from 76.3 months in the protocol group to 52.1 months in the matched group and to 20.3 months in the remaining group. The respective times of median recurrence-free survival were 67.2, 16.0, and 9.4 months. Tumor progression occurred in 46% of the protocol group compared with 69% in the matched group and 65% in the remaining group (P<.05). Cancer-related death was noted in 40% of the protocol group, 57% in the matched group, and 56% in the remaining group.\nQuestion: Is eligibility for a chemotherapy protocol a good prognostic factor for invasive bladder cancer after radical cystectomy?\nA. yes\nB. no\nC. maybe\nAnswer: ", "continuation": "yes"}
+{"context": "Context: BACKGROUND. This study was performed to describe the treatment plan modifications after a geriatric oncology clinic. Assessment of health and functional status and cancer assessment was performed in older cancer patients referred to a cancer center.\nPATIENTS AND METHODS. Between June 2004 and May 2005, 105 patients 70 years old or older referred to a geriatric oncology consultation at the Institut Curie cancer center were included. Functional status, nutritional status, mood, mobility, comorbidity, medication, social support, and place of residence were assessed. Oncology data and treatment decisions were recorded before and after this consultation. Data were analyzed for a possible correlation between one domain of the assessment and modification of the treatment plan.\nRESULTS. Patient characteristics included a median age of 79 years and a predominance of women with breast cancer. About one half of patients had an independent functional status. Nearly 15% presented severe undernourishment. Depression was suspected in 53.1% of cases. One third of these patients had>2 chronic diseases, and 74% of patients took>or =3 medications. Of the 93 patients with an initial treatment decision, the treatment plan was modified for 38.7% of cases after this assessment. Only body mass index and the absence of depressive symptoms were associated with a modification of the treatment plan.\nQuestion: Does a geriatric oncology consultation modify the cancer treatment plan for elderly patients?\nA. yes\nB. no\nC. maybe\nAnswer: ", "continuation": "yes"}
+{"context": "Context: BACKGROUND. The alterations of echocardiography and electrocardiogram (ECG) in patients received left atrial appendage LAA occlusion therapy are still unclear. The present study was to evaluate the influence of LAA occlusion device on echocardiography and ECG changes in patients with atrial fibrillation (AF).\nMETHODS. Seventy-three patients who had undergone Watchman, LAmbre and Lefort were enrolled in this study. Echocardiography and ECG results at pre- and post-operation were collected. Besides, echocardiography was also performed during follow-up visits at 1, 6 and 12months after discharge.\nRESULTS. After LAA occlusion, a slight and measureable movement of QRS electric axis was observed in most patients. The significant differences were also observed in heart rate (HR) and the mean-mean QT interval between pre- and post-operation for all patients. There existed no significant difference in echocardiographic parameters between before and after device implantation. However, a larger left atrial (LA) diameter was detected by echocardiography during follow-up visit at 6months when compared with pre-operation parameters. Similarly, aortic root diameter (ARD) was also larger during follow-up at 12months than the baseline dimension in pre-operation.\nQuestion: Does left atrial appendage (LAA) occlusion device alter the echocardiography and electrocardiogram parameters in patients with atrial fibrillation?\nA. yes\nB. no\nC. maybe\nAnswer: ", "continuation": "yes"}
+{"context": "Context: BACKGROUND. Currently the choice of breast cancer therapy is based on prognostic factors. The proliferation marker Ki-67 is used increasingly to determine the method of therapy. The current study analyses the predictive value of Ki-67 in foreseeing breast cancer patients' responses to neoadjuvant chemotherapy.\nMETHODS. This study includes patients with invasive breast cancer treated between 2008 and 2013. The clinical response was assessed by correlating Ki-67 to histological examination, mammography, and ultrasonography findings.\nRESULTS. The average Ki-67 value in our patients collectively (n = 77) is 34.9 ± 24.6%. The average Ki-67 value is the highest with 37.4 ± 24.0% in patients with a pCR. The Ki-67 values do not differ significantly among the 3 groups: pCR versus partial pathological response versus stable disease/progress (P = 0.896). However, Ki-67 values of patients with luminal, Her2 enriched, and basal-like cancers differed significantly from each other. Furthermore, within the group of luminal tumors Ki-67 values of patients with versus without pCR also differed significantly.\nQuestion: Can ki-67 play a role in prediction of breast cancer patients' response to neoadjuvant chemotherapy?\nA. yes\nB. no\nC. maybe\nAnswer: ", "continuation": "yes"}
diff --git a/tests/eval/local_data/triviaqa_small.jsonl b/tests/eval/local_data/triviaqa_small.jsonl
new file mode 100644
index 0000000000..ae5e0783d9
--- /dev/null
+++ b/tests/eval/local_data/triviaqa_small.jsonl
@@ -0,0 +1,4 @@
+{"context": "Who was the man behind The Chipmunks?", "answer": "David Seville", "aliases": ["David Seville"]}
+{"context": "What star sign is Jamie Lee Curtis?", "answer": "Scorpio", "aliases": ["Scorpio", "Skorpio"]}
+{"context": "Which Lloyd Webber musical premiered in the US on 10th December 1993?", "answer": "Sunset Boulevard", "aliases": ["Sunset Blvd", "Sunset Boulevard", "Sunset Bulevard", "West Sunset Boulevard"]}
+{"context": "Who was the next British Prime Minister after Arthur Balfour?", "answer": "Campbell-Bannerman", "aliases": ["Campbell Bannerman", "Campbell-Bannerman", "Henry Campbell Bannerman", "Henry Campbell-Bannerman", "Sir Henry Campbell Bannerman", "Sir Henry Campbell-Bannerman"]}
diff --git a/tests/eval/local_data/winograd_small.jsonl b/tests/eval/local_data/winograd_small.jsonl
new file mode 100644
index 0000000000..8f84cd27e5
--- /dev/null
+++ b/tests/eval/local_data/winograd_small.jsonl
@@ -0,0 +1,4 @@
+{"context_options": ["The city councilmen refused the demonstrators a permit because the city councilmen", "The city councilmen refused the demonstrators a permit because the demonstrators"], "continuation": "feared violence.", "gold": 0}
+{"context_options": ["The city councilmen refused the demonstrators a permit because the city councilmen", "The city councilmen refused the demonstrators a permit because the demonstrators"], "continuation": "advocated violence.", "gold": 1}
+{"context_options": ["The trophy doesn't fit into the brown suitcase because the trophy", "The trophy doesn't fit into the brown suitcase because the suitcase"], "continuation": "is too large.", "gold": 0}
+{"context_options": ["The trophy doesn't fit into the brown suitcase because the trophy", "The trophy doesn't fit into the brown suitcase because the suitcase"], "continuation": "is too small.", "gold": 1}

From 71f77e3b857daf3e4808d1af64b3b53f0ea5496d Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 26 Feb 2024 16:27:40 -0500
Subject: [PATCH 19/59] fix

---
 llmfoundry/eval/datasets/__init__.py          |   7 +-
 .../in_context_learning_evaluation.py         | 145 +++++---
 llmfoundry/eval/datasets/utils.py             |   5 +-
 llmfoundry/eval/metrics/__init__.py           |   6 +-
 llmfoundry/eval/metrics/nlp.py                |  68 ++--
 llmfoundry/models/hf/hf_causal_lm.py          |   4 +-
 .../models/inference_api_wrapper/interface.py |   6 +-
 llmfoundry/models/mpt/modeling_mpt.py         |   4 +-
 llmfoundry/utils/builders.py                  |   1 -
 .../eval/test_in_context_learning_datasets.py | 335 +++++++++++-------
 tests/eval/test_nlp_metrics.py                |  36 +-
 tests/fixtures/models.py                      |  71 ++--
 12 files changed, 433 insertions(+), 255 deletions(-)

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index d9d6686331..794b1d563b 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -8,9 +8,9 @@
 
 from llmfoundry.eval.datasets.in_context_learning_evaluation import (
     InContextLearningCodeEvalDataset, InContextLearningDataset,
+    InContextLearningGenerationWithAnswersTaskDataset,
     InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
-    InContextLearningGenerationWithAnswersTaskDataset, InContextLearningSchemaTaskDataset,
-    get_icl_task_dataloader)
+    InContextLearningSchemaTaskDataset, get_icl_task_dataloader)
 from llmfoundry.eval.datasets.utils import (get_continuation_span,
                                             get_fewshot_sample_idxs,
                                             make_padded_input, strip_data,
@@ -18,7 +18,8 @@
                                             trim_context)
 
 __all__ = [
-    'InContextLearningDataset', 'InContextLearningGenerationWithAnswersTaskDataset',
+    'InContextLearningDataset',
+    'InContextLearningGenerationWithAnswersTaskDataset',
     'InContextLearningLMTaskDataset', 'InContextLearningCodeEvalDataset',
     'InContextLearningMultipleChoiceTaskDataset',
     'InContextLearningSchemaTaskDataset', 'get_icl_task_dataloader',
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index d6b4f4c578..beb1f8e45d 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -11,8 +11,9 @@
 import json
 import os
 import random
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
 
+import torch
 from composer.core import DataSpec
 from composer.core.data_spec import _default_split_batch, _split_list
 from composer.datasets.utils import stop_sequences_criteria
@@ -237,8 +238,9 @@ def read_dataset(
             if hf_parsing_map:
                 dataset_parsing_func = lambda example: {
                     k: ' '.join([str(example[col]) for col in v])
-                    for k, v in hf_parsing_map.items(
-                    )  # pyright: ignore[reportOptionalMemberAccess]
+                    for k, v in hf_parsing_map.
+                    items(  # pyright: ignore[reportOptionalMemberAccess]
+                    )
                 }
                 assert isinstance(dataset, HFDataset)
                 dataset = dataset.map(dataset_parsing_func,
@@ -515,10 +517,11 @@ def split_batch(self, batch: Any,
         return batched_list
 
 
-class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset):
-    """A dataset that constructs batches for in-context learning generation tasks with
-    answers. Generation tasks with evaluate a model's ability to generate responses and
-    score them against a set of gold-standard answers.
+class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset
+                                                       ):
+    """A dataset that constructs batches for in-context learning generation
+    tasks with answers. Generation tasks with evaluate a model's ability to
+    generate responses and score them against a set of gold-standard answers.
 
     The input format is expected to be a jsonl file with the following fields:
     - context: The question
@@ -531,12 +534,13 @@ class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset
         cot_delimiter (str): Delimiter to place between the chain of thought and continuations.
     """
 
-    def __init__(self,
-                 cot_delimiter: str = '',
-                 early_stopping_criteria: Optional[List[str]] = None,
-                 do_normalization: bool = True,
-                 *args,
-                 **kwargs):
+    def __init__(
+            self,
+            cot_delimiter: str = '',
+            early_stopping_criteria: Optional[List[str]] = None,
+            do_normalization: bool = True,
+            *args,  # pyright: ignore
+            **kwargs):  # pyright: ignore
         if kwargs['tokenizer'].eos_token_id is None:
             raise ValueError(
                 '`InContextLearningGenerationWithAnswersTaskDataset` tokenizer must have non-null `eos_token_id`'
@@ -606,7 +610,9 @@ def read_dataset(
         self.padding_size = self.max_seq_len - self.max_answer_length
         return dataset
 
-    def get_answer_from_example(self, example: Dict, in_context=False) -> str:
+    def get_answer_from_example(self,
+                                example: Dict,
+                                in_context: bool = False) -> str:
         """
         Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true.
         Args:
@@ -637,7 +643,7 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
         tokenized_example['aliases'] = list(example.get('aliases', []))
         return tokenized_example
 
-    def _get_max_answer_length(self, dataset) -> int:
+    def _get_max_answer_length(self, dataset: Iterable[dict]) -> int:
         f"""
         Loops over the dataset and finds the longest answer length.
 
@@ -691,7 +697,7 @@ class InContextLearningLMTaskDataset(InContextLearningDataset):
     See InContextLearningDataset for more details.
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs):  # pyright: ignore
         super().__init__(answer_key='continuation',
                          static_keys=['mode'],
                          tensor_keys=[
@@ -738,14 +744,15 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset):
         choices_key (str): The key under which the choices are stored in the saved dataset. Defaults to 'choices'.
     """
 
-    def __init__(self,
-                 choices_key: str = 'choices',
-                 static_keys: Optional[List] = None,
-                 list_of_tensors_keys: Optional[List] = None,
-                 list_of_tuples_keys: Optional[List] = None,
-                 list_of_primitives: Optional[List] = None,
-                 *args,
-                 **kwargs):
+    def __init__(
+            self,
+            choices_key: str = 'choices',
+            static_keys: Optional[List] = None,
+            list_of_tensors_keys: Optional[List] = None,
+            list_of_tuples_keys: Optional[List] = None,
+            list_of_primitives: Optional[List] = None,
+            *args,  # pyright: ignore
+            **kwargs):  # pyright: ignore
         self.choices_key = choices_key
         base_batch = {
             'input_ids': [],
@@ -778,7 +785,9 @@ def __init__(self,
         }
         self.batch_map_per_example = {'gold_indices': 'gold'}
 
-    def get_answer_from_example(self, example: Dict, in_context=False) -> str:
+    def get_answer_from_example(self,
+                                example: Dict,
+                                in_context: bool = False) -> str:
         """
         Returns the correct answer from the example's choices.
         Args:
@@ -886,7 +895,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
         return batch
 
-    def get_num_samples_in_batch(self, batch) -> int:
+    def get_num_samples_in_batch(self, batch: Dict[str, torch.Tensor]) -> int:
         return batch['input_ids'].shape[0] // self.num_choices
 
     def split_batch(self, batch: Any,
@@ -963,7 +972,11 @@ class InContextLearningSchemaTaskDataset(
     - choice_groupings: Indicates which indices of the batch correspond to which questions
     """
 
-    def __init__(self, choices_key='context_options', *args, **kwargs):
+    def __init__(
+            self,
+            choices_key: str = 'context_options',
+            *args,  # pyright: ignore
+            **kwargs):  # pyright: ignore
         static_keys = ['mode']
         tensor_keys = ['input_ids', 'labels', 'attention_mask']
         list_of_tensors_keys = ['continuation_indices']
@@ -984,7 +997,7 @@ def __init__(self, choices_key='context_options', *args, **kwargs):
         }
 
     def construct_context(self,
-                          example,
+                          example: Dict[str, Any],
                           preceding_text: str = '',
                           add_answer: bool = False) -> str:
         """Takes a example and constructs a context with the correct context for
@@ -1120,8 +1133,8 @@ def tokenize_example(self, prompt_and_fewshot: str,
 
 
 class InContextLearningCodeEvalDataset(InContextLearningDataset):
-    """
-    A dataset that constructs batches for in-context learning code evaluation.
+    """A dataset that constructs batches for in-context learning code
+    evaluation.
 
     The input format is expected to be a jsonl file with the following fields:
 
@@ -1162,11 +1175,11 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset):
     """
 
     def __init__(
-        self,
-        generations_per_sample: int,
-        pass_at_k: Union[int, list[int]] = 1,
-        *args,
-        **kwargs,
+            self,
+            generations_per_sample: int,
+            pass_at_k: Union[int, list[int]] = 1,
+            *args,  # pyright: ignore
+            **kwargs,  # pyright: ignore
     ):
         if isinstance(pass_at_k, int):
             pass_at_k = [pass_at_k]
@@ -1226,7 +1239,8 @@ def __init__(
         self.dataset = self.repeat_dataset(self.dataset, generations_per_sample)
         self.base_batch = {
             'input_ids': [],
-            'mode': 'generate',
+            'mode':
+                'generate',
             'labels': [],
             'prompts': [],
             'tests': [],
@@ -1234,8 +1248,11 @@ def __init__(
             'test_inputs': [],
             'test_outputs': [],
             'languages': [],
-            'pass_at_k': pass_at_k,
-            'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length),
+            'pass_at_k':
+                pass_at_k,
+            'generation_length':
+                min(self.max_answer_length,
+                    self.max_seq_len - self.max_prompt_length),
             'generation_kwargs': {
                 'pad_token_id': self.pad_tok_id,
                 'num_beams': 1,  # single beam
@@ -1245,9 +1262,12 @@ def __init__(
                 'eos_token_id': self.tokenizer.eos_token_id,
             },
             'sample_id': [],
-            'pass_at_k': list(pass_at_k),
-            'generations_per_sample': generations_per_sample,
-            'dataset_size': dataset_size,
+            'pass_at_k':
+                list(pass_at_k),
+            'generations_per_sample':
+                generations_per_sample,
+            'dataset_size':
+                dataset_size,
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])
@@ -1260,15 +1280,16 @@ def _repeat_dataset():
                     assert isinstance(sample, dict)
                     yield {'sample_id': i, **sample}
 
-        from datasets import Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
+        from datasets import \
+            Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
 
         repeated_dataset = HFDataset.from_generator(_repeat_dataset)
         assert isinstance(repeated_dataset, HFDataset)
         return repeated_dataset
 
     def _set_max_prompt_and_answer_lengths(self):
-        """
-        Iterates through the dataset and finds the maximum prompt length and sequence lengths
+        """Iterates through the dataset and finds the maximum prompt length and
+        sequence lengths.
 
         Returns:
             None
@@ -1277,10 +1298,15 @@ def _set_max_prompt_and_answer_lengths(self):
         max_answer_length = 0
         for example in self.dataset:
             assert isinstance(example, Dict)
-            unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id]
+            unpadded_example = [
+                token for token in example[self.context_key]
+                if token != self.pad_tok_id
+            ]
             max_prompt_length = max(max_prompt_length, len(unpadded_example))
 
-            tokenized_answer = self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids']
+            tokenized_answer = self.tokenizer(
+                example['canonical_solution'],
+                add_special_tokens=False)['input_ids']
             assert isinstance(tokenized_answer, list)
             len_tokenized_answer = len(tokenized_answer)
             max_answer_length = max(max_answer_length, len_tokenized_answer)
@@ -1289,29 +1315,35 @@ def _set_max_prompt_and_answer_lengths(self):
         self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH
 
     def _trim_padding(self, example: Dict):
-        """
-        Adjusts padding to the maximum prompt length rather than max_seq_len.
-        Needs to be done after the dataset has been processed because we don't know the maximum
-        prompt length until after we've tokenized it.
+        """Adjusts padding to the maximum prompt length rather than max_seq_len.
+        Needs to be done after the dataset has been processed because we don't
+        know the maximum prompt length until after we've tokenized it.
 
         Returns:
             dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key]
         """
         # Remove padding tokens applied during tokenization
-        unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id]
+        unpadded_prompt = [
+            token for token in example[self.context_key]
+            if token != self.pad_tok_id
+        ]
         # Reapply padding only to max_prompt_length
         full_prompt = trim_context(unpadded_prompt, [], self.max_prompt_length)
-        padded_context = make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side)
+        padded_context = make_padded_input(full_prompt, [],
+                                           self.max_prompt_length,
+                                           self.pad_tok_id, self.padding_side)
 
         example[self.context_key] = padded_context
         return example
 
-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]:
-        """
-        Adds extra code task details to the example dictionary.
+    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
+                         example: Dict) -> Dict[str, Any]:
+        """Adds extra code task details to the example dictionary.
+
         See InContextLearningDataset for more details
         """
-        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example)
+        tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt,
+                                                     example)
         tokenized_example['prompt_text'] = example['prompt']
         tokenized_example['task_id'] = example['task_id']
         tokenized_example['canonical_solution'] = example['canonical_solution']
@@ -1323,7 +1355,6 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) ->
         return tokenized_example
 
 
-
 def build_icl_dataloader(
         icl_task_type: str,
         dataset_uri: str,
diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py
index e881abc9d7..6e39ffdb4e 100644
--- a/llmfoundry/eval/datasets/utils.py
+++ b/llmfoundry/eval/datasets/utils.py
@@ -9,12 +9,11 @@
 
 import logging
 import random
-from typing import TYPE_CHECKING, Dict, List, Optional, Set
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
 
 import torch
 
 __all__ = [
-    'add_vision_dataset_transform',
     'MultiTokenEOSCriteria',
 ]
 
@@ -249,7 +248,7 @@ def __init__(
         def __call__(self,
                      input_ids: torch.LongTensor,
                      scores: Optional[torch.FloatTensor] = None,
-                     **kwargs) -> bool:
+                     **kwargs: Dict[str, Any]) -> bool:
             # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
             lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:]
             lookback_tokens_batch = self.tokenizer.batch_decode(
diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py
index 76c301fefa..6457018cbb 100644
--- a/llmfoundry/eval/metrics/__init__.py
+++ b/llmfoundry/eval/metrics/__init__.py
@@ -7,10 +7,10 @@
 """A collection of common torchmetrics."""
 
 from llmfoundry.eval.metrics.nlp import (
-    InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy,
-    InContextLearningLMExpectedCalibrationError,
+    InContextLearningCodeEvalAccuracy, InContextLearningGenerationAccuracy,
+    InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError,
     InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
-    InContextLearningMultipleChoiceAccuracy, InContextLearningGenerationAccuracy)
+    InContextLearningMultipleChoiceAccuracy)
 
 __all__ = [
     'InContextLearningLMAccuracy',
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index 3032ba6e71..da3d2a5d7c 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -12,9 +12,10 @@
 import string
 import warnings
 from typing import Any, Dict, List
-from composer.utils import dist
+
 import numpy as np
 import torch
+from composer.utils import dist
 from composer.utils.eval_client import (EvalClient, LambdaEvalClient,
                                         LocalEvalClient,
                                         MosaicMLLambdaEvalClient)
@@ -28,7 +29,7 @@
     'InContextLearningMetric',
     'InContextLearningLMAccuracy',
     'InContextLearningMultipleChoiceAccuracy',
-    'InContextLearningGenerationWithAnswersTaskDataset',
+    'InContextLearningGenerationAccuracy',
     'InContextLearningCodeEvalAccuracy',
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMCExpectedCalibrationError',
@@ -37,7 +38,7 @@
 
 class InContextLearningMetric(Metric):
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs):  # pyright: ignore
         super().__init__(*args, **kwargs)
         self.needs_batch = True
 
@@ -65,8 +66,7 @@ def update(
 
 
 class InContextLearningGenerationAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) question answering (QA)
-    tasks.
+    r"""Computes accuracy for In-context learning (ICL) generation tasks.
 
     ICL QA tasks consist of some number of example question answering tasks (referred to as the 'context'), followed by a test task where the model must
     match one of the possible answer aliases (referred to as the 'continuation').
@@ -166,8 +166,7 @@ def compute(self):
 
 
 class InContextLearningLMAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) language modeling (LM)
-    tasks.
+    r"""Computes accuracy for In-context learning language modeling tasks.
 
     ICL LM tasks consist of some number of example language modeling tasks (referred to as the 'context'), followed by a test task where the model must correctly predict all the tokens
     following tokens in some passage (referred to as the 'continuation').
@@ -217,8 +216,7 @@ def compute(self):
 
 
 class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) multiple choice (MC)
-    tasks.
+    r"""Computes accuracy for In-context learning multiple choice tasks.
 
     ICL MC tasks consists of a series of questions with some number of possible choices (only one of which can be correct).
     At inference time each possible choice is given to the model as a separate input and the one for which the model assigns
@@ -278,8 +276,9 @@ def compute(self):
 
 
 class InContextLearningExpectedCalibrationError(InContextLearningMetric):
-    """Generic class for Expected Calibration Error (ECE) (cite:
-    https://arxiv.org/pdf/1706.04599.pdf).
+    """Generic class for Expected Calibration Error (ECE).
+
+    Citation: https://arxiv.org/pdf/1706.04599.pdf
 
     Expected calibration error is calculated by dividing predictions into buckets based on the model's confidence (a probability value between 0 and 1).
     We then calculate the accuracy within each bucket and calculate the average gap between confidence and accuracy
@@ -336,6 +335,7 @@ def compute(self):
 class InContextLearningMCExpectedCalibrationError(
         InContextLearningExpectedCalibrationError):
     r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+
     multiple choice (MC) tasks. (source: https://arxiv.org/abs/2012.00955).
 
     For MC tasks, the model confidence is defined as the softmax of average per-token probability assigned to the top question choice.
@@ -382,6 +382,7 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 class InContextLearningLMExpectedCalibrationError(
         InContextLearningExpectedCalibrationError):
     r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+
     language modeling (LM) tasks. (cite: https://arxiv.org/pdf/1706.04599.pdf).
 
     For LM tasks, the model confidence is defined as the minimum probability assigned to all tokens in the continuation.
@@ -415,6 +416,7 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
             self.bucket_totals[
                 bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
 
+
 class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
 
@@ -443,7 +445,9 @@ def __init__(self, dist_sync_on_step: bool = False):
         super().__init__(dist_sync_on_step=dist_sync_on_step)
 
         self._initialized = False
-
+        self.dataset_size = 0
+        self.pass_at_k = []
+        self.num_generations = 0
         self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
         if self.eval_device is not None:
             self.eval_device = self.eval_device.upper()
@@ -469,8 +473,9 @@ def get_client(self) -> EvalClient:
                 'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
                 'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
         else:
-            raise ValueError('Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
-                             f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
+            raise ValueError(
+                'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
+                f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
 
         return client
 
@@ -493,12 +498,17 @@ def _initialize_state(self, batch: dict[str, Any]):
         self.num_generations = batch['generations_per_sample']
 
         # We need to defer the accumulator initialization because it depends on dataset size
-        self.add_state('correct', default=torch.zeros(self.dataset_size, device=device), dist_reduce_fx='sum')
-        self.add_state('total', default=torch.zeros(self.dataset_size, device=device), dist_reduce_fx='sum')
+        self.add_state('correct',
+                       default=torch.zeros(self.dataset_size, device=device),
+                       dist_reduce_fx='sum')
+        self.add_state('total',
+                       default=torch.zeros(self.dataset_size, device=device),
+                       dist_reduce_fx='sum')
         dist.barrier()
         self._initialized = True
 
-    def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
+    def update(self, batch: Dict[str, Any], outputs: List[str],
+               labels: List[str]):
         """Updates the pass@k accuracy of code generation.
 
         Given a batch of prompts, test cases, and code generations, evaluates the code generations
@@ -529,13 +539,16 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]):
         client = self.get_client()
 
         for sample_id, code_gen, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
-                batch['sample_id'], outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'],
+                batch['sample_id'], outputs, batch['prompts'],
+                batch['test_inputs'], batch['test_outputs'],
                 batch['entry_points'], batch['languages']):
 
             idx = sample_id
             self.total[idx] += 1.0
 
-            code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0]  # remove everything after function ends
+            code_gen = re.split(
+                r'\n[A-Za-z0-9#`]',
+                code_gen)[0]  # remove everything after function ends
             final_code = sample_prompt + code_gen  # combine prompt with the code generation
 
             test_results = []
@@ -562,22 +575,27 @@ def compute(self):
         complete = self.total == self.num_generations  # so that eval subset batches can be used
 
         if complete.sum() < (self.total != 0).sum():
-            warnings.warn('Some samples in the dataset have less than the expected number of generations. '
-                          'This is expected if you are using a subset of the dataset for evaluation.')
+            warnings.warn(
+                'Some samples in the dataset have less than the expected number of generations. '
+                'This is expected if you are using a subset of the dataset for evaluation.'
+            )
 
         if (self.correct > self.total).any().item():
             raise ValueError(
-                'Internal error some samples have more correct than  total generations. This should not happen.')
+                'Internal error some samples have more correct than  total generations. This should not happen.'
+            )
 
         results = {}
         n = self.num_generations
 
         for k in self.pass_at_k:
-            pass_at_k = sum([self.estimator(n, int(c.item()), k) for c in self.correct[complete]
-                            ]) / complete.sum().item()
+            pass_at_k = sum([
+                self.estimator(n, int(c.item()), k)
+                for c in self.correct[complete]
+            ]) / complete.sum().item()
             results[f'pass@{k}'] = torch.tensor(pass_at_k)
 
         if len(results) == 1:  # backwards compatibility
             return list(results.values())[0]
 
-        return results
\ No newline at end of file
+        return results
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 868ae23893..756ff4326a 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -17,9 +17,9 @@
                           PreTrainedTokenizerBase)
 
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningGenerationAccuracy,
                                      InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningGenerationAccuracy)
+                                     InContextLearningMultipleChoiceAccuracy)
 from llmfoundry.metrics import TokenAccuracy
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index a5caaa89ad..87bb9a7f70 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -10,10 +10,10 @@
 from torchmetrics import Metric
 from transformers import AutoTokenizer
 
-from llmfoundry.eval.metrics import (InContextLearningLMAccuracy,
+from llmfoundry.eval.metrics import (InContextLearningGenerationAccuracy,
+                                     InContextLearningLMAccuracy,
                                      InContextLearningMetric,
-                                     InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningGenerationAccuracy)
+                                     InContextLearningMultipleChoiceAccuracy)
 
 
 class InferenceAPIEvalWrapper(ComposerModel):
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 53868f8e1a..e6154da939 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -21,9 +21,9 @@
 from composer.utils import dist
 
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningGenerationAccuracy,
                                      InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningGenerationAccuracy)
+                                     InContextLearningMultipleChoiceAccuracy)
 from llmfoundry.metrics import TokenAccuracy
 from llmfoundry.models.layers.attention import (is_flash_v1_installed,
                                                 is_flash_v2_installed)
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index ccbdd1158a..1798bdd9a4 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -503,7 +503,6 @@ def _validate_cfg(icl_cfg: DictConfig):
                 'num_beams is no longer supported as a top level icl_task parameter.'  + \
                 'Please use generation_kwargs.num_beams instead.')
 
-
     for icl_cfg in icl_tasks_list:
         assert isinstance(icl_cfg, DictConfig)
         _validate_cfg(icl_cfg)
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 90fab0810c..049579521d 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -9,6 +9,7 @@
 import random
 import types
 from pathlib import Path
+from typing import Dict, List, Optional
 
 import pytest
 import torch
@@ -19,11 +20,13 @@
 # isort: off
 from llmfoundry.eval.datasets import (
     InContextLearningDataset, InContextLearningCodeEvalDataset,
-    InContextLearningMultipleChoiceTaskDataset, InContextLearningGenerationWithAnswersTaskDataset,
+    InContextLearningMultipleChoiceTaskDataset,
+    InContextLearningGenerationWithAnswersTaskDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,
     tokenizer_needs_prefix_space, trim_context, get_continuation_span,
     get_fewshot_sample_idxs, make_padded_input)
 # isort: on
+import transformers
 from composer.datasets.utils import MultiTokenEOSCriteria
 from composer.loggers import InMemoryLogger
 from composer.models import HuggingFaceModel
@@ -31,9 +34,9 @@
 from composer.utils import dist, reproducibility
 
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningGenerationAccuracy,
                                      InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningGenerationAccuracy)
+                                     InContextLearningMultipleChoiceAccuracy)
 
 
 def test_strip_data():
@@ -52,7 +55,7 @@ def test_strip_data():
 @pytest.mark.skip(
     reason="Currently don't have a tokenizer that satisfies this test")
 def test_tokenizer_needs_prefix_space_when_space_not_needed(
-        tiny_gpt2_tokenizer):
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     assert not tokenizer_needs_prefix_space(tiny_gpt2_tokenizer)
 
 
@@ -99,7 +102,8 @@ def test_get_continuation_span():
 
 
 @pytest.mark.parametrize('padding_side', ['left', 'right', 'middle'])
-def test_make_padding(tiny_gpt2_tokenizer, padding_side):
+def test_make_padding(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                      padding_side: str):
     context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
     padding_id = tiny_gpt2_tokenizer.eos_token_id
 
@@ -121,7 +125,8 @@ def test_make_padding(tiny_gpt2_tokenizer, padding_side):
             assert input_ids[:-48].tolist() == context
 
 
-def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer):
+def test_batch_padding_logic_no_padding(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids']
     context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids']
     max_seq_len = 2048
@@ -137,7 +142,8 @@ def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer):
     assert tiny_gpt2_tokenizer.pad_token_id not in padded_input
 
 
-def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer):
+def test_batch_padding_logic_with_padding(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     continuation = tiny_gpt2_tokenizer(' dog' * 200)['input_ids']
     context = tiny_gpt2_tokenizer(' cat' * 200)['input_ids']
     max_seq_len = 2048
@@ -213,7 +219,8 @@ def test_fewshot_sample_idxs_randomness():
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path):
+def test_update_generation_kwargs(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     tokenizer = tiny_gpt2_tokenizer
     seqlen = 2048
     num_fewshot = 0
@@ -246,7 +253,8 @@ def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path):
     }
 
 
-def test_stop_sequences_criteria(tiny_gpt2_tokenizer):
+def test_stop_sequences_criteria(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     pytest.importorskip('transformers')
     eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
     seq1 = tiny_gpt2_tokenizer('Dogs are furry')['input_ids']
@@ -264,7 +272,8 @@ def test_stop_sequences_criteria(tiny_gpt2_tokenizer):
                         None)  # pyright: ignore[reportGeneralTypeIssues]
 
 
-def test_stop_sequences_criteria_sentencepiece(tiny_llama_tokenizer):
+def test_stop_sequences_criteria_sentencepiece(
+        tiny_llama_tokenizer: transformers.AutoTokenizer):
     pytest.importorskip('datasets')
 
     tokenizer = tiny_llama_tokenizer
@@ -289,7 +298,8 @@ def test_stop_sequences_criteria_sentencepiece(tiny_llama_tokenizer):
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path):
+def test_update_generation_kwargs_no_kwargs(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     tokenizer = tiny_gpt2_tokenizer
     seqlen = 2048
     num_fewshot = 0
@@ -317,7 +327,7 @@ def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path):
     assert not 'generation_kwargs' in dl.base_batch
 
 
-def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path):
+def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path: Path):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
@@ -342,7 +352,7 @@ def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path):
     assert len(dl.base_batch['generation_kwargs']) == 3
 
 
-def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path):
+def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
@@ -372,7 +382,8 @@ def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path):
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_construct_context(tiny_gpt2_tokenizer, tmp_path):
+def test_construct_context(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                           tmp_path: Path):
     tokenizer = tiny_gpt2_tokenizer
     seqlen = 2048
     num_fewshot = 0
@@ -421,7 +432,8 @@ def test_construct_context(tiny_gpt2_tokenizer, tmp_path):
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path):
+def test_get_answer_from_example(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     tokenizer = tiny_gpt2_tokenizer
     seqlen = 2048
     num_fewshot = 0
@@ -456,7 +468,7 @@ def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path):
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_fix_eos_on_preamble(tmp_path):
+def test_fix_eos_on_preamble(tmp_path: Path):
     transformers = pytest.importorskip('transformers')
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m',
@@ -495,7 +507,8 @@ def test_fix_eos_on_preamble(tmp_path):
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path):
+def test_tokenize_example_with_tokenize_labels(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     tokenizer = tiny_gpt2_tokenizer
     seqlen = 2048
     num_fewshot = 0
@@ -539,8 +552,8 @@ def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path):
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer,
-                                                  tmp_path):
+def test_tokenize_example_with_no_tokenize_labels(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     tokenizer = tiny_gpt2_tokenizer
     seqlen = 2048
     num_fewshot = 0
@@ -580,7 +593,7 @@ def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer,
     assert type(tokenized_example['answer']) == str
 
 
-def test_qa_set_cot_no_cot(tmp_path):
+def test_qa_set_cot_no_cot(tmp_path: Path):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
@@ -605,7 +618,7 @@ def test_qa_set_cot_no_cot(tmp_path):
     assert not dl.has_cot
 
 
-def test_qa_set_cot_has_cot(tmp_path):
+def test_qa_set_cot_has_cot(tmp_path: Path):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/gsm8k_small.jsonl'
@@ -630,7 +643,8 @@ def test_qa_set_cot_has_cot(tmp_path):
     assert dl.has_cot
 
 
-def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path):
+def test_qa_get_max_answer_length(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -654,7 +668,8 @@ def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path):
     assert dl.max_answer_length == 7
 
 
-def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer):
+def test_qa_get_answer_from_example_with_no_cot(
+        tmp_path: Path, tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
@@ -682,7 +697,8 @@ def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer):
     assert answer == 'this is the correct answer'
 
 
-def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer):
+def test_qa_get_answer_from_example_with_cot(
+        tmp_path: Path, tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
@@ -711,7 +727,8 @@ def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer):
     assert answer == "Let's think step by step.  ### this is the correct answer"
 
 
-def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+def test_qa_tokenize_example(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                             tmp_path: Path):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
@@ -745,7 +762,8 @@ def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
     ]
 
 
-def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
+def test_code_adjust_padding(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                             tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/human_eval_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -775,7 +793,8 @@ def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path):
         for data in dl.dataset)  # pyright: ignore [reportGeneralTypeIssues]
 
 
-def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path):
+def test_code_update_gen_kwargs(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                                tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/human_eval_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -805,7 +824,8 @@ def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path):
     assert dl.base_batch['generation_kwargs']['do_sample'] == True
 
 
-def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+def test_mc_tokenize_example(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                             tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/mmlu_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -852,7 +872,8 @@ def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
     assert untokenized_inputs == correct_output
 
 
-def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path):
+def test_schema_construct_context(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/winograd_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -882,7 +903,8 @@ def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path):
     assert constructed_context == '\ncont one ### this is a continuation'
 
 
-def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path):
+def test_schema_construct_multiple_contexts(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/winograd_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -914,7 +936,8 @@ def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path):
     assert constructed_contexts == ['\ncont one ###', '\ncont two ###']
 
 
-def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
+def test_schema_tokenize_example(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/winograd_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -929,10 +952,11 @@ def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
         pad_tok_id=tokenizer.eos_token_id,
         num_fewshot=num_fewshot,
         fewshot_random_seed=1,
-        prompt_string=prompt_string,
-        example_delimiter='\n',
+        prompt_string=prompt_string,  # pyright: ignore
+        example_delimiter='\n',  # pyright: ignore
         continuation_delimiter=' ### ',
-        destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
+        destination_path=str(tmp_path /
+                             'test_human_eval_small.jsonl'),  # pyright: ignore
     )
     example = {
         'context_options': ['context one', 'context two'],
@@ -960,8 +984,9 @@ def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path):
 
 
 @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl'])
-def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer,
-                                          tmp_path):
+def test_mc_task_dataloader_subcategories(
+        dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1008,8 +1033,9 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer,
 @pytest.mark.parametrize('dataset_uri', [
     'pubmed_sm.jsonl',
 ])
-def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer,
-                                        tmp_path):
+def test_lm_task_dataloader_extra_space(
+        dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1052,7 +1078,9 @@ def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer,
 @pytest.mark.parametrize('dataset_uri', [
     'lambada_small.jsonl',
 ])
-def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_lm_task_dataloader(dataset_uri: str,
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1092,7 +1120,9 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
 
 
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
-def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_schema_task_dataloader(dataset_uri: str,
+                                tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                                tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1140,7 +1170,8 @@ def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
 
 
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
-def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
+def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri: str,
+                                                    tmp_path: Path):
     pytest.importorskip('datasets')
     transformers = pytest.importorskip('transformers')
 
@@ -1191,8 +1222,9 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path):
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
-                                          num_fewshot, tmp_path):
+def test_lm_task_dataloader_opt_tokenizer(
+        tiny_opt_tokenizer: transformers.AutoTokenizer, dataset_uri: str,
+        num_fewshot: int, tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1235,8 +1267,9 @@ def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
-                                          num_fewshot, tmp_path):
+def test_mc_task_dataloader_opt_tokenizer(
+        tiny_opt_tokenizer: transformers.AutoTokenizer, dataset_uri: str,
+        num_fewshot: int, tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1289,7 +1322,8 @@ def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri,
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_mc_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
+                        dataset_uri: str, num_fewshot: int, tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1358,7 +1392,8 @@ def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
-def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path):
+def test_qa_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
+                        dataset_uri: str, tmp_path: Path):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
@@ -1412,8 +1447,9 @@ def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path):
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0])
 @pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
-def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer,
-                                       tmp_path, num_fewshot, prompt_string):
+def test_qa_task_dataloader_w_null_eos(
+        dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tmp_path: Path, num_fewshot: int, prompt_string: str):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1442,8 +1478,10 @@ def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer,
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['I am a prompt', ''])
-def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
-                            num_fewshot, prompt_string):
+def test_qa_task_dataloader(dataset_uri: str,
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tmp_path: Path, num_fewshot: int,
+                            prompt_string: str):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1500,8 +1538,9 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
-def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
-                                     num_fewshot):
+def test_qa_task_with_cot_dataloader(
+        dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tmp_path: Path, num_fewshot: int):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1559,7 +1598,9 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path,
 
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
-def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_mc_task_dataloader(dataset_uri: str,
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1607,7 +1648,7 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
-def test_code_eval_split_batch(dataset_uri, tmp_path):
+def test_code_eval_split_batch(dataset_uri: str, tmp_path: Path):
     pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
@@ -1653,17 +1694,24 @@ def test_code_eval_split_batch(dataset_uri, tmp_path):
             assert len(batch[field]) == size
             assert all(isinstance(val, type_) for val in batch[field])
 
-    static_keys = {'pass_at_k': (int, list), 'generation_length': int, 'generation_kwargs': dict}
+    static_keys = {
+        'pass_at_k': (int, list),
+        'generation_length': int,
+        'generation_kwargs': dict
+    }
     for batch in batches:
         for field, type_ in static_keys.items():
             assert isinstance(batch[field], type_)
 
+
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample,
-                                        tiny_llama_tokenizer):
+def test_code_eval_sentpiece_dataloader(
+        dataset_uri: str, tmp_path: Path, num_fewshot: int, prompt_string: str,
+        generations_per_sample: int,
+        tiny_llama_tokenizer: transformers.AutoTokenizer):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1684,7 +1732,8 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample)
     assert isinstance(dl, DataSpec)
 
@@ -1706,15 +1755,21 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
         assert batch['generation_length'] == 129
-        has_left_padding.extend([item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
+        has_left_padding.extend(
+            [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left
 
-    decoded_batches = [tokenizer.batch_decode(batch['input_ids']) for batch in batches]
+    decoded_batches = [
+        tokenizer.batch_decode(batch['input_ids']) for batch in batches
+    ]
     for decoded_batch in decoded_batches:
-        assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+        assert all(
+            item.count('Code start: \n') == num_fewshot + 1
+            for item in decoded_batch)
 
         if len(prompt_string) > 0:
-            assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
+            assert all(
+                item.count('Please code:\n') == 1 for item in decoded_batch)
 
     labels = [
         '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
@@ -1736,8 +1791,10 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom
             b, n = divmod(k, batch_size)
             assert batches[b]['labels'][n] == labels[i]
             assert decoded_batches[b][n].endswith(samples[i])
+
+
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
-def test_code_eval_test_cases(dataset_uri, tmp_path):
+def test_code_eval_test_cases(dataset_uri: str, tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1790,7 +1847,7 @@ def test_code_eval_test_cases(dataset_uri, tmp_path):
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
-def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
+def test_code_eval_pass_at_k_validity(dataset_uri: str, tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -1823,13 +1880,16 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path):
 @pytest.mark.parametrize('num_fewshot', [0, 2])
 @pytest.mark.parametrize('prompt_string', ['Please code:\n', ''])
 @pytest.mark.parametrize('generations_per_sample', [1, 3])
-def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample):
+def test_code_eval_task_dataloader(dataset_uri: str, tmp_path: Path,
+                                   num_fewshot: int, prompt_string: str,
+                                   generations_per_sample: int):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 2048
@@ -1845,7 +1905,8 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=generations_per_sample,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1870,15 +1931,21 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
         assert batch['generation_length'] == 122
-        has_left_padding.extend([item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
+        has_left_padding.extend(
+            [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left
 
-    decoded_batches = [tokenizer.batch_decode(batch['input_ids']) for batch in batches]
+    decoded_batches = [
+        tokenizer.batch_decode(batch['input_ids']) for batch in batches
+    ]
     for decoded_batch in decoded_batches:
-        assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch)
+        assert all(
+            item.count('Code start: \n') == num_fewshot + 1
+            for item in decoded_batch)
 
         if len(prompt_string) > 0:
-            assert all(item.count('Please code:\n') == 1 for item in decoded_batch)
+            assert all(
+                item.count('Please code:\n') == 1 for item in decoded_batch)
 
     labels = [
         '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
@@ -1900,14 +1967,16 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st
             b, n = divmod(k, batch_size)
             assert batches[b]['labels'][n] == labels[i]
             assert decoded_batches[b][n].endswith(samples[i])
+
+
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path):
+def test_eval_split_batch(mpt_tokenizer: transformers.AutoTokenizer,
+                          dataset_uri: str, num_fewshot: int, tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
-    transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = mpt_tokenizer  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 4
     seqlen = 512
@@ -1923,7 +1992,8 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path
                                  example_delimiter='\n',
                                  continuation_delimiter='',
                                  question_prelimiter='Code start: \n',
-                                 destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'),
+                                 destination_path=str(
+                                     tmp_path / f'icl_{num_fewshot}.jsonl'),
                                  generations_per_sample=1,
                                  generation_kwargs={
                                      'temperature': .9,
@@ -1951,12 +2021,14 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path
         assert microbatch['generation_kwargs']['use_cache'] == True
         assert microbatch['generation_kwargs']['eos_token_id'] == 0
 
+
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer,
-                            tmp_path):
+def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tmp_path: Path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2003,8 +2075,10 @@ def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer,
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
-                                tmp_path, tiny_gpt2_model):
+def test_schema_task_evaluation(
+        num_fewshot: int, dataset_uri: str,
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path,
+        tiny_gpt2_model: transformers.AutoModelForCausalLM):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2058,9 +2132,10 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot,
-                                          tiny_gpt2_model, tiny_gpt2_tokenizer,
-                                          tmp_path):
+def test_mc_task_evaluation_subcategories(
+        dataset_uri: str, num_fewshot: int,
+        tiny_gpt2_model: transformers.AutoModelForCausalLM,
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2120,8 +2195,10 @@ def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot,
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
-                            tmp_path, tiny_gpt2_model):
+def test_mc_task_evaluation(num_fewshot: int, dataset_uri: str,
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tmp_path: Path,
+                            tiny_gpt2_model: transformers.AutoModelForCausalLM):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2184,8 +2261,10 @@ def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model,
-                                          num_fewshot, dataset_uri, tmp_path):
+def test_qa_task_evaluation_opt_tokenizer(
+        tiny_opt_tokenizer: transformers.AutoTokenizer,
+        tiny_opt_model: transformers.AutoModelForCausalLM, num_fewshot: int,
+        dataset_uri: str, tmp_path: Path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2226,7 +2305,8 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model,
     assert 'metrics/triviaqa/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item() == 0
+        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item(
+        ) == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [5])
@@ -2237,9 +2317,10 @@ def test_qa_task_evaluation_opt_tokenizer(tiny_opt_tokenizer, tiny_opt_model,
     r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
 )
 @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning')
-def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer,
-                                                   tiny_opt_model, num_fewshot,
-                                                   dataset_uri, tmp_path):
+def test_qa_task_evaluation_with_cot_opt_tokenizer(
+        tiny_opt_tokenizer: transformers.AutoTokenizer,
+        tiny_opt_model: transformers.AutoModelForCausalLM, num_fewshot: int,
+        dataset_uri: str, tmp_path: Path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2280,8 +2361,8 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer,
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     assert 'metrics/gsm8k/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningGenerationAccuracy'][
-        0][1].item() == 0
+    assert in_memory_logger.data[
+        'metrics/gsm8k/InContextLearningGenerationAccuracy'][0][1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
@@ -2291,8 +2372,10 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(tiny_opt_tokenizer,
 @pytest.mark.filterwarnings(
     r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
 )
-def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
-                            tiny_gpt2_model, tmp_path):
+def test_qa_task_evaluation(num_fewshot: int, dataset_uri: str,
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tiny_gpt2_model: transformers.AutoModelForCausalLM,
+                            tmp_path: Path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2333,7 +2416,8 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
     assert 'metrics/triviaqa/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item() == 0
+        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item(
+        ) == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
@@ -2343,9 +2427,10 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer,
 )
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri,
-                                     tiny_gpt2_tokenizer, tiny_gpt2_model,
-                                     tmp_path):
+def test_qa_task_with_cot_evaluation(
+        num_fewshot: int, dataset_uri: str,
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tiny_gpt2_model: transformers.AutoModelForCausalLM, tmp_path: Path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2386,11 +2471,11 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri,
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
     assert 'metrics/gsm8k/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
     )
-    assert in_memory_logger.data['metrics/gsm8k/InContextLearningGenerationAccuracy'][
-        0][1].item() == 0
+    assert in_memory_logger.data[
+        'metrics/gsm8k/InContextLearningGenerationAccuracy'][0][1].item() == 0
 
 
-def test_code_eval_requires_envvar(monkeypatch):
+def test_code_eval_requires_envvar(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.delenv('CODE_EVAL_DEVICE', raising=False)
     with pytest.raises(
             ValueError,
@@ -2398,7 +2483,7 @@ def test_code_eval_requires_envvar(monkeypatch):
         InContextLearningCodeEvalAccuracy().get_client()
 
 
-def test_code_eval_requires_valid_envvar(monkeypatch):
+def test_code_eval_requires_valid_envvar(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'bigchungus')
     with pytest.raises(
             ValueError,
@@ -2414,9 +2499,11 @@ def test_code_eval_requires_valid_envvar(monkeypatch):
 @pytest.mark.filterwarnings(
     r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
 )
-def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer,
-                                 tiny_opt_model, num_fewshot, dataset_uri,
-                                 tmp_path, generations_per_sample):
+def test_code_eval_microbatching(
+        monkeypatch: pytest.MonkeyPatch,
+        tiny_opt_tokenizer: transformers.AutoTokenizer,
+        tiny_opt_model: transformers.AutoModelForCausalLM, num_fewshot: int,
+        dataset_uri: str, tmp_path: Path, generations_per_sample: int):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
     in_memory_logger = InMemoryLogger(
@@ -2472,9 +2559,11 @@ def test_code_eval_microbatching(monkeypatch, tiny_opt_tokenizer,
 @pytest.mark.filterwarnings(
     r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
 )
-def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri,
-                                        tiny_t5_tokenizer, tiny_t5_model,
-                                        tmp_path, generations_per_sample):
+def test_code_eval_sentpiece_evaluation(
+        monkeypatch: pytest.MonkeyPatch, num_fewshot: int, dataset_uri: str,
+        tiny_t5_tokenizer: transformers.AutoTokenizer,
+        tiny_t5_model: transformers.AutoModelForCausalLM, tmp_path: Path,
+        generations_per_sample: int):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
     in_memory_logger = InMemoryLogger(
@@ -2529,9 +2618,11 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri,
 @pytest.mark.filterwarnings(
     r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning'
 )
-def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri,
-                                   tiny_gpt2_tokenizer, tiny_gpt2_model,
-                                   tmp_path, generations_per_sample):
+def test_code_eval_task_evaluation(
+        monkeypatch: pytest.MonkeyPatch, num_fewshot: int, dataset_uri: str,
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tiny_gpt2_model: transformers.AutoModelForCausalLM, tmp_path: Path,
+        generations_per_sample: int):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
     in_memory_logger = InMemoryLogger(
@@ -2578,7 +2669,9 @@ def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri,
 
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
-def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
+def test_lm_spacing_dataloader(dataset_uri: str,
+                               tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                               tmp_path: Path):
     pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2634,9 +2727,12 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer,
-                                      tmp_path, num_fewshot, prompt_string,
-                                      hf_loading_vars, hf_parsing_map):
+def test_hf_dataloading_lm_dataloader(
+        dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tmp_path: Path, num_fewshot: int, prompt_string: str,
+        hf_loading_vars: Dict[str,
+                              str], hf_parsing_map: Optional[Dict[str,
+                                                                  List[str]]]):
     pytest.importorskip('datasets')
 
     tokenizer = tiny_gpt2_tokenizer
@@ -2697,9 +2793,10 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer,
 @pytest.mark.filterwarnings(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
-def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer,
-                                       tmp_path, num_fewshot, prompt_string,
-                                       hf_loading_vars, hf_parsing_map):
+def test_hf_dataloading_custom_parsing(
+        dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tmp_path: Path, num_fewshot: int, prompt_string: str,
+        hf_loading_vars: Dict[str, str], hf_parsing_map: Dict[str, List[str]]):
     pytest.importorskip('datasets')
 
     tokenizer = tiny_gpt2_tokenizer
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 01e1eb84c6..29840719d7 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -4,15 +4,20 @@
 # Copyright 2022 MosaicML Composer authors
 # SPDX-License-Identifier: Apache-2.0
 
-import torch
+from typing import Any, List
+
 import pytest
+import torch
+import transformers
+
 from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
+                                     InContextLearningGenerationAccuracy,
                                      InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy,
-                                     InContextLearningGenerationAccuracy)
+                                     InContextLearningMultipleChoiceAccuracy)
 
 
-def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer):
+def test_in_context_learning_lm_accuracy(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     contexts = ['The dog is', 'I love to eat', 'I hate', 'The weather is']
     continuations = [' furry', ' pie', ' long lines', ' snowy']
     pad = tiny_gpt2_tokenizer.pad_token_id
@@ -79,7 +84,8 @@ def test_in_context_learning_qa_cot_accuracy():
     assert metric.compute() == (2 / 4)
 
 
-def test_in_context_learning_code_eval_accuracy(monkeypatch):
+def test_in_context_learning_code_eval_accuracy(
+        monkeypatch: pytest.MonkeyPatch):
     outputs = [
         '    return 1 if n <= 1 else fib(n - 1) + fib(n - 1)',  # incorrect
         '   if n <= 1:\n        return 1\n    return fib(n-1) + fib(n-2)',  # incorrect spacing
@@ -89,22 +95,28 @@ def test_in_context_learning_code_eval_accuracy(monkeypatch):
         '    return n + 1'
     ]  # correct
     labels = []
-    prompts = ['def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n']
+    prompts = [
+        'def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n'
+    ]
     entry_points = ['fib', 'multiply_by_two', 'add_one']
-    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)']]
+    test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'],
+                   ['(1,)', '(2,)', '(4,)']]
     test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']]
     sample_ids = [0, 1, 2]
     languages = ['python', 'python', 'python']
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
     generations_per_sample = 2
 
-    def repeat(values):
+    def repeat(values: List[Any]):
         return [val for val in values for _ in range(generations_per_sample)]
 
     transformers = pytest.importorskip('transformers')
-    tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     tokenizer.pad_token = tokenizer.eos_token
-    input_ids = tokenizer.batch_encode_plus(repeat(prompts), return_tensors='pt', padding=True)['input_ids']
+    input_ids = tokenizer.batch_encode_plus(repeat(prompts),
+                                            return_tensors='pt',
+                                            padding=True)['input_ids']
     batch = {
         # This tests deterministic beam search rather than sampling
         'input_ids': input_ids,
@@ -131,7 +143,9 @@ def repeat(values):
     # mean: 0.5
     assert metric.compute() == 0.5
 
-def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer):
+
+def test_in_context_learning_mc_accuracy(
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer):
     contexts = [
         'Q: How do you cook a cake?', 'Q: How do you cook a cake?',
         'Q: How old is the earth?', 'Q: How old is the earth?'
diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py
index 449fdbf5bc..4f52e3f681 100644
--- a/tests/fixtures/models.py
+++ b/tests/fixtures/models.py
@@ -72,7 +72,7 @@ def build(**kwargs: Any) -> ComposerHFCausalLM:
     return build
 
 
-def tiny_gpt2_model_helper(config):
+def tiny_gpt2_model_helper(config):  # type: ignore
     transformers = pytest.importorskip('transformers')
 
     return transformers.AutoModelForCausalLM.from_config(config)
@@ -108,6 +108,21 @@ def tiny_gpt2_tokenizer_helper():
     return hf_tokenizer
 
 
+@pytest.fixture
+def tiny_gpt2_model(_session_tiny_gpt2_model):  # type: ignore
+    return copy.deepcopy(_session_tiny_gpt2_model)
+
+
+@pytest.fixture(scope='session')
+def _session_tiny_gpt2_tokenizer():  # type: ignore
+    return tiny_gpt2_tokenizer_helper()
+
+
+@pytest.fixture
+def tiny_gpt2_tokenizer(_session_tiny_gpt2_tokenizer):  # type: ignore
+    return copy.deepcopy(_session_tiny_gpt2_tokenizer)
+
+
 def tiny_llama_tokenizer_helper():
     transformers = pytest.importorskip('transformers')
 
@@ -116,16 +131,16 @@ def tiny_llama_tokenizer_helper():
     return hf_tokenizer
 
 
-@pytest.fixture(scope='session')
-def _session_tiny_gpt2_tokenizer():  # type: ignore
-    return tiny_gpt2_tokenizer_helper()
-
-
 @pytest.fixture(scope='session')
 def _session_tiny_llama_tokenizer():  # type: ignore
     return tiny_llama_tokenizer_helper()
 
 
+@pytest.fixture
+def tiny_llama_tokenizer(_session_tiny_llama_tokenizer):  # type: ignore
+    return copy.deepcopy(_session_tiny_llama_tokenizer)
+
+
 def tiny_opt_tokenizer_helper():
     transformers = pytest.importorskip('transformers')
 
@@ -135,41 +150,45 @@ def tiny_opt_tokenizer_helper():
     return hf_tokenizer
 
 
-@pytest.fixture(scope='session')
-def _session_tiny_opt_tokenizer():  # type: ignore
-    return tiny_opt_tokenizer_helper()
+def tiny_opt_model_helper(config):  # type: ignore
+    transformers = pytest.importorskip('transformers')
 
+    return transformers.AutoModelForCausalLM.from_config(config)
 
-@pytest.fixture
-def tiny_gpt2_config(_session_tiny_gpt2_config):
-    return copy.deepcopy(_session_tiny_gpt2_config)
 
+@pytest.fixture(scope='session')
+def _session_tiny_opt_tokenizer():  # type: ignore
+    return tiny_opt_tokenizer_helper()
 
-@pytest.fixture
-def tiny_gpt2_tokenizer(_session_tiny_gpt2_tokenizer):
-    return copy.deepcopy(_session_tiny_gpt2_tokenizer)
 
+@pytest.fixture(scope='session')
+def _session_tiny_opt_config():  # type: ignore
+    return tiny_opt_config_helper()
 
-@pytest.fixture
-def tiny_llama_tokenizer(_session_tiny_llama_tokenizer):
-    return copy.deepcopy(_session_tiny_llama_tokenizer)
 
+@pytest.fixture(scope='session')
+def _session_tiny_opt_model(_session_tiny_opt_config):  # type: ignore
+    return tiny_opt_model_helper(_session_tiny_opt_config)
 
-@pytest.fixture
-def tiny_gpt2_model(_session_tiny_gpt2_model):
-    return copy.deepcopy(_session_tiny_gpt2_model)
 
+def tiny_opt_config_helper():
+    transformers = pytest.importorskip('transformers')
 
-@pytest.fixture
-def tiny_opt_config(_session_tiny_opt_config):
-    return copy.deepcopy(_session_tiny_opt_config)
+    tiny_overrides = {
+        'n_embd': 2,
+        'n_head': 2,
+        'n_layer': 2,
+        'vocab_size': 50272
+    }
+    return transformers.AutoConfig.from_pretrained('facebook/opt-125m',
+                                                   **tiny_overrides)
 
 
 @pytest.fixture
-def tiny_opt_tokenizer(_session_tiny_opt_tokenizer):
+def tiny_opt_tokenizer(_session_tiny_opt_tokenizer):  # type: ignore
     return copy.deepcopy(_session_tiny_opt_tokenizer)
 
 
 @pytest.fixture
-def tiny_opt_model(_session_tiny_opt_model):
+def tiny_opt_model(_session_tiny_opt_model):  # type: ignore
     return copy.deepcopy(_session_tiny_opt_model)

From cb3725bc782cc66202571202f2f196fc4bf45493 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 27 Feb 2024 12:36:49 -0500
Subject: [PATCH 20/59] wip

---
 .../in_context_learning_evaluation.py         | 96 ++++++++++++-------
 llmfoundry/eval/datasets/utils.py             | 26 +++--
 llmfoundry/eval/metrics/__init__.py           |  7 +-
 llmfoundry/eval/metrics/nlp.py                |  5 +-
 llmfoundry/models/hf/hf_causal_lm.py          | 10 +-
 .../models/inference_api_wrapper/interface.py |  9 +-
 llmfoundry/models/mpt/modeling_mpt.py         | 10 +-
 llmfoundry/utils/builders.py                  |  4 +-
 scripts/eval/README.md                        |  4 +-
 .../eval/test_in_context_learning_datasets.py | 66 +++++++------
 tests/eval/test_nlp_metrics.py                | 12 +--
 11 files changed, 146 insertions(+), 103 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index beb1f8e45d..da4e8194b2 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -46,7 +46,8 @@
 
 
 class InContextLearningDataset(Dataset):
-    """A base dataset that constructs batches for in-context learning task
+    r"""A base dataset that constructs batches for in-context learning task.
+
     evaluations. The dataset format is expected to be a local jsonl file, a
     cloud link to a jsonl file, or a Hugging Face dataset link. 'context' refers
     to the input a model will recieve before generating an output. For example,
@@ -194,14 +195,15 @@ def get_num_samples_in_batch(self, batch: Dict) -> int:
         return batch['input_ids'].shape[0]
 
     def update_generation_kwargs(self, generation_kwargs: Dict) -> None:
-        """Updates self.base_batch with the passed in generation_kwargs. This
-        must be run after self.base_batch is set (for example, if
+        r"""Updates self.base_batch with the passed in generation_kwargs.
+
+        This must be run after self.base_batch is set (for example, if
         self.base_batch is set after __init__() is run, likely because
         base_batch needs a class variable like self.pad_tok_id or
         self.max_answer_length).
 
         Args:
-            dict: Keyword arguments that be written into base_batch['generation_kwargs']
+            generation_kwargs (Dict): Keyword arguments that be written into base_batch['generation_kwargs']
         """
         if generation_kwargs:
             if 'generation_kwargs' not in self.base_batch:
@@ -302,7 +304,8 @@ def construct_context(self,
                           example: Dict,
                           preceding_text: str = '',
                           add_answer: bool = False) -> str:
-        """Takes an example and constructs a context, i.e. the input the model
+        """Takes an example and constructs a context, i.e. the input the model.
+
         reads for this example. Optionally adds the correct answer (for fewshot
         examples) and handles example delimiters.
 
@@ -341,7 +344,8 @@ def get_answer_from_example(self,
         return cont
 
     def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]:
-        """If the input_ids is empty then input_ids will be a 0-length List
+        """If the input_ids is empty then input_ids will be a 0-length List.
+
         unless the tokenizer adds special tokens to empty strings (e.g. OPT
         tokenizer). If there is an EOS token added, we need to remove it so it
         is not in the middle of the prompt, as the specific eval question's
@@ -433,7 +437,8 @@ def _prep_example(
         prompt_string: str,
         fewshot_rng: random.Random,
     ) -> Dict[str, Any]:
-        """Prepares a single example from a HF Dataset into tokenized format
+        """Prepares a single example from a HF Dataset into tokenized format.
+
         with prompt and fewshot examples.
 
         Each task consists of a context and a continuation as well as an optional prompt and optional list of
@@ -459,7 +464,8 @@ def _prep_example(
         return tokenized_example
 
     def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """The function that the dataloader uses to accumulate data into
+        """The function that the dataloader uses to accumulate data into.
+
         batches.
 
         Args:
@@ -482,7 +488,8 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
 
     def split_batch(self, batch: Any,
                     microbatch_size: int) -> List[Dict[str, Any]]:
-        """Handling for certain specialty columns that must be split into
+        """Handling for certain specialty columns that must be split into.
+
         batches in different formats.
 
         Args:
@@ -519,7 +526,8 @@ def split_batch(self, batch: Any,
 
 class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset
                                                        ):
-    """A dataset that constructs batches for in-context learning generation
+    """A dataset that constructs batches for in-context learning generation.
+
     tasks with answers. Generation tasks with evaluate a model's ability to
     generate responses and score them against a set of gold-standard answers.
 
@@ -613,8 +621,10 @@ def read_dataset(
     def get_answer_from_example(self,
                                 example: Dict,
                                 in_context: bool = False) -> str:
-        """
-        Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true.
+        """Returns the answer from the example. Applies chain of thought if.
+
+        self.has_cot is marked as true.
+
         Args:
             example (Dict): The example from which to retrieve the answer
 
@@ -628,8 +638,8 @@ def get_answer_from_example(self,
 
     def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
                          example: Dict) -> Dict[str, Any]:
-        """
-        Run text through the tokenizer and handle special cases.
+        """Run text through the tokenizer and handle special cases.
+
         Args:
             prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
             ctx (str): The specific example's derrived context
@@ -644,11 +654,10 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
         return tokenized_example
 
     def _get_max_answer_length(self, dataset: Iterable[dict]) -> int:
-        f"""
-        Loops over the dataset and finds the longest answer length.
+        """Loops over the dataset and finds the longest answer length.
 
         Returns:
-            int: The maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present
+            int: The maximum answer length with an additional buffer of 10 if chain of thought is present
         """
         max_answer_length = 0
         for example in dataset:
@@ -686,7 +695,8 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
 
 
 class InContextLearningLMTaskDataset(InContextLearningDataset):
-    """A dataset that constructs batches for in-context learning language
+    """A dataset that constructs batches for in-context learning language.
+
     modeling evaluation. Language modeling tasks test a model's ability to
     properly predict tokens based on preceding tokens.
 
@@ -720,7 +730,8 @@ def __init__(self, *args, **kwargs):  # pyright: ignore
 
 
 class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset):
-    """A dataset that construct batches for in-context learning multiple choice
+    """A dataset that construct batches for in-context learning multiple choice.
+
     evaluation.
 
     If each question has N answer choices, we construct N distinct inputs per question. In order to ensure
@@ -788,8 +799,8 @@ def __init__(
     def get_answer_from_example(self,
                                 example: Dict,
                                 in_context: bool = False) -> str:
-        """
-        Returns the correct answer from the example's choices.
+        """Returns the correct answer from the example's choices.
+
         Args:
             example (Dict): The example from which to retrieve the answer
 
@@ -802,8 +813,8 @@ def get_answer_from_example(self,
 
     def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
                          example: Dict) -> Dict[str, Any]:
-        """
-        Runs text through the tokenizer and handle special cases.
+        """Runs text through the tokenizer and handle special cases.
+
         Args:
             prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context
             ctx (str): The specific example's derrived context
@@ -862,7 +873,8 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str,
         return tokenized_example
 
     def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """The function that the dataloader uses to accumulate data into
+        """The function that the dataloader uses to accumulate data into.
+
         batches. We run each distinct query + answer choice through the model
         separately and determine which answer has the lowest per-token-
         perplexity.
@@ -900,7 +912,8 @@ def get_num_samples_in_batch(self, batch: Dict[str, torch.Tensor]) -> int:
 
     def split_batch(self, batch: Any,
                     microbatch_size: int) -> List[Dict[str, Any]]:
-        """Split batch while ensuring all continuations are in the same
+        """Split batch while ensuring all continuations are in the same.
+
         microbatch.
 
         In ICL Multiple Choice, we duplicate each data point for each possible continuation.
@@ -951,7 +964,8 @@ def split_batch(self, batch: Any,
 
 class InContextLearningSchemaTaskDataset(
         InContextLearningMultipleChoiceTaskDataset):
-    """A dataset that constructs batches for in-context learning schema
+    """A dataset that constructs batches for in-context learning schema.
+
     evaluation. A schema task involves sentences with a fill-in-the-blank where
     the user needs to choose the correct word to fill in from a set of N
     options. We use the partial evaluation technique from
@@ -1000,7 +1014,10 @@ def construct_context(self,
                           example: Dict[str, Any],
                           preceding_text: str = '',
                           add_answer: bool = False) -> str:
-        """Takes a example and constructs a context with the correct context for
+        """Takes a example and constructs a context with the correct context.
+
+        for.
+
         the example's continuation.
 
         Args:
@@ -1023,8 +1040,9 @@ def construct_context(self,
     def _construct_multiple_contexts(self,
                                      example: Dict,
                                      preceding_text: str = '') -> List[str]:
-        """Takes a example and constructs all contexts. Optionally, appends this
-        to preceeding text (such as a prompt or fewshot examples).
+        """Takes a example and constructs all contexts.
+
+        Optionally, appends this to preceeding text (such as a prompt or fewshot examples).
 
         Args:
             example (Dict): The example from which to construct the context
@@ -1053,7 +1071,8 @@ def _prep_example(
         prompt_string: str,
         fewshot_rng: random.Random,
     ) -> Dict[str, Any]:
-        """Prepares a single example from a HF Dataset into tokenized format
+        """Prepares a single example from a HF Dataset into tokenized format.
+
         with prompt and fewshot examples.
 
         Each task consists of multiple contexts and a single, correct continuation. Will preprend fewshot examples and
@@ -1133,7 +1152,8 @@ def tokenize_example(self, prompt_and_fewshot: str,
 
 
 class InContextLearningCodeEvalDataset(InContextLearningDataset):
-    """A dataset that constructs batches for in-context learning code
+    """A dataset that constructs batches for in-context learning code.
+
     evaluation.
 
     The input format is expected to be a jsonl file with the following fields:
@@ -1288,7 +1308,8 @@ def _repeat_dataset():
         return repeated_dataset
 
     def _set_max_prompt_and_answer_lengths(self):
-        """Iterates through the dataset and finds the maximum prompt length and
+        """Iterates through the dataset and finds the maximum prompt length and.
+
         sequence lengths.
 
         Returns:
@@ -1316,6 +1337,7 @@ def _set_max_prompt_and_answer_lengths(self):
 
     def _trim_padding(self, example: Dict):
         """Adjusts padding to the maximum prompt length rather than max_seq_len.
+
         Needs to be done after the dataset has been processed because we don't
         know the maximum prompt length until after we've tokenized it.
 
@@ -1377,7 +1399,8 @@ def build_icl_dataloader(
         generation_kwargs: Dict,
         early_stopping_criteria: Optional[List[str]] = None,
         do_normalization: bool = True) -> DataSpec:
-    """Factory method that builds the specific dataset for the specified
+    """Factory method that builds the specific dataset for the specified.
+
     icl_task_type. See documentation for `get_icl_task_dataloader` for arugment
     documentation.
 
@@ -1513,7 +1536,8 @@ def build_icl_dataloader(
 def partition_dataset_by_category(dataset_uri: str, destination_path: str,
                                   hf_loading_vars: Dict,
                                   hf_parsing_map: Dict) -> Dict[str, str]:
-    """If has_categories is enabled, we partition the dataset into a separate
+    """If has_categories is enabled, we partition the dataset into a separate.
+
     dataset for each category value in the data and write each partition to a
     local file.
 
@@ -1524,6 +1548,7 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str,
     Raises:
         MissingConditionalImportError: If datasets not installed raise exception.
         Exception: If 'category' key missing from dataset, raise exception.
+
     Returns:
         Dict[str, str]: Mapping of category names to partitioned dataset local files names.
     """
@@ -1612,7 +1637,8 @@ def get_icl_task_dataloader(
         generation_kwargs: Optional[Dict] = None,
         early_stopping_criteria: Optional[List[str]] = None,
         do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]:
-    """This constructs a dataloader (or dataloaders if has_categories is True)
+    r"""Constructs a dataloader (or dataloaders if has_categories is True)
+
     capable of evaluating LLMs on in-context learning language modeling tasks,
     for example LAMBADA. An example usage is below:
 
diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py
index 6e39ffdb4e..cc0acdab6d 100644
--- a/llmfoundry/eval/datasets/utils.py
+++ b/llmfoundry/eval/datasets/utils.py
@@ -24,7 +24,8 @@
 
 
 def strip_data(example: Dict) -> Dict:
-    """Remove white space from the begging and end of string values in a
+    """Remove white space from the begging and end of string values in a.
+
     dictionary.
 
     Args:
@@ -41,6 +42,7 @@ def strip_data(example: Dict) -> Dict:
 def tokenizer_needs_prefix_space(
         tokenizer: transformers.PreTrainedTokenizerBase) -> bool:
     """Test for whether a prefix space is needed before the continuation.
+
     Sentencepiece tokenization should not have a prefix space, but gpt2 style
     BPE should.
 
@@ -57,7 +59,8 @@ def tokenizer_needs_prefix_space(
 
 def trim_context(context_enc: List, continuation_enc: List,
                  max_seq_len: int) -> List:
-    """Trims a list of tokens down to `max_seq_len` if the length of the list
+    """Trims a list of tokens down to `max_seq_len` if the length of the list.
+
     plus the continuation is more than `max_seq_len`. It will always trim tokens
     from the left, i.e. tokens at the beginning of the context will be removed.
 
@@ -84,7 +87,10 @@ def trim_context(context_enc: List, continuation_enc: List,
 
 def get_continuation_span(context_enc: List,
                           continuation_enc: List) -> torch.Tensor:
-    """Gets the list of indices of the continuation tokens for language modeling
+    """Gets the list of indices of the continuation tokens for language.
+
+    modeling.
+
     or generation tasks.
 
     Args:
@@ -104,7 +110,8 @@ def make_padded_input(context_enc: List,
                       max_seq_len: int,
                       pad_tok_id: int,
                       padding_side: str = 'right') -> torch.Tensor:
-    """Takes an encoded context and continuation and clips the beginning of the
+    """Takes an encoded context and continuation and clips the beginning of the.
+
     context if they're too long. Adds the padding token to the specified side.
 
     Args:
@@ -118,7 +125,6 @@ def make_padded_input(context_enc: List,
         input (torch.tensor): The padded and encoded context
         continuation_span (torch.tensor): The _inclusive_ range of indices corresponding to the continuation
     """
-
     inp = torch.tensor(
         (context_enc + continuation_enc),
         dtype=torch.long,
@@ -158,7 +164,8 @@ def make_padded_input(context_enc: List,
 
 def convert_tokens_to_tensors(batch: Dict,
                               tokenize_labels: bool) -> Dict[str, Any]:
-    """HF Datasets converts tensors into lists when we store them, and we don't
+    """HF Datasets converts tensors into lists when we store them, and we don't.
+
     want to use `type='torch'` because some content in the dataset, like
     generation args or single ints, should not be converted.
 
@@ -182,9 +189,10 @@ def convert_tokens_to_tensors(batch: Dict,
 
 def get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
                             example_idx: int, rng: random.Random) -> Set[int]:
-    """
-    Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset,
-    then we will have fewer than num_fewshot examples in context.
+    """Samples indices without replacement. If num_fewshot exceeds the number.
+
+    of unique examples in the dataset, then we will have fewer than num_fewshot examples in context.
+
     Args:
         dataset_size (int): Length of the dataset
         num_fewshot (int): Number of examples to prepend
diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py
index 6457018cbb..5401fb87d4 100644
--- a/llmfoundry/eval/metrics/__init__.py
+++ b/llmfoundry/eval/metrics/__init__.py
@@ -7,15 +7,16 @@
 """A collection of common torchmetrics."""
 
 from llmfoundry.eval.metrics.nlp import (
-    InContextLearningCodeEvalAccuracy, InContextLearningGenerationAccuracy,
-    InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError,
+    InContextLearningCodeEvalAccuracy,
+    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
+    InContextLearningLMExpectedCalibrationError,
     InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
     InContextLearningMultipleChoiceAccuracy)
 
 __all__ = [
     'InContextLearningLMAccuracy',
     'InContextLearningMultipleChoiceAccuracy',
-    'InContextLearningGenerationAccuracy',
+    'InContextLearningGenerationExactMatchAccuracy',
     'InContextLearningMCExpectedCalibrationError',
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMetric',
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index da3d2a5d7c..ae74f4b595 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -29,7 +29,7 @@
     'InContextLearningMetric',
     'InContextLearningLMAccuracy',
     'InContextLearningMultipleChoiceAccuracy',
-    'InContextLearningGenerationAccuracy',
+    'InContextLearningGenerationExactMatchAccuracy',
     'InContextLearningCodeEvalAccuracy',
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMCExpectedCalibrationError',
@@ -65,7 +65,7 @@ def update(
         raise NotImplementedError
 
 
-class InContextLearningGenerationAccuracy(InContextLearningMetric):
+class InContextLearningGenerationExactMatchAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) generation tasks.
 
     ICL QA tasks consist of some number of example question answering tasks (referred to as the 'context'), followed by a test task where the model must
@@ -577,6 +577,7 @@ def compute(self):
         if complete.sum() < (self.total != 0).sum():
             warnings.warn(
                 'Some samples in the dataset have less than the expected number of generations. '
+                +
                 'This is expected if you are using a subset of the dataset for evaluation.'
             )
 
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 756ff4326a..d3a68c9b06 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -16,10 +16,10 @@
 from transformers import (AutoConfig, AutoModelForCausalLM, PreTrainedModel,
                           PreTrainedTokenizerBase)
 
-from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
-                                     InContextLearningGenerationAccuracy,
-                                     InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy)
+from llmfoundry.eval.metrics import (
+    InContextLearningCodeEvalAccuracy,
+    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy)
 from llmfoundry.metrics import TokenAccuracy
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
@@ -120,7 +120,7 @@ def __init__(self, om_model_config: DictConfig,
             TokenAccuracy(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningGenerationAccuracy(),
+            InContextLearningGenerationExactMatchAccuracy(),
             InContextLearningCodeEvalAccuracy()
         ]
         if not om_model_config.get('use_train_metrics', True):
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index 87bb9a7f70..f8e57249e9 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -10,10 +10,9 @@
 from torchmetrics import Metric
 from transformers import AutoTokenizer
 
-from llmfoundry.eval.metrics import (InContextLearningGenerationAccuracy,
-                                     InContextLearningLMAccuracy,
-                                     InContextLearningMetric,
-                                     InContextLearningMultipleChoiceAccuracy)
+from llmfoundry.eval.metrics import (
+    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMetric, InContextLearningMultipleChoiceAccuracy)
 
 
 class InferenceAPIEvalWrapper(ComposerModel):
@@ -27,7 +26,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer):
             LanguagePerplexity(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningGenerationAccuracy()
+            InContextLearningGenerationExactMatchAccuracy()
         ]
         self.eval_metrics = {
             metric.__class__.__name__: metric for metric in eval_metrics
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index e6154da939..73c4aa1d0f 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -20,10 +20,10 @@
 from composer.models import HuggingFaceModel
 from composer.utils import dist
 
-from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
-                                     InContextLearningGenerationAccuracy,
-                                     InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy)
+from llmfoundry.eval.metrics import (
+    InContextLearningCodeEvalAccuracy,
+    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy)
 from llmfoundry.metrics import TokenAccuracy
 from llmfoundry.models.layers.attention import (is_flash_v1_installed,
                                                 is_flash_v2_installed)
@@ -1056,7 +1056,7 @@ def __init__(
             TokenAccuracy(),
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
-            InContextLearningGenerationAccuracy(),
+            InContextLearningGenerationExactMatchAccuracy(),
             InContextLearningCodeEvalAccuracy(),
         ]
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 1798bdd9a4..85449fc871 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -474,7 +474,9 @@ def _validate_cfg(icl_cfg: DictConfig):
                     'InContextLearningMultipleChoiceAccuracy'
                 ]
             elif icl_cfg.icl_task_type == 'generation_task_with_answers':
-                icl_cfg.metric_names = ['InContextLearningGenerationAccuracy']
+                icl_cfg.metric_names = [
+                    'InContextLearningGenerationExactMatchAccuracy'
+                ]
             elif icl_cfg.icl_task_type == 'code_evaluation':
                 icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
             else:
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index 9c65f6a3ef..3ed648da1b 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -165,7 +165,7 @@ Composer currently supports five ICL formats:
 
 ### InContextLearningGenerationWithAnswersTaskDataset
 
-The ICL generation with answers task supports free response generation evaluation using the model’s generate function. A generation dataset consists of a list of JSONs containing a prompt (under the key `context`), a correct answer (under the key `answer`), and a list of alternative answers that would be considered permissible (under the key `aliases`). The generation task works with the NLP metric: [InContextLearningGenerationAccuracy](TODO) which assigns a model's output to be "correct" if, conditioned on the context, the model's generate method produces a string that is a normalized prefix for either the `answer` or any of the `aliases`.
+The ICL generation with answers task supports free response generation evaluation using the model’s generate function. A generation dataset consists of a list of JSONs containing a prompt (under the key `context`), a correct answer (under the key `answer`), and a list of alternative answers that would be considered permissible (under the key `aliases`). The generation task works with the NLP metric: [InContextLearningGenerationExactMatchAccuracy](TODO) which assigns a model's output to be "correct" if, conditioned on the context, the model's generate method produces a string that is a normalized prefix for either the `answer` or any of the `aliases`.
 
 Required keys for each datum:
 * `context`: str
@@ -205,7 +205,7 @@ Below is a complete YAML section that works with the TriviaQA dataset in [`scrip
     batch_size: 4
     icl_task_type: generation_task_with_answers
     metric_names:
-    - InContextLearningGenerationAccuracy
+    - InContextLearningGenerationExactMatchAccuracy
     prompt_string: '' # this goes at the beginning of each input
     example_delimiter: "\n" # this goes between fewshot examples
     continuation_delimiter: ' ' # this separates questions from answers
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 049579521d..bf511db2e9 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -33,10 +33,10 @@
 from composer.trainer import Trainer
 from composer.utils import dist, reproducibility
 
-from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
-                                     InContextLearningGenerationAccuracy,
-                                     InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy)
+from llmfoundry.eval.metrics import (
+    InContextLearningCodeEvalAccuracy,
+    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy)
 
 
 def test_strip_data():
@@ -2289,24 +2289,25 @@ def test_qa_task_evaluation_opt_tokenizer(
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='triviaqa',
-                          dataloader=dl,
-                          metric_names=['InContextLearningGenerationAccuracy'])
+    evaluator = Evaluator(
+        label='triviaqa',
+        dataloader=dl,
+        metric_names=['InContextLearningGenerationExactMatchAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
-        eval_metrics=[InContextLearningGenerationAccuracy()],
+        eval_metrics=[InContextLearningGenerationExactMatchAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/triviaqa/InContextLearningGenerationExactMatchAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item(
-        ) == 0
+        'metrics/triviaqa/InContextLearningGenerationExactMatchAccuracy'][0][
+            1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [5])
@@ -2346,23 +2347,25 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='gsm8k',
-                          dataloader=dl,
-                          metric_names=['InContextLearningGenerationAccuracy'])
+    evaluator = Evaluator(
+        label='gsm8k',
+        dataloader=dl,
+        metric_names=['InContextLearningGenerationExactMatchAccuracy'])
     model = HuggingFaceModel(
         model=tiny_opt_model,
         tokenizer=tokenizer,
-        eval_metrics=[InContextLearningGenerationAccuracy()],
+        eval_metrics=[InContextLearningGenerationExactMatchAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/gsm8k/InContextLearningGenerationExactMatchAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/gsm8k/InContextLearningGenerationAccuracy'][0][1].item() == 0
+        'metrics/gsm8k/InContextLearningGenerationExactMatchAccuracy'][0][
+            1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
@@ -2399,25 +2402,26 @@ def test_qa_task_evaluation(num_fewshot: int, dataset_uri: str,
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='triviaqa',
-                          dataloader=dl,
-                          metric_names=['InContextLearningGenerationAccuracy'])
+    evaluator = Evaluator(
+        label='triviaqa',
+        dataloader=dl,
+        metric_names=['InContextLearningGenerationExactMatchAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
         tokenizer=tiny_gpt2_tokenizer,
-        eval_metrics=[InContextLearningGenerationAccuracy()],
+        eval_metrics=[InContextLearningGenerationExactMatchAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/triviaqa/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/triviaqa/InContextLearningGenerationExactMatchAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/triviaqa/InContextLearningGenerationAccuracy'][0][1].item(
-        ) == 0
+        'metrics/triviaqa/InContextLearningGenerationExactMatchAccuracy'][0][
+            1].item() == 0
 
 
 @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl'])
@@ -2455,24 +2459,26 @@ def test_qa_task_with_cot_evaluation(
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
     )
 
-    evaluator = Evaluator(label='gsm8k',
-                          dataloader=dl,
-                          metric_names=['InContextLearningGenerationAccuracy'])
+    evaluator = Evaluator(
+        label='gsm8k',
+        dataloader=dl,
+        metric_names=['InContextLearningGenerationExactMatchAccuracy'])
 
     model = HuggingFaceModel(
         model=tiny_gpt2_model,
         tokenizer=tiny_gpt2_tokenizer,
-        eval_metrics=[InContextLearningGenerationAccuracy()],
+        eval_metrics=[InContextLearningGenerationExactMatchAccuracy()],
         use_logits=True,
     )
 
     trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger)
 
     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/gsm8k/InContextLearningGenerationAccuracy' in in_memory_logger.data.keys(
+    assert 'metrics/gsm8k/InContextLearningGenerationExactMatchAccuracy' in in_memory_logger.data.keys(
     )
     assert in_memory_logger.data[
-        'metrics/gsm8k/InContextLearningGenerationAccuracy'][0][1].item() == 0
+        'metrics/gsm8k/InContextLearningGenerationExactMatchAccuracy'][0][
+            1].item() == 0
 
 
 def test_code_eval_requires_envvar(monkeypatch: pytest.MonkeyPatch):
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 29840719d7..24a2078795 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -10,10 +10,10 @@
 import torch
 import transformers
 
-from llmfoundry.eval.metrics import (InContextLearningCodeEvalAccuracy,
-                                     InContextLearningGenerationAccuracy,
-                                     InContextLearningLMAccuracy,
-                                     InContextLearningMultipleChoiceAccuracy)
+from llmfoundry.eval.metrics import (
+    InContextLearningCodeEvalAccuracy,
+    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
+    InContextLearningMultipleChoiceAccuracy)
 
 
 def test_in_context_learning_lm_accuracy(
@@ -58,7 +58,7 @@ def test_in_context_learning_qa_accuracy():
     ]
     labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']]
     batch = {'cot_delimiter': '', 'labels': labels}
-    metric = InContextLearningGenerationAccuracy()
+    metric = InContextLearningGenerationExactMatchAccuracy()
     metric.update(batch, outputs, labels)
 
     assert metric.compute() == (2 / 3)
@@ -78,7 +78,7 @@ def test_in_context_learning_qa_cot_accuracy():
         'do_normalization': True,
         'stopping_criteria': '\n\n'
     }
-    metric = InContextLearningGenerationAccuracy()
+    metric = InContextLearningGenerationExactMatchAccuracy()
     metric.update(batch, outputs, labels)
 
     assert metric.compute() == (2 / 4)

From 5135152656b5063bd569de6800471c09f4614b20 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 27 Feb 2024 13:05:47 -0500
Subject: [PATCH 21/59] update readme

---
 .../in_context_learning_evaluation.py         |   8 +-
 llmfoundry/eval/metrics/nlp.py                | 294 +++++++++---------
 scripts/eval/README.md                        |  24 +-
 3 files changed, 164 insertions(+), 162 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index da4e8194b2..8c579d0e9d 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -528,7 +528,7 @@ class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset
                                                        ):
     """A dataset that constructs batches for in-context learning generation.
 
-    tasks with answers. Generation tasks with evaluate a model's ability to
+    tasks with answers. Generation tasks evaluate a model's ability to
     generate responses and score them against a set of gold-standard answers.
 
     The input format is expected to be a jsonl file with the following fields:
@@ -540,6 +540,8 @@ class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset
 
     Additional Args:
         cot_delimiter (str): Delimiter to place between the chain of thought and continuations.
+        early_stopping_criteria (Optional[List[str]]): Optional strings to trigger early stopping.
+        do_normalization (bool): Flag indicating whether to normalize generations before providing output.
     """
 
     def __init__(
@@ -1712,8 +1714,8 @@ def get_icl_task_dataloader(
                                                   keyword args in this fucntion (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
                                                   for more details)
         early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time.
-            Used in QA tasks with CoT
-        do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningGenerationWithAnswersTaskDataset. Only used in QA tasks.
+            Used in generation tasks with CoT
+        do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningGenerationWithAnswersTaskDataset. Only used in generation tasks.
 
     Returns:
         DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index ae74f4b595..38523aef52 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -66,17 +66,17 @@ def update(
 
 
 class InContextLearningGenerationExactMatchAccuracy(InContextLearningMetric):
-    r"""Computes accuracy for In-context learning (ICL) generation tasks.
+    r"""Computes exact match for in-context learning generation tasks.
 
-    ICL QA tasks consist of some number of example question answering tasks (referred to as the 'context'), followed by a test task where the model must
-    match one of the possible answer aliases (referred to as the 'continuation').
+    ICL generation tasks consist of some number of prompted generation tasks with correct answers
+    followed by a test task where the model must correctly produce one of a number of valid answers.
 
     For example, the model may be provided the context below and evaluated on its ability to correctly predict the continuation.
 
     Context: `Question: Who was president of the United States in 2012?\nAnswer: Barack Obama\nQuestion: Is water wet?\nAnswer: `
-    Continuation: [`yes`, `no`]
+    Answers: [`yes`]
 
-    Both predictions and answers will be normalized before comparison.
+    The model will be expected to correctly produce one of the answers, following some optional normalization.
 
     Adds metric state variables:
         correct (float): The number of instances where the prediction was a prefix for any of the answer aliases.
@@ -275,148 +275,6 @@ def compute(self):
         return self.correct.float() / self.total
 
 
-class InContextLearningExpectedCalibrationError(InContextLearningMetric):
-    """Generic class for Expected Calibration Error (ECE).
-
-    Citation: https://arxiv.org/pdf/1706.04599.pdf
-
-    Expected calibration error is calculated by dividing predictions into buckets based on the model's confidence (a probability value between 0 and 1).
-    We then calculate the accuracy within each bucket and calculate the average gap between confidence and accuracy
-    across buckets, weighted by the number of samples in each bucket.
-
-    Each task must implement its own definition of "confidence" to be computed via the `update` method.
-
-    Adds metric state variables:
-    bucket_totals (float): The number of instances where the prediction masked the target per bucket.
-    bucket_correct (float): The number of total instances that were predicted per bucket.
-
-    Args:
-        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
-            each forward() before returning the value at the step. Default: ``False``.
-        n_buckets (int): Number of distinct buckets to split the confidence distribution into
-    """
-
-    def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
-        # state from multiple processes
-        super().__init__(dist_sync_on_step=dist_sync_on_step)
-        self.n_buckets = n_buckets
-        if n_buckets < 1:
-            raise Exception('`n_buckets`')
-        self.add_state('bucket_totals',
-                       default=torch.zeros(n_buckets),
-                       dist_reduce_fx='sum')
-        self.add_state('bucket_correct',
-                       default=torch.zeros(n_buckets),
-                       dist_reduce_fx='sum')
-
-    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
-        pass
-
-    def compute(self):
-        assert isinstance(self.bucket_correct, Tensor)
-        assert isinstance(self.bucket_totals, Tensor)
-
-        result = torch.tensor(0.0, device=self.bucket_correct.device)
-        total_obs = torch.sum(self.bucket_totals)
-        for i in range(self.n_buckets):
-            if self.bucket_totals[i] == 0:
-                continue
-
-            acc_bucket_i = self.bucket_correct[i] / self.bucket_totals[i]
-            upper_bound = (i + 1) / self.n_buckets
-            lower_bound = i / self.n_buckets
-            conf_bucket_i = torch.tensor((upper_bound + lower_bound) / 2,
-                                         device=self.bucket_correct.device)
-            result += (self.bucket_totals[i] /
-                       total_obs) * torch.abs(acc_bucket_i - conf_bucket_i)
-        return result
-
-
-class InContextLearningMCExpectedCalibrationError(
-        InContextLearningExpectedCalibrationError):
-    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
-
-    multiple choice (MC) tasks. (source: https://arxiv.org/abs/2012.00955).
-
-    For MC tasks, the model confidence is defined as the softmax of average per-token probability assigned to the top question choice.
-
-    See `InContextLearningExpectedCalibrationError` for more info.
-    """
-
-    # Make torchmetrics call update only once
-    full_state_update = False
-
-    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
-
-        outputs = torch.softmax(outputs, dim=2)
-        probabilites = []
-        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            cont_tok_logits = outputs[batch_idx].index_select(dim=0,
-                                                              index=cont_idx -
-                                                              1)
-            cont_tok_targ = labels[batch_idx].index_select(dim=0,
-                                                           index=cont_idx - 1)
-            probability = cont_tok_logits.index_select(
-                dim=1, index=cont_tok_targ).diagonal().mean()
-            probabilites.append(probability)
-
-        for (start, end), gold_idx in zip(batch['choice_groupings'],
-                                          batch['gold_indices']):
-            subset = probabilites[start:end]
-            idx_max = subset.index(max(subset))
-            confidence = torch.tensor(subset).max() / torch.tensor(subset).sum()
-
-            assert confidence >= 0.0 and confidence <= 1.0
-            bucket_idx = int(confidence * self.n_buckets)
-            if bucket_idx == self.n_buckets:
-                bucket_idx -= 1
-
-            if idx_max == gold_idx:
-                self.bucket_correct[
-                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
-
-            self.bucket_totals[
-                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
-
-
-class InContextLearningLMExpectedCalibrationError(
-        InContextLearningExpectedCalibrationError):
-    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
-
-    language modeling (LM) tasks. (cite: https://arxiv.org/pdf/1706.04599.pdf).
-
-    For LM tasks, the model confidence is defined as the minimum probability assigned to all tokens in the continuation.
-
-    See `InContextLearningExpectedCalibrationError` for more info.
-    """
-
-    # Make torchmetrics call update only once
-    full_state_update = False
-
-    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
-
-        outputs = torch.softmax(outputs, dim=2)
-        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
-            cont_tok_logits = outputs[batch_idx].index_select(dim=0,
-                                                              index=cont_idx -
-                                                              1)
-            cont_tok_pred = cont_tok_logits.argmax(dim=-1)
-            confidence = cont_tok_logits.max(dim=-1).values.min()
-            cont_tok_targ = labels[batch_idx].index_select(dim=0,
-                                                           index=cont_idx - 1)
-            assert confidence >= 0.0 and confidence <= 1.0
-            bucket_idx = int(confidence * self.n_buckets)
-            if bucket_idx == self.n_buckets:
-                bucket_idx -= 1
-
-            if (cont_tok_pred == cont_tok_targ).all():
-                self.bucket_correct[
-                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
-
-            self.bucket_totals[
-                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
-
-
 class InContextLearningCodeEvalAccuracy(InContextLearningMetric):
     r"""Computes accuracy for In-context learning (ICL) code evaluation tasks.
 
@@ -600,3 +458,145 @@ def compute(self):
             return list(results.values())[0]
 
         return results
+
+
+class InContextLearningExpectedCalibrationError(InContextLearningMetric):
+    """Generic class for Expected Calibration Error (ECE).
+
+    Citation: https://arxiv.org/pdf/1706.04599.pdf
+
+    Expected calibration error is calculated by dividing predictions into buckets based on the model's confidence (a probability value between 0 and 1).
+    We then calculate the accuracy within each bucket and calculate the average gap between confidence and accuracy
+    across buckets, weighted by the number of samples in each bucket.
+
+    Each task must implement its own definition of "confidence" to be computed via the `update` method.
+
+    Adds metric state variables:
+    bucket_totals (float): The number of instances where the prediction masked the target per bucket.
+    bucket_correct (float): The number of total instances that were predicted per bucket.
+
+    Args:
+        dist_sync_on_step (bool, optional): Synchronize metric state across processes at
+            each forward() before returning the value at the step. Default: ``False``.
+        n_buckets (int): Number of distinct buckets to split the confidence distribution into
+    """
+
+    def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10):
+        # state from multiple processes
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+        self.n_buckets = n_buckets
+        if n_buckets < 1:
+            raise Exception('`n_buckets`')
+        self.add_state('bucket_totals',
+                       default=torch.zeros(n_buckets),
+                       dist_reduce_fx='sum')
+        self.add_state('bucket_correct',
+                       default=torch.zeros(n_buckets),
+                       dist_reduce_fx='sum')
+
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
+        pass
+
+    def compute(self):
+        assert isinstance(self.bucket_correct, Tensor)
+        assert isinstance(self.bucket_totals, Tensor)
+
+        result = torch.tensor(0.0, device=self.bucket_correct.device)
+        total_obs = torch.sum(self.bucket_totals)
+        for i in range(self.n_buckets):
+            if self.bucket_totals[i] == 0:
+                continue
+
+            acc_bucket_i = self.bucket_correct[i] / self.bucket_totals[i]
+            upper_bound = (i + 1) / self.n_buckets
+            lower_bound = i / self.n_buckets
+            conf_bucket_i = torch.tensor((upper_bound + lower_bound) / 2,
+                                         device=self.bucket_correct.device)
+            result += (self.bucket_totals[i] /
+                       total_obs) * torch.abs(acc_bucket_i - conf_bucket_i)
+        return result
+
+
+class InContextLearningMCExpectedCalibrationError(
+        InContextLearningExpectedCalibrationError):
+    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+
+    multiple choice (MC) tasks. (source: https://arxiv.org/abs/2012.00955).
+
+    For MC tasks, the model confidence is defined as the softmax of average per-token probability assigned to the top question choice.
+
+    See `InContextLearningExpectedCalibrationError` for more info.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
+
+        outputs = torch.softmax(outputs, dim=2)
+        probabilites = []
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            cont_tok_logits = outputs[batch_idx].index_select(dim=0,
+                                                              index=cont_idx -
+                                                              1)
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            probability = cont_tok_logits.index_select(
+                dim=1, index=cont_tok_targ).diagonal().mean()
+            probabilites.append(probability)
+
+        for (start, end), gold_idx in zip(batch['choice_groupings'],
+                                          batch['gold_indices']):
+            subset = probabilites[start:end]
+            idx_max = subset.index(max(subset))
+            confidence = torch.tensor(subset).max() / torch.tensor(subset).sum()
+
+            assert confidence >= 0.0 and confidence <= 1.0
+            bucket_idx = int(confidence * self.n_buckets)
+            if bucket_idx == self.n_buckets:
+                bucket_idx -= 1
+
+            if idx_max == gold_idx:
+                self.bucket_correct[
+                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+            self.bucket_totals[
+                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+
+class InContextLearningLMExpectedCalibrationError(
+        InContextLearningExpectedCalibrationError):
+    r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL)
+
+    language modeling (LM) tasks. (cite: https://arxiv.org/pdf/1706.04599.pdf).
+
+    For LM tasks, the model confidence is defined as the minimum probability assigned to all tokens in the continuation.
+
+    See `InContextLearningExpectedCalibrationError` for more info.
+    """
+
+    # Make torchmetrics call update only once
+    full_state_update = False
+
+    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
+
+        outputs = torch.softmax(outputs, dim=2)
+        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
+            cont_tok_logits = outputs[batch_idx].index_select(dim=0,
+                                                              index=cont_idx -
+                                                              1)
+            cont_tok_pred = cont_tok_logits.argmax(dim=-1)
+            confidence = cont_tok_logits.max(dim=-1).values.min()
+            cont_tok_targ = labels[batch_idx].index_select(dim=0,
+                                                           index=cont_idx - 1)
+            assert confidence >= 0.0 and confidence <= 1.0
+            bucket_idx = int(confidence * self.n_buckets)
+            if bucket_idx == self.n_buckets:
+                bucket_idx -= 1
+
+            if (cont_tok_pred == cont_tok_targ).all():
+                self.bucket_correct[
+                    bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
+
+            self.bucket_totals[
+                bucket_idx] += 1  # pyright: ignore [reportGeneralTypeIssues]
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index 3ed648da1b..bfc8fbde58 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -145,7 +145,8 @@ You can use the default `icl_tasks` and `eval_gauntlet` configs or specify your
 
 ICL evaluation measures a model’s ability to solve novel problems by being provided examples in-context without ever being specifically trained to answer such questions.
 
-Composer supports a number of different standard ICL formats and allows users to upload their own datasets that correspond to those formats.
+We supports a number of different standard ICL formats and allows users to upload their own datasets that correspond to those formats. All of our ICL task types are implemented in `llm-foundry/llmfoundry/eval/datasets/in_context_learning_evaluation.py` while all of our ICL
+metrics are implemented in `llm-foundry/llmfoundry/eval/metrics/nlp.py`. You can see which metrics work with which task types in the `llmfoundry.utils.builders.build_icl_evaluators` helper function.
 
 This document explains the ICL formats compatible with [Composer](https://github.com/mosaicml/composer), summarizes how to add new datasets in those formats, and catalogs the datasets currently used by the research team to evaluate models.
 
@@ -153,19 +154,19 @@ This document explains the ICL formats compatible with [Composer](https://github
 
 ## Supported ICL formats
 
-Composer currently supports five ICL formats:
+llm-foundry currently supports five ICL formats:
 
-1. [InContextLearningGenerationWithAnswersTaskDataset](TODO)
-2. [InContextLearningLMTaskDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L293)
-3. [InContextLearningMultipleChoiceTaskDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L444)
-4. [InContextLearningSchemaTaskDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L676)
-5. [InContextLearningCodeEvalDataset](https://github.com/mosaicml/composer/blob/336bf8db3e2c09ae942d4bf8a819935106589d1a/composer/datasets/in_context_learning_evaluation.py#L852)
+1. InContextLearningGenerationWithAnswersTaskDataset
+2. InContextLearningLMTaskDataset
+3. InContextLearningMultipleChoiceTaskDataset
+4. InContextLearningSchemaTaskDataset
+5. InContextLearningCodeEvalDataset
 
 ----
 
 ### InContextLearningGenerationWithAnswersTaskDataset
 
-The ICL generation with answers task supports free response generation evaluation using the model’s generate function. A generation dataset consists of a list of JSONs containing a prompt (under the key `context`), a correct answer (under the key `answer`), and a list of alternative answers that would be considered permissible (under the key `aliases`). The generation task works with the NLP metric: [InContextLearningGenerationExactMatchAccuracy](TODO) which assigns a model's output to be "correct" if, conditioned on the context, the model's generate method produces a string that is a normalized prefix for either the `answer` or any of the `aliases`.
+The ICL generation with answers task supports free response generation evaluation using the model’s generate function. A generation dataset consists of a list of JSONs containing a prompt (under the key `context`), a correct answer (under the key `answer`), and a list of alternative answers that would be considered permissible (under the key `aliases`). The generation task works with the NLP metric: InContextLearningGenerationExactMatchAccuracy which assigns a model's output to be "correct" if, conditioned on the context, the model's generate method produces a string that is a normalized prefix for either the `answer` or any of the `aliases`.
 
 Required keys for each datum:
 * `context`: str
@@ -215,7 +216,7 @@ Below is a complete YAML section that works with the TriviaQA dataset in [`scrip
 
 ### InContextLearningLMTaskDataset
 
-The ICL language modeling (LM) task assesses the model’s ability to predict a precise sequence of tokens (called a continuation) following some context using the model’s `forward` function. An LM dataset consists of a list of JSONs containing a context (under the key `context`) and a continuation (under the key `continuation` that the model must correctly predict conditioned on the context. The LM task uses the NLP metric [InContextLearningLMAccuracy](https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.metrics.InContextLearningLMAccuracy.html), which assigns a model's output to be "correct" if, conditioned on the context tokens, the model's argmax output logits exactly match the tokens in the continuation.
+The ICL language modeling (LM) task assesses the model’s ability to predict a precise sequence of tokens (called a continuation) following some context using the model’s `forward` function. An LM dataset consists of a list of JSONs containing a context (under the key `context`) and a continuation (under the key `continuation` that the model must correctly predict conditioned on the context. The LM task uses the NLP metric InContextLearningLMAccuracy, which assigns a model's output to be "correct" if, conditioned on the context tokens, the model's argmax output logits exactly match the tokens in the continuation.
 
 Required keys for each datum:
 * `context`: str
@@ -256,7 +257,7 @@ Below is a YAML section that works with the Lambada OpenAI dataset in [`scripts/
 
 ### InContextLearningMultipleChoiceTaskDataset
 
-The ICL multiple choice (MC) task assesses the model’s ability to answer multiple choice questions by assigning highest per token probability to the correct answer. An MC dataset consists of a list of JSONs containing a query (under the key `query`), a list of choices (under the key `choices`), and the index indicating the correct answer (under the key `gold`). The MC task works with the NLP metric [InContextLearningMultipleChoiceAccuracy](https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.metrics.InContextLearningMultipleChoiceAccuracy.html), which separately runs the model's `forward()` method on the query prepended to each choice, and then determines the model to be correct if the correct choice has the lowest per token perplexity conditioned on the query.
+The ICL multiple choice (MC) task assesses the model’s ability to answer multiple choice questions by assigning highest per token probability to the correct answer. An MC dataset consists of a list of JSONs containing a query (under the key `query`), a list of choices (under the key `choices`), and the index indicating the correct answer (under the key `gold`). The MC task works with the NLP metric InContextLearningMultipleChoiceAccuracy, which separately runs the model's `forward()` method on the query prepended to each choice, and then determines the model to be correct if the correct choice has the lowest per token perplexity conditioned on the query.
 
 Required keys for each datum:
 * `query`: str
@@ -294,7 +295,6 @@ Below is a YAML section that works with the HellaSwag dataset in [`scripts/eval/
     icl_task_type: multiple_choice
     metric_names:
     - InContextLearningMultipleChoiceAccuracy
-    - InContextLearningMCExpectedCalibrationError
     prompt_string: '' # this goes at the beginning of each input
     example_delimiter: "\n" # this goes between fewshot examples
     continuation_delimiter: ' ' # this separates questions from answers
@@ -306,7 +306,7 @@ Below is a YAML section that works with the HellaSwag dataset in [`scripts/eval/
 
 The ICL schema task assesses the model’s ability to determine which of some set of possible contexts (under the key `context_options`) makes a sequence of tokens (under the key `continuation`) most likely, with the correct context indicated by "gold". This task is based on [A Simple Method for Commonsense Reasoning](https://arxiv.org/abs/1806.02847).
 
-The schema task works with the NLP metric [InContextLearningMultipleChoiceAccuracy](https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.metrics.InContextLearningMultipleChoiceAccuracy.html), which separately runs the model's `forward()` method on each context option prepended to the continuation and rates the model correct if it assigns minimum per token perplexity to the continuation conditioned on the true context.
+The schema task works with the NLP metric InContextLearningMultipleChoiceAccuracy, which separately runs the model's `forward()` method on each context option prepended to the continuation and rates the model correct if it assigns minimum per token perplexity to the continuation conditioned on the true context.
 
 Required keys for each datum:
 * query: str

From c6162dd24932b6baf0386d0468a71164f90090d7 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 27 Feb 2024 13:17:24 -0500
Subject: [PATCH 22/59] final pyright

---
 llmfoundry/eval/metrics/nlp.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index 38523aef52..8d6ae1cad6 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -316,8 +316,9 @@ def get_client(self) -> EvalClient:
         if self.eval_device == 'LOCAL':
             warnings.warn(
                 'Running code eval locally may be insecure. Please set environment variable CODE_EVAL_DEVICE '
+                +
                 'to LAMBDA to run on remote. To use Lambdas, spin up your instance that checks code, set the URL as '
-                'CODE_EVAL_URL and the API key as CODE_EVAL_APIKEY.')
+                + 'CODE_EVAL_URL and the API key as CODE_EVAL_APIKEY.')
             log.debug('Running code eval locally.')
             client = LocalEvalClient()
         elif self.eval_device == 'LAMBDA':
@@ -327,13 +328,15 @@ def get_client(self) -> EvalClient:
         elif self.eval_device is None:
             raise ValueError(
                 'Attempting to use InContextLearningCodeEvalAccuracy but environment '
+                +
                 'variable `CODE_EVAL_DEVICE` is not set. Please set it to `CODE_EVAL_DEVICE` '
-                'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ',
-                'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
+                +
+                'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda '
+                + 'evaluation), or `MOSAICML` (for lambda eval through MAPI).')
         else:
             raise ValueError(
                 'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, '
-                f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
+                + f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.')
 
         return client
 

From f1b334d79f10ef0673b6986e27704a567a312e8e Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 27 Feb 2024 13:38:56 -0500
Subject: [PATCH 23/59] done

---
 llmfoundry/utils/huggingface_hub_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llmfoundry/utils/huggingface_hub_utils.py b/llmfoundry/utils/huggingface_hub_utils.py
index 9fdc20c0d6..6a4c49cb28 100644
--- a/llmfoundry/utils/huggingface_hub_utils.py
+++ b/llmfoundry/utils/huggingface_hub_utils.py
@@ -131,7 +131,8 @@ def edit_files_for_hf_compatibility(
     folder: str,
     flatten_imports_prefix: Sequence[str] = ('llmfoundry',),
     remove_imports_prefix: Sequence[str] = ('composer', 'omegaconf',
-                                            'llmfoundry.metrics'),
+                                            'llmfoundry.metrics',
+                                            'llmfoundry.eval'),
 ) -> None:
     """Edit files to be compatible with Hugging Face Hub.
 

From c4ed644fcf8d7f06ce805dda2d663f6d3c24f3a8 Mon Sep 17 00:00:00 2001
From: Eitan Turok <150733043+eitanturok@users.noreply.github.com>
Date: Tue, 27 Feb 2024 18:57:23 -0500
Subject: [PATCH 24/59] pass prelimiter into ALL the ICL task datasets

---
 llmfoundry/eval/datasets/in_context_learning_evaluation.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 8c579d0e9d..0ca37ef54e 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -1423,6 +1423,7 @@ def build_icl_dataloader(
             example_delimiter=example_delimiter,
             continuation_delimiter=continuation_delimiter,
             destination_path=destination_path,
+            prelimiter=prelimiter,
             fewshot_random_seed=fewshot_random_seed,
             hf_loading_vars=hf_loading_vars,
             hf_parsing_map=hf_parsing_map,
@@ -1441,6 +1442,7 @@ def build_icl_dataloader(
             example_delimiter=example_delimiter,
             continuation_delimiter=continuation_delimiter,
             destination_path=destination_path,
+            prelimiter=prelimiter,
             fewshot_random_seed=fewshot_random_seed,
             hf_loading_vars=hf_loading_vars,
             hf_parsing_map=hf_parsing_map,
@@ -1459,6 +1461,7 @@ def build_icl_dataloader(
             example_delimiter=example_delimiter,
             continuation_delimiter=continuation_delimiter,
             destination_path=destination_path,
+            prelimiter=prelimiter,
             fewshot_random_seed=fewshot_random_seed,
             hf_loading_vars=hf_loading_vars,
             hf_parsing_map=hf_parsing_map,

From f213a40c9d811187c81a1c0933905f1b847d4ab5 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 4 Mar 2024 13:07:02 -0500
Subject: [PATCH 25/59] allow QA task name stil lfor backward compatibility

---
 llmfoundry/eval/datasets/in_context_learning_evaluation.py | 7 +++++--
 llmfoundry/utils/builders.py                               | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 8c579d0e9d..4b294c5469 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -19,13 +19,14 @@
 from composer.datasets.utils import stop_sequences_criteria
 from composer.utils import MissingConditionalImportError, dist, get_file
 from torch.utils.data import DataLoader, Dataset
-
+import logging
 from llmfoundry.eval.datasets.utils import (convert_tokens_to_tensors,
                                             get_continuation_span,
                                             get_fewshot_sample_idxs,
                                             make_padded_input, strip_data,
                                             tokenizer_needs_prefix_space,
                                             trim_context)
+log = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
     import transformers
@@ -1465,7 +1466,9 @@ def build_icl_dataloader(
             generation_kwargs=generation_kwargs,
         )
         effective_batchsize = batch_size
-    elif icl_task_type == 'generation_task_with_answers':
+    elif icl_task_type == 'generation_task_with_answers' or icl_task_type == "question_answering":
+        if icl_task_type == "question_answering":
+            log.warning(f"ICL task type `question_answering` has been deprecated, please use `generation_task_with_answers`.")
         dataset = InContextLearningGenerationWithAnswersTaskDataset(
             dataset_uri=dataset_uri,
             tokenizer=tokenizer,
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 63e0df91c8..cfa341fb9c 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -476,7 +476,7 @@ def _validate_cfg(icl_cfg: DictConfig):
                 icl_cfg.metric_names = [
                     'InContextLearningMultipleChoiceAccuracy'
                 ]
-            elif icl_cfg.icl_task_type == 'generation_task_with_answers':
+            elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == "question_answering":
                 icl_cfg.metric_names = [
                     'InContextLearningGenerationExactMatchAccuracy'
                 ]

From d570e5d64b717f9baf9358e05dc1e80c0affbb7b Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 5 Mar 2024 14:01:46 -0500
Subject: [PATCH 26/59] fix

---
 llmfoundry/eval/datasets/__init__.py          |   4 +-
 .../in_context_learning_evaluation.py         | 130 +++++++++---------
 llmfoundry/utils/builders.py                  |   2 +-
 scripts/eval/README.md                        |   4 +-
 .../eval/test_in_context_learning_datasets.py |  92 ++++++++-----
 5 files changed, 129 insertions(+), 103 deletions(-)

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index 794b1d563b..52c72b67af 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -8,7 +8,7 @@
 
 from llmfoundry.eval.datasets.in_context_learning_evaluation import (
     InContextLearningCodeEvalDataset, InContextLearningDataset,
-    InContextLearningGenerationWithAnswersTaskDataset,
+    InContextLearningGenerationTaskWithAnswersDataset,
     InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader)
 from llmfoundry.eval.datasets.utils import (get_continuation_span,
@@ -19,7 +19,7 @@
 
 __all__ = [
     'InContextLearningDataset',
-    'InContextLearningGenerationWithAnswersTaskDataset',
+    'InContextLearningGenerationTaskWithAnswersDataset',
     'InContextLearningLMTaskDataset', 'InContextLearningCodeEvalDataset',
     'InContextLearningMultipleChoiceTaskDataset',
     'InContextLearningSchemaTaskDataset', 'get_icl_task_dataloader',
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 1964a7119d..966949befc 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -9,6 +9,7 @@
 
 import copy
 import json
+import logging
 import os
 import random
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
@@ -19,13 +20,14 @@
 from composer.datasets.utils import stop_sequences_criteria
 from composer.utils import MissingConditionalImportError, dist, get_file
 from torch.utils.data import DataLoader, Dataset
-import logging
+
 from llmfoundry.eval.datasets.utils import (convert_tokens_to_tensors,
                                             get_continuation_span,
                                             get_fewshot_sample_idxs,
                                             make_padded_input, strip_data,
                                             tokenizer_needs_prefix_space,
                                             trim_context)
+
 log = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
@@ -41,7 +43,7 @@
     'InContextLearningMultipleChoiceTaskDataset',
     'InContextLearningSchemaTaskDataset',
     'InContextLearningCodeEvalDataset',
-    'InContextLearningGenerationWithAnswersTaskDataset',
+    'InContextLearningGenerationTaskWithAnswersDataset',
     'get_icl_task_dataloader',
 ]
 
@@ -63,7 +65,7 @@ class InContextLearningDataset(Dataset):
     - construct_context(): Takes a single example dictionary and formulates the context as a string for that eval question.
     - get_answer_from_example(): Takes a single example dictionary and formulates the correct, ground truth answer as a string.
     - tokenize_example(): Tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream.
-    - read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningGenerationWithAnswersTaskDataset.read_dataset())
+    - read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningGenerationTaskWithAnswersDataset.read_dataset())
 
     Additionally, base_batch and batch_mapping must be defined.
 
@@ -525,7 +527,7 @@ def split_batch(self, batch: Any,
         return batched_list
 
 
-class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset
+class InContextLearningGenerationTaskWithAnswersDataset(InContextLearningDataset
                                                        ):
     """A dataset that constructs batches for in-context learning generation.
 
@@ -545,23 +547,22 @@ class InContextLearningGenerationWithAnswersTaskDataset(InContextLearningDataset
         do_normalization (bool): Flag indicating whether to normalize generations before providing output.
     """
 
-    def __init__(
-            self,
-            cot_delimiter: str = '',
-            early_stopping_criteria: Optional[List[str]] = None,
-            do_normalization: bool = True,
-            *args,  # pyright: ignore
-            **kwargs):  # pyright: ignore
+    def __init__(self,
+                 cot_delimiter: str = '',
+                 early_stopping_criteria: Optional[List[str]] = None,
+                 do_normalization: bool = True,
+                 *args: Any,
+                 **kwargs: Any):
         if kwargs['tokenizer'].eos_token_id is None:
             raise ValueError(
-                '`InContextLearningGenerationWithAnswersTaskDataset` tokenizer must have non-null `eos_token_id`'
+                '`InContextLearningGenerationTaskWithAnswersDataset` tokenizer must have non-null `eos_token_id`'
             )
         self.cot_delimiter = cot_delimiter
         self.has_cot = False
         self.max_answer_length = 0
         static_keys = [
-            'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs',
-            'do_normalization', 'stopping_criteria'
+            'mode', 'cot_delimiter', 'generation_kwargs', 'do_normalization',
+            'stopping_criteria'
         ]
         tensor_keys = ['input_ids', 'attention_mask']
         list_keys = ['labels']
@@ -579,13 +580,13 @@ def __init__(
             'mode': 'generate',
             'labels': [],
             'cot_delimiter': self.cot_delimiter,
-            'generation_length': self.max_answer_length,
             'stopping_criteria': early_stopping_criteria,
             'do_normalization': do_normalization,
             'generation_kwargs': {
                 'pad_token_id': self.pad_tok_id,
                 'use_cache': True,
                 'eos_token_id': self.tokenizer.eos_token_id,
+                'max_new_tokens': self.max_answer_length,
             }
         }
         self.batch_mapping = {
@@ -710,7 +711,7 @@ class InContextLearningLMTaskDataset(InContextLearningDataset):
     See InContextLearningDataset for more details.
     """
 
-    def __init__(self, *args, **kwargs):  # pyright: ignore
+    def __init__(self, *args: Any, **kwargs: Any):
         super().__init__(answer_key='continuation',
                          static_keys=['mode'],
                          tensor_keys=[
@@ -758,15 +759,14 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset):
         choices_key (str): The key under which the choices are stored in the saved dataset. Defaults to 'choices'.
     """
 
-    def __init__(
-            self,
-            choices_key: str = 'choices',
-            static_keys: Optional[List] = None,
-            list_of_tensors_keys: Optional[List] = None,
-            list_of_tuples_keys: Optional[List] = None,
-            list_of_primitives: Optional[List] = None,
-            *args,  # pyright: ignore
-            **kwargs):  # pyright: ignore
+    def __init__(self,
+                 choices_key: str = 'choices',
+                 static_keys: Optional[List] = None,
+                 list_of_tensors_keys: Optional[List] = None,
+                 list_of_tuples_keys: Optional[List] = None,
+                 list_of_primitives: Optional[List] = None,
+                 *args: Any,
+                 **kwargs: Any):
         self.choices_key = choices_key
         base_batch = {
             'input_ids': [],
@@ -989,11 +989,10 @@ class InContextLearningSchemaTaskDataset(
     - choice_groupings: Indicates which indices of the batch correspond to which questions
     """
 
-    def __init__(
-            self,
-            choices_key: str = 'context_options',
-            *args,  # pyright: ignore
-            **kwargs):  # pyright: ignore
+    def __init__(self,
+                 choices_key: str = 'context_options',
+                 *args: Any,
+                 **kwargs: Any):
         static_keys = ['mode']
         tensor_keys = ['input_ids', 'labels', 'attention_mask']
         list_of_tensors_keys = ['continuation_indices']
@@ -1037,7 +1036,7 @@ def construct_context(self,
         context = context_options[gold_idx]
         if len(preceding_text) > 0:
             context = f'{self.example_delimiter}{context}'
-        context = f'{context}{self.continuation_delimiter}{continuation}'
+        context = f'{self.prelimiter}{context}{self.continuation_delimiter}{continuation}'
         return context
 
     def _construct_multiple_contexts(self,
@@ -1061,9 +1060,11 @@ def _construct_multiple_contexts(self,
             else:
                 cont_del = self.continuation_delimiter
             context_options = [
-                f'{self.example_delimiter}{c}{cont_del}'
+                f'{self.prelimiter}{self.example_delimiter}{c}{cont_del}'
                 for c in context_options
             ]
+        else:
+            context_options = [f'{self.prelimiter}{c}' for c in context_options]
         return context_options
 
     def _prep_example(
@@ -1182,7 +1183,6 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset):
     - test_outputs: List of test outputs
     - languages:  List of languages
     - pass_at_k: Passed value for pass_at_k
-    - generation_length: Derrived maximum generation length
     - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten
       by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
       for more details):
@@ -1198,11 +1198,11 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset):
     """
 
     def __init__(
-            self,
-            generations_per_sample: int,
-            pass_at_k: Union[int, list[int]] = 1,
-            *args,  # pyright: ignore
-            **kwargs,  # pyright: ignore
+        self,
+        generations_per_sample: int,
+        pass_at_k: Union[int, list[int]] = 1,
+        *args: Any,
+        **kwargs: Any,
     ):
         if isinstance(pass_at_k, int):
             pass_at_k = [pass_at_k]
@@ -1227,7 +1227,6 @@ def __init__(
         static_keys = [
             'mode',
             'pass_at_k',
-            'generation_length',
             'generation_kwargs',
             'generations_per_sample',
             'dataset_size',
@@ -1262,8 +1261,7 @@ def __init__(
         self.dataset = self.repeat_dataset(self.dataset, generations_per_sample)
         self.base_batch = {
             'input_ids': [],
-            'mode':
-                'generate',
+            'mode': 'generate',
             'labels': [],
             'prompts': [],
             'tests': [],
@@ -1271,26 +1269,28 @@ def __init__(
             'test_inputs': [],
             'test_outputs': [],
             'languages': [],
-            'pass_at_k':
-                pass_at_k,
-            'generation_length':
-                min(self.max_answer_length,
-                    self.max_seq_len - self.max_prompt_length),
+            'pass_at_k': pass_at_k,
             'generation_kwargs': {
-                'pad_token_id': self.pad_tok_id,
-                'num_beams': 1,  # single beam
-                'do_sample': True,
-                'temperature': 0.2,  # good default for code
-                'use_cache': True,
-                'eos_token_id': self.tokenizer.eos_token_id,
+                'pad_token_id':
+                    self.pad_tok_id,
+                'num_beams':
+                    1,  # single beam
+                'do_sample':
+                    True,
+                'temperature':
+                    0.2,  # good default for code
+                'use_cache':
+                    True,
+                'eos_token_id':
+                    self.tokenizer.eos_token_id,
+                'max_new_tokens':
+                    min(self.max_answer_length,
+                        self.max_seq_len - self.max_prompt_length),
             },
             'sample_id': [],
-            'pass_at_k':
-                list(pass_at_k),
-            'generations_per_sample':
-                generations_per_sample,
-            'dataset_size':
-                dataset_size,
+            'pass_at_k': list(pass_at_k),
+            'generations_per_sample': generations_per_sample,
+            'dataset_size': dataset_size,
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])
@@ -1469,10 +1469,12 @@ def build_icl_dataloader(
             generation_kwargs=generation_kwargs,
         )
         effective_batchsize = batch_size
-    elif icl_task_type == 'generation_task_with_answers' or icl_task_type == "question_answering":
-        if icl_task_type == "question_answering":
-            log.warning(f"ICL task type `question_answering` has been deprecated, please use `generation_task_with_answers`.")
-        dataset = InContextLearningGenerationWithAnswersTaskDataset(
+    elif icl_task_type == 'generation_task_with_answers' or icl_task_type == 'question_answering':
+        if icl_task_type == 'question_answering':
+            log.warning(
+                f'ICL task type `question_answering` has been deprecated, please use `generation_task_with_answers`.'
+            )
+        dataset = InContextLearningGenerationTaskWithAnswersDataset(
             dataset_uri=dataset_uri,
             tokenizer=tokenizer,
             max_seq_len=max_seq_len,
@@ -1522,7 +1524,7 @@ def build_icl_dataloader(
             dataset,
         (
             InContextLearningMultipleChoiceTaskDataset,
-            InContextLearningGenerationWithAnswersTaskDataset,
+            InContextLearningGenerationTaskWithAnswersDataset,
             InContextLearningCodeEvalDataset,
         ),
     ):
@@ -1721,7 +1723,7 @@ def get_icl_task_dataloader(
                                                   for more details)
         early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time.
             Used in generation tasks with CoT
-        do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningGenerationWithAnswersTaskDataset. Only used in generation tasks.
+        do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningGenerationTaskWithAnswersDataset. Only used in generation tasks.
 
     Returns:
         DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided.
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index b563881088..27dacee457 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -476,7 +476,7 @@ def _validate_cfg(icl_cfg: DictConfig):
                 icl_cfg.metric_names = [
                     'InContextLearningMultipleChoiceAccuracy'
                 ]
-            elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == "question_answering":
+            elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering':
                 icl_cfg.metric_names = [
                     'InContextLearningGenerationExactMatchAccuracy'
                 ]
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index bfc8fbde58..9027af841c 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -156,7 +156,7 @@ This document explains the ICL formats compatible with [Composer](https://github
 
 llm-foundry currently supports five ICL formats:
 
-1. InContextLearningGenerationWithAnswersTaskDataset
+1. InContextLearningGenerationTaskWithAnswersDataset
 2. InContextLearningLMTaskDataset
 3. InContextLearningMultipleChoiceTaskDataset
 4. InContextLearningSchemaTaskDataset
@@ -164,7 +164,7 @@ llm-foundry currently supports five ICL formats:
 
 ----
 
-### InContextLearningGenerationWithAnswersTaskDataset
+### InContextLearningGenerationTaskWithAnswersDataset
 
 The ICL generation with answers task supports free response generation evaluation using the model’s generate function. A generation dataset consists of a list of JSONs containing a prompt (under the key `context`), a correct answer (under the key `answer`), and a list of alternative answers that would be considered permissible (under the key `aliases`). The generation task works with the NLP metric: InContextLearningGenerationExactMatchAccuracy which assigns a model's output to be "correct" if, conditioned on the context, the model's generate method produces a string that is a normalized prefix for either the `answer` or any of the `aliases`.
 
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index bf511db2e9..4ab001df1b 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -21,7 +21,7 @@
 from llmfoundry.eval.datasets import (
     InContextLearningDataset, InContextLearningCodeEvalDataset,
     InContextLearningMultipleChoiceTaskDataset,
-    InContextLearningGenerationWithAnswersTaskDataset,
+    InContextLearningGenerationTaskWithAnswersDataset,
     InContextLearningSchemaTaskDataset, get_icl_task_dataloader, strip_data,
     tokenizer_needs_prefix_space, trim_context, get_continuation_span,
     get_fewshot_sample_idxs, make_padded_input)
@@ -337,7 +337,7 @@ def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path: Path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -349,7 +349,7 @@ def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path: Path):
         continuation_delimiter=': ',
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
         generation_kwargs=None)
-    assert len(dl.base_batch['generation_kwargs']) == 3
+    assert len(dl.base_batch['generation_kwargs']) == 4
 
 
 def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
@@ -362,7 +362,7 @@ def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -376,7 +376,7 @@ def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
         generation_kwargs={'temperature': 0.9})
     assert 'generation_kwargs' in dl.base_batch
     assert dl.base_batch['generation_kwargs']['temperature'] == 0.9
-    assert len(dl.base_batch['generation_kwargs']) == 4
+    assert len(dl.base_batch['generation_kwargs']) == 5
 
 
 @pytest.mark.filterwarnings(
@@ -603,7 +603,7 @@ def test_qa_set_cot_no_cot(tmp_path: Path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -628,7 +628,7 @@ def test_qa_set_cot_has_cot(tmp_path: Path):
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -651,7 +651,7 @@ def test_qa_get_max_answer_length(
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         max_seq_len=1024,
@@ -676,7 +676,7 @@ def test_qa_get_answer_from_example_with_no_cot(
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tiny_gpt2_tokenizer,
         max_seq_len=1024,
@@ -705,7 +705,7 @@ def test_qa_get_answer_from_example_with_cot(
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tiny_gpt2_tokenizer,
         max_seq_len=1024,
@@ -735,7 +735,7 @@ def test_qa_tokenize_example(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
 
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
-    dl = InContextLearningGenerationWithAnswersTaskDataset(
+    dl = InContextLearningGenerationTaskWithAnswersDataset(
         dataset_uri=dataset_uri,
         tokenizer=tiny_gpt2_tokenizer,
         max_seq_len=1024,
@@ -872,8 +872,10 @@ def test_mc_tokenize_example(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
     assert untokenized_inputs == correct_output
 
 
+@pytest.mark.parametrize('prelimiter', ['', 'This is a question: '])
 def test_schema_construct_context(
-        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
+        prelimiter: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+        tmp_path: Path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/winograd_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -888,6 +890,7 @@ def test_schema_construct_context(
         num_fewshot=num_fewshot,
         fewshot_random_seed=1,
         prompt_string='',
+        prelimiter=prelimiter,
         example_delimiter='\n',
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
@@ -898,13 +901,17 @@ def test_schema_construct_context(
         'continuation': 'this is a continuation'
     }
     constructed_context = dl.construct_context(example)
-    assert constructed_context == 'cont one ### this is a continuation'
+    assert constructed_context == f'{prelimiter}cont one ### this is a continuation'
     constructed_context = dl.construct_context(example, preceding_text='text')
-    assert constructed_context == '\ncont one ### this is a continuation'
+    assert constructed_context == f'{prelimiter}\ncont one ### this is a continuation'
 
 
+@pytest.mark.parametrize('prelimiter', ['', 'This is a question: '])
 def test_schema_construct_multiple_contexts(
-        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
+    prelimiter: str,
+    tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+    tmp_path: Path,
+):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/winograd_small.jsonl'
     tokenizer = tiny_gpt2_tokenizer
@@ -919,21 +926,26 @@ def test_schema_construct_multiple_contexts(
         pad_tok_id=tokenizer.eos_token_id,
         num_fewshot=num_fewshot,
         fewshot_random_seed=1,
+        prelimiter=prelimiter,
         prompt_string=prompt_string,
         example_delimiter='\n',
         continuation_delimiter=' ### ',
         destination_path=str(tmp_path / 'test_human_eval_small.jsonl'),
     )
     example = {
-        'context_options': ['cont one', 'cont two'],
+        'context_options': [f'cont one', 'cont two'],
         'gold': 0,
         'continuation': 'this is a continuation'
     }
     constructed_contexts = dl._construct_multiple_contexts(example)
-    assert constructed_contexts == ['cont one', 'cont two']
+    assert constructed_contexts == [
+        f'{prelimiter}cont one', f'{prelimiter}cont two'
+    ]
     constructed_contexts = dl._construct_multiple_contexts(
         example, preceding_text='some text')
-    assert constructed_contexts == ['\ncont one ###', '\ncont two ###']
+    assert constructed_contexts == [
+        f'{prelimiter}\ncont one ###', f'{prelimiter}\ncont two ###'
+    ]
 
 
 def test_schema_tokenize_example(
@@ -1120,7 +1132,8 @@ def test_lm_task_dataloader(dataset_uri: str,
 
 
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
-def test_schema_task_dataloader(dataset_uri: str,
+@pytest.mark.parametrize('prelimiter', ['', 'This is a question: '])
+def test_schema_task_dataloader(dataset_uri: str, prelimiter: str,
                                 tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                                 tmp_path: Path):
     pytest.importorskip('datasets')
@@ -1140,6 +1153,7 @@ def test_schema_task_dataloader(dataset_uri: str,
                                  num_fewshot=1,
                                  prompt_string='',
                                  example_delimiter='\n',
+                                 question_prelimiter=prelimiter,
                                  continuation_delimiter='',
                                  destination_path=str(tmp_path / 'icl.jsonl'))
     assert isinstance(dl, DataSpec)
@@ -1437,8 +1451,8 @@ def test_qa_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
     assert len(split2['labels']) == 1
     assert all(isinstance(v, list) for v in split1['labels'] + split2['labels'])
 
-    assert isinstance(split1['generation_length'], int)
-    assert isinstance(split2['generation_length'], int)
+    assert isinstance(split1['generation_kwargs']['max_new_tokens'], int)
+    assert isinstance(split2['generation_kwargs']['max_new_tokens'], int)
 
     assert isinstance(split1['generation_kwargs'], dict)
     assert isinstance(split2['generation_kwargs'], dict)
@@ -1517,7 +1531,7 @@ def test_qa_task_dataloader(dataset_uri: str,
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
 
-    assert batch['generation_length'] == maximum_answer_length
+    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
@@ -1574,7 +1588,7 @@ def test_qa_task_with_cot_dataloader(
                                                     maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_length'] == maximum_answer_length
+    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
     assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch)
@@ -1598,7 +1612,8 @@ def test_qa_task_with_cot_dataloader(
 
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
-def test_mc_task_dataloader(dataset_uri: str,
+@pytest.mark.parametrize('prelimiter', ['', 'This is a question: '])
+def test_mc_task_dataloader(dataset_uri: str, prelimiter: str,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                             tmp_path: Path):
     pytest.importorskip('datasets')
@@ -1609,6 +1624,7 @@ def test_mc_task_dataloader(dataset_uri: str,
     dataset_uri = f'{local_data}/{dataset_uri}'
     batch_size = 2
     seqlen = 64
+    example_delimiter = '\n'
     dl = get_icl_task_dataloader('multiple_choice',
                                  dataset_uri=dataset_uri,
                                  tokenizer=tokenizer,
@@ -1617,8 +1633,9 @@ def test_mc_task_dataloader(dataset_uri: str,
                                  pad_tok_id=tokenizer.eos_token_id,
                                  num_fewshot=1,
                                  prompt_string='',
-                                 example_delimiter='\n',
-                                 continuation_delimiter=': ',
+                                 question_prelimiter=prelimiter,
+                                 example_delimiter=example_delimiter,
+                                 continuation_delimiter='\nA: ',
                                  destination_path=str(tmp_path / 'icl.jsonl'))
     assert isinstance(dl, DataSpec)
     assert isinstance(dl.dataloader, DataLoader)  # pyright
@@ -1645,6 +1662,14 @@ def test_mc_task_dataloader(dataset_uri: str,
     max_idx = max(batch['continuation_indices'][0]).item()
     assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
                                                   1]) == ' Pour it onto a plate'
+    q1 = 'how do you shake something?\nA: '
+    a1 = 'move it up and down and side to side quickly.'
+    q2 = "When boiling butter, when it's ready, you can\nA:"
+    assert tokenizer.decode(
+        batch['input_ids'][0][:min_idx]
+    ) == f'{prelimiter}{q1}{a1}{example_delimiter}{prelimiter}{q2}'
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx +
+                                                  1]) == ' Pour it onto a plate'
 
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
@@ -1694,12 +1719,11 @@ def test_code_eval_split_batch(dataset_uri: str, tmp_path: Path):
             assert len(batch[field]) == size
             assert all(isinstance(val, type_) for val in batch[field])
 
-    static_keys = {
-        'pass_at_k': (int, list),
-        'generation_length': int,
-        'generation_kwargs': dict
-    }
+    static_keys = {'pass_at_k': (int, list), 'generation_kwargs': dict}
     for batch in batches:
+        assert 'generation_kwargs' in batch
+        assert 'max_new_tokens' in batch['generation_kwargs']
+        assert isinstance(batch['generation_kwargs']['max_new_tokens'], int)
         for field, type_ in static_keys.items():
             assert isinstance(batch[field], type_)
 
@@ -1754,7 +1778,7 @@ def test_code_eval_sentpiece_dataloader(
         assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
-        assert batch['generation_length'] == 129
+        assert batch['generation_kwargs']['max_new_tokens'] == 129
         has_left_padding.extend(
             [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left
@@ -1832,7 +1856,7 @@ def test_code_eval_test_cases(dataset_uri: str, tmp_path: Path):
                                                     max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_length'] == 129
+    assert batch['generation_kwargs']['max_new_tokens'] == 129
     assert any(item[0] != tokenizer.eos_token_id
                for item in batch['input_ids'])  # longest should be pushed left
 
@@ -1930,7 +1954,7 @@ def test_code_eval_task_dataloader(dataset_uri: str, tmp_path: Path,
         assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
-        assert batch['generation_length'] == 122
+        assert batch['generation_kwargs']['max_new_tokens'] == 122
         has_left_padding.extend(
             [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left

From a5cd308d8ec6a217afc0edc8bb23c0d7bee5c7d1 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 5 Mar 2024 14:12:35 -0500
Subject: [PATCH 27/59] fix test

---
 tests/eval/test_in_context_learning_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 4ab001df1b..c6d9535cab 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -2861,7 +2861,7 @@ def test_hf_dataloading_custom_parsing(
                                                     maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_length'] == maximum_answer_length
+    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])

From 901fc69c8394da761f11d57aa510a64bb28eefe0 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 5 Mar 2024 14:25:01 -0500
Subject: [PATCH 28/59] add generation length

---
 .../in_context_learning_evaluation.py         | 26 ++++++++++++++-----
 mcli/mcli-hf-eval.yaml                        | 14 +++++-----
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 966949befc..25a6ee9ea8 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -562,7 +562,7 @@ def __init__(self,
         self.max_answer_length = 0
         static_keys = [
             'mode', 'cot_delimiter', 'generation_kwargs', 'do_normalization',
-            'stopping_criteria'
+            'generation_length', 'stopping_criteria'
         ]
         tensor_keys = ['input_ids', 'attention_mask']
         list_keys = ['labels']
@@ -587,7 +587,10 @@ def __init__(self,
                 'use_cache': True,
                 'eos_token_id': self.tokenizer.eos_token_id,
                 'max_new_tokens': self.max_answer_length,
-            }
+            },
+            'generation_length':
+                self.
+                max_answer_length,  # TODO: deprecate with next composer udpate
         }
         self.batch_mapping = {
             'input_ids': self.context_key,
@@ -1229,6 +1232,7 @@ def __init__(
             'pass_at_k',
             'generation_kwargs',
             'generations_per_sample',
+            'generation_length',
             'dataset_size',
         ]
         list_keys = [
@@ -1261,7 +1265,8 @@ def __init__(
         self.dataset = self.repeat_dataset(self.dataset, generations_per_sample)
         self.base_batch = {
             'input_ids': [],
-            'mode': 'generate',
+            'mode':
+                'generate',
             'labels': [],
             'prompts': [],
             'tests': [],
@@ -1269,7 +1274,8 @@ def __init__(
             'test_inputs': [],
             'test_outputs': [],
             'languages': [],
-            'pass_at_k': pass_at_k,
+            'pass_at_k':
+                pass_at_k,
             'generation_kwargs': {
                 'pad_token_id':
                     self.pad_tok_id,
@@ -1288,9 +1294,15 @@ def __init__(
                         self.max_seq_len - self.max_prompt_length),
             },
             'sample_id': [],
-            'pass_at_k': list(pass_at_k),
-            'generations_per_sample': generations_per_sample,
-            'dataset_size': dataset_size,
+            'pass_at_k':
+                list(pass_at_k),
+            'generations_per_sample':
+                generations_per_sample,
+            'dataset_size':
+                dataset_size,
+            'generation_length':  # TODO: deprecate with next composer release
+                min(self.max_answer_length,
+                    self.max_seq_len - self.max_prompt_length),
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 6800319df2..429f29c572 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.5.0
+  git_branch: migrate_subclasses_to_foundry  # v0.5.0
   # git_commit:  # OR use your commit hash
   pip_install: -e ".[gpu-flash2]"
   ssh_clone: false  # Should be true if using a private repo
@@ -11,12 +11,12 @@ command: |
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
-run_name: mpt-eval
+name: mpt-eval
 gpu_num: 8
-# gpu_type:
-# cluster:  # replace with your cluster here!
+gpu_type: a100_80gb
+cluster: r1z1  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
+image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
@@ -28,7 +28,7 @@ parameters:
 
   models:
   -
-    model_name: mosaicml/mpt-7b-instruct
+    model_name: mosaicml/mpt-7b
     # Tokenizer
     tokenizer:
       name: EleutherAI/gpt-neox-20b
@@ -37,7 +37,7 @@ parameters:
 
     model:
       name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+      pretrained_model_name_or_path: mosaicml/mpt-7b
       init_device: mixed
       pretrained: true
       use_auth_token: false

From df19c0d3380c6b51b655cd744908625b4810d694 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Tue, 5 Mar 2024 15:54:21 -0500
Subject: [PATCH 29/59] remove max_new_tokens

---
 llmfoundry/eval/datasets/in_context_learning_evaluation.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 25a6ee9ea8..3e24517e98 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -586,7 +586,6 @@ def __init__(self,
                 'pad_token_id': self.pad_tok_id,
                 'use_cache': True,
                 'eos_token_id': self.tokenizer.eos_token_id,
-                'max_new_tokens': self.max_answer_length,
             },
             'generation_length':
                 self.
@@ -1289,9 +1288,6 @@ def __init__(
                     True,
                 'eos_token_id':
                     self.tokenizer.eos_token_id,
-                'max_new_tokens':
-                    min(self.max_answer_length,
-                        self.max_seq_len - self.max_prompt_length),
             },
             'sample_id': [],
             'pass_at_k':

From 54bb4c7b7b577a5d260e000b9e9c92d53d34d3b0 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Wed, 6 Mar 2024 13:00:37 -0500
Subject: [PATCH 30/59] fix cpu trsts

---
 .../in_context_learning_evaluation.py         | 18 ++++--------
 mcli/mcli-hf-eval.yaml                        | 14 ++++-----
 scripts/train/train.py                        |  4 +--
 .../eval/test_in_context_learning_datasets.py | 29 ++++++++++---------
 4 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 3e24517e98..80ad9b044b 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -1276,18 +1276,12 @@ def __init__(
             'pass_at_k':
                 pass_at_k,
             'generation_kwargs': {
-                'pad_token_id':
-                    self.pad_tok_id,
-                'num_beams':
-                    1,  # single beam
-                'do_sample':
-                    True,
-                'temperature':
-                    0.2,  # good default for code
-                'use_cache':
-                    True,
-                'eos_token_id':
-                    self.tokenizer.eos_token_id,
+                'pad_token_id': self.pad_tok_id,
+                'num_beams': 1,  # single beam
+                'do_sample': True,
+                'temperature': 0.2,  # good default for code
+                'use_cache': True,
+                'eos_token_id': self.tokenizer.eos_token_id,
             },
             'sample_id': [],
             'pass_at_k':
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 429f29c572..6800319df2 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: migrate_subclasses_to_foundry  # v0.5.0
+  git_branch: v0.5.0
   # git_commit:  # OR use your commit hash
   pip_install: -e ".[gpu-flash2]"
   ssh_clone: false  # Should be true if using a private repo
@@ -11,12 +11,12 @@ command: |
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
-name: mpt-eval
+run_name: mpt-eval
 gpu_num: 8
-gpu_type: a100_80gb
-cluster: r1z1  # replace with your cluster here!
+# gpu_type:
+# cluster:  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
@@ -28,7 +28,7 @@ parameters:
 
   models:
   -
-    model_name: mosaicml/mpt-7b
+    model_name: mosaicml/mpt-7b-instruct
     # Tokenizer
     tokenizer:
       name: EleutherAI/gpt-neox-20b
@@ -37,7 +37,7 @@ parameters:
 
     model:
       name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b
+      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
       init_device: mixed
       pretrained: true
       use_auth_token: false
diff --git a/scripts/train/train.py b/scripts/train/train.py
index 0eb3c7d341..15f61e2fad 100644
--- a/scripts/train/train.py
+++ b/scripts/train/train.py
@@ -15,7 +15,6 @@
 from composer.loggers import MosaicMLLogger
 from composer.loggers.mosaicml_logger import (MOSAICML_ACCESS_TOKEN_ENV_VAR,
                                               MOSAICML_PLATFORM_ENV_VAR)
-from composer.metrics.nlp import InContextLearningMetric
 from composer.profiler import (JSONTraceHandler, Profiler, TraceHandler,
                                cyclic_schedule)
 from composer.utils import dist, get_device, reproducibility
@@ -23,6 +22,8 @@
 from omegaconf import OmegaConf as om
 from rich.traceback import install
 
+from llmfoundry.eval.metrics.nlp import InContextLearningMetric
+
 install()
 
 from transformers import PreTrainedTokenizerBase
@@ -536,7 +537,6 @@ def main(cfg: DictConfig) -> Trainer:
     # Optimizer
     optimizer_name: str = optimizer_config.pop('name')
     optimizer = build_optimizer(model, optimizer_name, optimizer_config)
-
     # Now add the eval metrics
     if eval_loader_config is not None and not use_async_eval:
         eval_metrics = model.get_metrics(is_train=False)
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index c6d9535cab..974a09e710 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -349,7 +349,7 @@ def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path: Path):
         continuation_delimiter=': ',
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
         generation_kwargs=None)
-    assert len(dl.base_batch['generation_kwargs']) == 4
+    assert len(dl.base_batch['generation_kwargs']) == 3
 
 
 def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
@@ -376,7 +376,7 @@ def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
         generation_kwargs={'temperature': 0.9})
     assert 'generation_kwargs' in dl.base_batch
     assert dl.base_batch['generation_kwargs']['temperature'] == 0.9
-    assert len(dl.base_batch['generation_kwargs']) == 5
+    assert len(dl.base_batch['generation_kwargs']) == 4
 
 
 @pytest.mark.filterwarnings(
@@ -1451,8 +1451,8 @@ def test_qa_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
     assert len(split2['labels']) == 1
     assert all(isinstance(v, list) for v in split1['labels'] + split2['labels'])
 
-    assert isinstance(split1['generation_kwargs']['max_new_tokens'], int)
-    assert isinstance(split2['generation_kwargs']['max_new_tokens'], int)
+    assert isinstance(split1['generation_length'], int)
+    assert isinstance(split2['generation_length'], int)
 
     assert isinstance(split1['generation_kwargs'], dict)
     assert isinstance(split2['generation_kwargs'], dict)
@@ -1531,7 +1531,7 @@ def test_qa_task_dataloader(dataset_uri: str,
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
 
-    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
+    assert batch['generation_length'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
@@ -1588,7 +1588,7 @@ def test_qa_task_with_cot_dataloader(
                                                     maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
+    assert batch['generation_length'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
     assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch)
@@ -1719,11 +1719,12 @@ def test_code_eval_split_batch(dataset_uri: str, tmp_path: Path):
             assert len(batch[field]) == size
             assert all(isinstance(val, type_) for val in batch[field])
 
-    static_keys = {'pass_at_k': (int, list), 'generation_kwargs': dict}
+    static_keys = {
+        'pass_at_k': (int, list),
+        'generation_length': int,
+        'generation_kwargs': dict
+    }
     for batch in batches:
-        assert 'generation_kwargs' in batch
-        assert 'max_new_tokens' in batch['generation_kwargs']
-        assert isinstance(batch['generation_kwargs']['max_new_tokens'], int)
         for field, type_ in static_keys.items():
             assert isinstance(batch[field], type_)
 
@@ -1778,7 +1779,7 @@ def test_code_eval_sentpiece_dataloader(
         assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
-        assert batch['generation_kwargs']['max_new_tokens'] == 129
+        assert batch['generation_length'] == 129
         has_left_padding.extend(
             [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left
@@ -1856,7 +1857,7 @@ def test_code_eval_test_cases(dataset_uri: str, tmp_path: Path):
                                                     max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_kwargs']['max_new_tokens'] == 129
+    assert batch['generation_length'] == 129
     assert any(item[0] != tokenizer.eos_token_id
                for item in batch['input_ids'])  # longest should be pushed left
 
@@ -1954,7 +1955,7 @@ def test_code_eval_task_dataloader(dataset_uri: str, tmp_path: Path,
         assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
-        assert batch['generation_kwargs']['max_new_tokens'] == 122
+        assert batch['generation_length'] == 122
         has_left_padding.extend(
             [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left
@@ -2861,7 +2862,7 @@ def test_hf_dataloading_custom_parsing(
                                                     maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
+    assert batch['generation_length'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])

From b9d6cd125c8b5fb8a2eb48f574677134c03c3080 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 7 Mar 2024 10:00:46 -0500
Subject: [PATCH 31/59] try and fix lm eval test

---
 tests/eval/test_in_context_learning_datasets.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 974a09e710..0f92a60061 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -2053,6 +2053,7 @@ def test_eval_split_batch(mpt_tokenizer: transformers.AutoTokenizer,
 @pytest.mark.world_size(2)
 def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tiny_gpt2_model: transformers.AutoModelForCausalLM,
                             tmp_path: Path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
@@ -2079,11 +2080,8 @@ def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
                           dataloader=dl,
                           metric_names=['InContextLearningLMAccuracy'])
 
-    transformers = pytest.importorskip('transformers')
-    config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M')
-    model = transformers.AutoModelForCausalLM.from_config(config)
     model = HuggingFaceModel(
-        model=model,
+        model=tiny_gpt2_model,
         tokenizer=None,
         eval_metrics=[InContextLearningLMAccuracy()],
         use_logits=True,

From c207cd961459bc2078314d89a6045ddea8df0755 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 7 Mar 2024 11:10:13 -0500
Subject: [PATCH 32/59] temp disable lm task eval test

---
 .../eval/test_in_context_learning_datasets.py | 92 +++++++++----------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 0f92a60061..46a4547fa4 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -2047,52 +2047,52 @@ def test_eval_split_batch(mpt_tokenizer: transformers.AutoTokenizer,
         assert microbatch['generation_kwargs']['eos_token_id'] == 0
 
 
-@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
-@pytest.mark.parametrize('num_fewshot', [0, 5])
-@pytest.mark.gpu
-@pytest.mark.world_size(2)
-def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
-                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
-                            tiny_gpt2_model: transformers.AutoModelForCausalLM,
-                            tmp_path: Path):
-    pytest.importorskip('datasets')
-    in_memory_logger = InMemoryLogger(
-    )  # track the logged metrics in the in_memory_logger
-    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
-    dataset_uri = f'{local_data}/{dataset_uri}'
-    tokenizer = tiny_gpt2_tokenizer
-    batch_size = 2
-    dl = get_icl_task_dataloader(
-        'language_modeling',
-        dataset_uri=dataset_uri,
-        tokenizer=tokenizer,
-        batch_size=batch_size,
-        max_seq_len=2048,
-        pad_tok_id=tokenizer.eos_token_id,
-        num_fewshot=num_fewshot,
-        prompt_string='',
-        example_delimiter='\n',
-        continuation_delimiter='',
-        destination_path=str(tmp_path / 'icl.jsonl'),
-    )
-
-    evaluator = Evaluator(label='lambada',
-                          dataloader=dl,
-                          metric_names=['InContextLearningLMAccuracy'])
-
-    model = HuggingFaceModel(
-        model=tiny_gpt2_model,
-        tokenizer=None,
-        eval_metrics=[InContextLearningLMAccuracy()],
-        use_logits=True,
-    )
-
-    trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger)
-    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-    assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys(
-    )
-    assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][
-        0][1].item() == 0
+# @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+# @pytest.mark.parametrize('num_fewshot', [0, 5])
+# @pytest.mark.gpu
+# @pytest.mark.world_size(2)
+# def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
+#                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+#                             tiny_gpt2_model: transformers.AutoModelForCausalLM,
+#                             tmp_path: Path):
+#     pytest.importorskip('datasets')
+#     in_memory_logger = InMemoryLogger(
+#     )  # track the logged metrics in the in_memory_logger
+#     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+#     dataset_uri = f'{local_data}/{dataset_uri}'
+#     tokenizer = tiny_gpt2_tokenizer
+#     batch_size = 2
+#     dl = get_icl_task_dataloader(
+#         'language_modeling',
+#         dataset_uri=dataset_uri,
+#         tokenizer=tokenizer,
+#         batch_size=batch_size,
+#         max_seq_len=2048,
+#         pad_tok_id=tokenizer.eos_token_id,
+#         num_fewshot=num_fewshot,
+#         prompt_string='',
+#         example_delimiter='\n',
+#         continuation_delimiter='',
+#         destination_path=str(tmp_path / 'icl.jsonl'),
+#     )
+
+#     evaluator = Evaluator(label='lambada',
+#                           dataloader=dl,
+#                           metric_names=['InContextLearningLMAccuracy'])
+
+#     model = HuggingFaceModel(
+#         model=tiny_gpt2_model,
+#         tokenizer=None,
+#         eval_metrics=[InContextLearningLMAccuracy()],
+#         use_logits=True,
+#     )
+
+#     trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger)
+#     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+#     assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys(
+#     )
+#     assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][
+#         0][1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [0, 5])

From c85813be938451f9e57292acae83e72e6fe5668e Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 7 Mar 2024 20:46:32 -0500
Subject: [PATCH 33/59] fix test?

---
 .../eval/test_in_context_learning_datasets.py | 104 +++++++++---------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 46a4547fa4..6324feb166 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -2047,52 +2047,52 @@ def test_eval_split_batch(mpt_tokenizer: transformers.AutoTokenizer,
         assert microbatch['generation_kwargs']['eos_token_id'] == 0
 
 
-# @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
-# @pytest.mark.parametrize('num_fewshot', [0, 5])
-# @pytest.mark.gpu
-# @pytest.mark.world_size(2)
-# def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
-#                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
-#                             tiny_gpt2_model: transformers.AutoModelForCausalLM,
-#                             tmp_path: Path):
-#     pytest.importorskip('datasets')
-#     in_memory_logger = InMemoryLogger(
-#     )  # track the logged metrics in the in_memory_logger
-#     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
-#     dataset_uri = f'{local_data}/{dataset_uri}'
-#     tokenizer = tiny_gpt2_tokenizer
-#     batch_size = 2
-#     dl = get_icl_task_dataloader(
-#         'language_modeling',
-#         dataset_uri=dataset_uri,
-#         tokenizer=tokenizer,
-#         batch_size=batch_size,
-#         max_seq_len=2048,
-#         pad_tok_id=tokenizer.eos_token_id,
-#         num_fewshot=num_fewshot,
-#         prompt_string='',
-#         example_delimiter='\n',
-#         continuation_delimiter='',
-#         destination_path=str(tmp_path / 'icl.jsonl'),
-#     )
-
-#     evaluator = Evaluator(label='lambada',
-#                           dataloader=dl,
-#                           metric_names=['InContextLearningLMAccuracy'])
-
-#     model = HuggingFaceModel(
-#         model=tiny_gpt2_model,
-#         tokenizer=None,
-#         eval_metrics=[InContextLearningLMAccuracy()],
-#         use_logits=True,
-#     )
-
-#     trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger)
-#     trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
-#     assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys(
-#     )
-#     assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][
-#         0][1].item() == 0
+@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 5])
+@pytest.mark.gpu
+@pytest.mark.world_size(2)
+def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tiny_gpt2_model: transformers.AutoModelForCausalLM,
+                            tmp_path: Path):
+    pytest.importorskip('datasets')
+    in_memory_logger = InMemoryLogger(
+    )  # track the logged metrics in the in_memory_logger
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    tokenizer = tiny_gpt2_tokenizer
+    batch_size = 2
+    dl = get_icl_task_dataloader(
+        'language_modeling',
+        dataset_uri=dataset_uri,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        max_seq_len=2048,
+        pad_tok_id=tokenizer.eos_token_id,
+        num_fewshot=num_fewshot,
+        prompt_string='',
+        example_delimiter='\n',
+        continuation_delimiter='',
+        destination_path=str(tmp_path / 'icl.jsonl'),
+    )
+
+    evaluator = Evaluator(label='lambada',
+                          dataloader=dl,
+                          metric_names=['InContextLearningLMAccuracy'])
+
+    model = HuggingFaceModel(
+        model=tiny_gpt2_model,
+        tokenizer=tokenizer,
+        eval_metrics=[InContextLearningLMAccuracy()],
+        use_logits=True,
+    )
+
+    trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger)
+    trainer.eval(eval_dataloader=evaluator, subset_num_batches=2)
+    assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys(
+    )
+    assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][
+        0][1].item() == 0
 
 
 @pytest.mark.parametrize('num_fewshot', [0, 5])
@@ -2165,7 +2165,7 @@ def test_mc_task_evaluation_subcategories(
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_gpt2_tokenizer
-    batch_size = 8
+    batch_size = 16
     max_seq_len = 64
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
@@ -2590,8 +2590,8 @@ def test_code_eval_microbatching(
 )
 def test_code_eval_sentpiece_evaluation(
         monkeypatch: pytest.MonkeyPatch, num_fewshot: int, dataset_uri: str,
-        tiny_t5_tokenizer: transformers.AutoTokenizer,
-        tiny_t5_model: transformers.AutoModelForCausalLM, tmp_path: Path,
+        tiny_opt_tokenizer: transformers.AutoTokenizer,
+        tiny_opt_model: transformers.AutoModelForCausalLM, tmp_path: Path,
         generations_per_sample: int):
     pytest.importorskip('datasets')
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
@@ -2599,7 +2599,7 @@ def test_code_eval_sentpiece_evaluation(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
-    tokenizer = tiny_t5_tokenizer
+    tokenizer = tiny_opt_tokenizer
     batch_size = 2
     tmp_path_to_broadcast = str(os.path.abspath(tmp_path))
     gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
@@ -2622,8 +2622,8 @@ def test_code_eval_sentpiece_evaluation(
                           dataloader=dl,
                           metric_names=['InContextLearningCodeEvalAccuracy'])
     model = HuggingFaceModel(
-        model=tiny_t5_model,
-        tokenizer=tiny_t5_tokenizer,
+        model=tiny_opt_model,
+        tokenizer=tiny_opt_tokenizer,
         eval_metrics=[InContextLearningCodeEvalAccuracy()],
         use_logits=True,
     )

From 08ef9089abad22b28578ed71f1746b7665844db0 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 11 Mar 2024 13:10:08 -0400
Subject: [PATCH 34/59] fix tet

---
 tests/eval/test_in_context_learning_datasets.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 6324feb166..69612e1688 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -2046,15 +2046,13 @@ def test_eval_split_batch(mpt_tokenizer: transformers.AutoTokenizer,
         assert microbatch['generation_kwargs']['use_cache'] == True
         assert microbatch['generation_kwargs']['eos_token_id'] == 0
 
-
-@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
-@pytest.mark.gpu
-@pytest.mark.world_size(2)
-def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
-                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
-                            tiny_gpt2_model: transformers.AutoModelForCausalLM,
-                            tmp_path: Path):
+@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+# @pytest.mark.gpu
+# @pytest.mark.world_size(2)
+def test_lm_task_evaluation(num_fewshot: int, dataset_uri: str,
+        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path,
+        tiny_gpt2_model: transformers.AutoModelForCausalLM):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2067,7 +2065,7 @@ def test_lm_task_evaluation(dataset_uri: str, num_fewshot: int,
         dataset_uri=dataset_uri,
         tokenizer=tokenizer,
         batch_size=batch_size,
-        max_seq_len=2048,
+        max_seq_len=1024,
         pad_tok_id=tokenizer.eos_token_id,
         num_fewshot=num_fewshot,
         prompt_string='',

From aca0e63a4bbd49ef5364e5453bf4351e2f80cff4 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 11 Mar 2024 13:24:46 -0400
Subject: [PATCH 35/59] finish

---
 scripts/eval/local_data/EVAL_GAUNTLET.md        | 4 ++--
 tests/eval/test_in_context_learning_datasets.py | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/scripts/eval/local_data/EVAL_GAUNTLET.md b/scripts/eval/local_data/EVAL_GAUNTLET.md
index b857e1664e..4183138bdb 100644
--- a/scripts/eval/local_data/EVAL_GAUNTLET.md
+++ b/scripts/eval/local_data/EVAL_GAUNTLET.md
@@ -1,4 +1,4 @@
-# Mosaic Eval Gauntlet v0.1.0 - Evaluation Suite
+# Mosaic Eval Gauntlet v0.3.0 - Evaluation Suite
 
 
 <!-- SETUPTOOLS_LONG_DESCRIPTION_HIDE_BEGIN -->
@@ -24,7 +24,7 @@ At evaluation time, we run all the benchmarks, average the subscores within each
 
 For example, if benchmark A has a random baseline accuracy of 25%, and the model achieved 30%, we would report this as (0.3 - 0.25)/(1-0.25) = 0.0667. This can be thought of as the accuracy above chance rescaled so the max is 1. For benchmarks in which the random guessing baseline accuracy is ~0 we report the accuracy as is. Note that with this rescaling, a model could technically score below 0 on a category as a whole, but we haven’t found this to occur with any of the models we’ve tested.
 
-This is version v0.1.0 of the Eval Gauntlet.
+This is version v0.3.0 of the Eval Gauntlet.
 
 ### Reading Comprehension
 
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 69612e1688..505fbd2661 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -2046,13 +2046,15 @@ def test_eval_split_batch(mpt_tokenizer: transformers.AutoTokenizer,
         assert microbatch['generation_kwargs']['use_cache'] == True
         assert microbatch['generation_kwargs']['eos_token_id'] == 0
 
+
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 # @pytest.mark.gpu
 # @pytest.mark.world_size(2)
 def test_lm_task_evaluation(num_fewshot: int, dataset_uri: str,
-        tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path,
-        tiny_gpt2_model: transformers.AutoModelForCausalLM):
+                            tiny_gpt2_tokenizer: transformers.AutoTokenizer,
+                            tmp_path: Path,
+                            tiny_gpt2_model: transformers.AutoModelForCausalLM):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger

From 30fcedd1c3a5df8bfe7e7306f1f7e76bf25f1357 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 11 Mar 2024 20:02:18 -0400
Subject: [PATCH 36/59] fix

---
 tests/eval/test_in_context_learning_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 505fbd2661..6d2483e67e 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -2205,7 +2205,7 @@ def test_mc_task_evaluation_subcategories(
     )
     assert in_memory_logger.data[
         'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy'][
-            0][1].item() > 0
+            0][1].item() >= 0
     total = trainer.state.eval_metrics['mmlu/computer_security'][
         'InContextLearningMultipleChoiceAccuracy'].total
     dist.all_reduce(total)  # type: ignore

From 4217a7828733817cb302b7c69272524899d3d353 Mon Sep 17 00:00:00 2001
From: Jeremy D <115047575+bmosaicml@users.noreply.github.com>
Date: Wed, 13 Mar 2024 16:59:09 -0400
Subject: [PATCH 37/59] Update scripts/eval/README.md

Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com>
---
 scripts/eval/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index 9027af841c..1484fa8395 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -145,7 +145,7 @@ You can use the default `icl_tasks` and `eval_gauntlet` configs or specify your
 
 ICL evaluation measures a model’s ability to solve novel problems by being provided examples in-context without ever being specifically trained to answer such questions.
 
-We supports a number of different standard ICL formats and allows users to upload their own datasets that correspond to those formats. All of our ICL task types are implemented in `llm-foundry/llmfoundry/eval/datasets/in_context_learning_evaluation.py` while all of our ICL
+We supports a number of standard ICL formats and allow users to upload their own datasets that correspond to these formats. All of our ICL task types are implemented in `llm-foundry/llmfoundry/eval/datasets/in_context_learning_evaluation.py` while all of our ICL
 metrics are implemented in `llm-foundry/llmfoundry/eval/metrics/nlp.py`. You can see which metrics work with which task types in the `llmfoundry.utils.builders.build_icl_evaluators` helper function.
 
 This document explains the ICL formats compatible with [Composer](https://github.com/mosaicml/composer), summarizes how to add new datasets in those formats, and catalogs the datasets currently used by the research team to evaluate models.

From 6f597a936be6e60f387af9ade53ad8902dd43eec Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Wed, 13 Mar 2024 17:49:10 -0400
Subject: [PATCH 38/59] fix comments

---
 llmfoundry/eval/datasets/__init__.py          |  14 +-
 .../in_context_learning_evaluation.py         |  48 +++----
 llmfoundry/eval/datasets/utils.py             | 136 ++++++++----------
 llmfoundry/eval/metrics/nlp.py                |   7 +-
 mcli/mcli-hf-eval.yaml                        |  18 +--
 .../eval/test_in_context_learning_datasets.py |  75 +++-------
 6 files changed, 126 insertions(+), 172 deletions(-)

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index 52c72b67af..8d792f2f99 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -20,9 +20,15 @@
 __all__ = [
     'InContextLearningDataset',
     'InContextLearningGenerationTaskWithAnswersDataset',
-    'InContextLearningLMTaskDataset', 'InContextLearningCodeEvalDataset',
+    'InContextLearningLMTaskDataset',
+    'InContextLearningCodeEvalDataset',
     'InContextLearningMultipleChoiceTaskDataset',
-    'InContextLearningSchemaTaskDataset', 'get_icl_task_dataloader',
-    'strip_data', 'tokenizer_needs_prefix_space', 'trim_context',
-    'get_continuation_span', 'get_fewshot_sample_idxs', 'make_padded_input'
+    'InContextLearningSchemaTaskDataset',
+    'get_icl_task_dataloader',
+    'strip_data',
+    'tokenizer_needs_prefix_space',
+    'trim_context',
+    'get_continuation_span',
+    'get_fewshot_sample_idxs',
+    'make_padded_input',
 ]
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 80ad9b044b..e7fc8d267b 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -12,13 +12,16 @@
 import logging
 import os
 import random
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Union
 
 import torch
+import transformers
 from composer.core import DataSpec
 from composer.core.data_spec import _default_split_batch, _split_list
 from composer.datasets.utils import stop_sequences_criteria
 from composer.utils import MissingConditionalImportError, dist, get_file
+from datasets import Dataset as HFDataset
+from datasets import IterableDataset, load_dataset
 from torch.utils.data import DataLoader, Dataset
 
 from llmfoundry.eval.datasets.utils import (convert_tokens_to_tensors,
@@ -30,11 +33,6 @@
 
 log = logging.getLogger(__name__)
 
-if TYPE_CHECKING:
-    import transformers
-    from datasets import \
-        Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
-
 # Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset
 _MAX_ANSWER_BUFFER_LENGTH = 10
 
@@ -135,16 +133,6 @@ def __init__(
         hf_parsing_map: Optional[Dict] = None,
         generation_kwargs: Optional[Dict] = None,
     ):
-        try:
-            import datasets
-            del datasets
-        except ImportError as e:
-            raise MissingConditionalImportError(
-                extra_deps_group='nlp',
-                conda_package='datasets',
-                conda_channel='conda-forge',
-            ) from e
-
         self.tokenizer = tokenizer
         self.prefix_space = tokenizer_needs_prefix_space(self.tokenizer)
 
@@ -621,6 +609,10 @@ def read_dataset(
             })
         self.max_answer_length = self._get_max_answer_length(dataset)
         # NOTE: This is the only time we use the class variable padding_size.
+        if self.max_seq_len < self.max_answer_length:
+            log.warning(f'`max_seq_len` {self.max_seq_len} was less than `max_answer_len`: {self.max_answer_length}' \
+                        + ' setting  `max_seq_len`=`max_answer_len`')
+            self.max_seq_len = self.max_answer_length
         self.padding_size = self.max_seq_len - self.max_answer_length
         return dataset
 
@@ -1259,9 +1251,19 @@ def __init__(
             **kwargs,
         )
         self._set_max_prompt_and_answer_lengths()
+        if self.max_seq_len < self.max_prompt_length:
+            log.warning(f'`max_seq_len` {self.max_seq_len} was less than `max_prompt_len`: {self.max_prompt_length}' \
+                        + ' setting  `max_seq_len`=`max_prompt_len`')
+            self.max_seq_len = self.max_prompt_length
         dataset_size = len(self.dataset)
         self.dataset = self.dataset.map(self._trim_padding)
         self.dataset = self.repeat_dataset(self.dataset, generations_per_sample)
+
+        if self.max_answer_length < self.max_seq_len - self.max_prompt_length:
+            generation_length = self.max_answer_length
+        else:
+            generation_length = self.max_seq_len - self.max_prompt_length
+
         self.base_batch = {
             'input_ids': [],
             'mode':
@@ -1291,8 +1293,7 @@ def __init__(
             'dataset_size':
                 dataset_size,
             'generation_length':  # TODO: deprecate with next composer release
-                min(self.max_answer_length,
-                    self.max_seq_len - self.max_prompt_length),
+                generation_length
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])
@@ -1564,17 +1565,6 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str,
     Returns:
         Dict[str, str]: Mapping of category names to partitioned dataset local files names.
     """
-    try:
-        from datasets import \
-            Dataset as HFDataset  # pyright: ignore[reportGeneralTypeIssues]
-        from datasets import (  # pyright: ignore[reportGeneralTypeIssues]
-            IterableDataset, load_dataset)
-    except ImportError as e:
-        raise MissingConditionalImportError(
-            extra_deps_group='nlp',
-            conda_package='datasets',
-            conda_channel='conda-forge',
-        ) from e
     if dataset_uri.startswith('hf://'):
         dataset_uri = dataset_uri.replace('hf://', '')
         dataset = load_dataset(dataset_uri, **hf_loading_vars)
diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py
index cc0acdab6d..ac17e71774 100644
--- a/llmfoundry/eval/datasets/utils.py
+++ b/llmfoundry/eval/datasets/utils.py
@@ -9,9 +9,10 @@
 
 import logging
 import random
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set
 
 import torch
+import transformers
 
 __all__ = [
     'MultiTokenEOSCriteria',
@@ -19,9 +20,6 @@
 
 log = logging.getLogger(__name__)
 
-if TYPE_CHECKING:
-    import transformers
-
 
 def strip_data(example: Dict) -> Dict:
     """Remove white space from the begging and end of string values in a.
@@ -217,74 +215,66 @@ def get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int,
     return fewshot_idxs
 
 
-try:
-    import transformers
-
-    class MultiTokenEOSCriteria(transformers.StoppingCriteria):
-        """Criteria to stop on the specified multi-token sequence.
-
-        Slightly modified from: https://github.com/EleutherAI/lm-evaluation-harness/blob/78545d42f2ca95c6fe0ed220d456eeb94f4485e9/lm_eval/utils.py#L614-L649
-        """
-
-        def __init__(
-            self,
-            stop_sequence: str,
-            tokenizer: transformers.PreTrainedTokenizerBase,
-            batch_size: int,
-        ) -> None:
-            self.done_tracker = [False] * batch_size
-            self.stop_sequence = stop_sequence
-            self.stop_sequence_ids = tokenizer.encode(stop_sequence,
-                                                      add_special_tokens=False)
-
-            # sentence piece tokenizers add a superflous underline token before string-initial \n
-            # that throws off our calculation of the stop sequence length
-            # so we remove any token ids that produce empty strings
-            self.stop_sequence_ids = [
-                id for id in self.stop_sequence_ids
-                if tokenizer.decode(id) != ''
-            ]
-
-            # we look back for 1 more token than it takes to encode our stop sequence
-            # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
-            # and we don't want to mistakenly not stop a generation because our
-            # (string) stop sequence was output in a different tokenization
-
-            self.stop_sequence_id_len = len(self.stop_sequence_ids) + 1
-            self.tokenizer = tokenizer
-
-        def __call__(self,
-                     input_ids: torch.LongTensor,
-                     scores: Optional[torch.FloatTensor] = None,
-                     **kwargs: Dict[str, Any]) -> bool:
-            # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
-            lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:]
-            lookback_tokens_batch = self.tokenizer.batch_decode(
-                lookback_ids_batch)
-            for i, done in enumerate(self.done_tracker):
-                if i >= len(lookback_tokens_batch):
-                    # The last batch of a dataset may be smaller than `batch_size`
-                    # Automatically set those indices in the done_tracker to True
-                    # since those indices don't show up in the current batch
-                    self.done_tracker[i] = True
-                    break
-                elif not done:
-                    self.done_tracker[
-                        i] = self.stop_sequence in lookback_tokens_batch[i]
-            return False not in self.done_tracker
-
-    def stop_sequences_criteria(
+class MultiTokenEOSCriteria(transformers.StoppingCriteria):
+    """Criteria to stop on the specified multi-token sequence.
+
+    Slightly modified from: https://github.com/EleutherAI/lm-evaluation-harness/blob/78545d42f2ca95c6fe0ed220d456eeb94f4485e9/lm_eval/utils.py#L614-L649
+    """
+
+    def __init__(
+        self,
+        stop_sequence: str,
         tokenizer: transformers.PreTrainedTokenizerBase,
-        stop_sequences: List[str],
         batch_size: int,
-    ) -> transformers.StoppingCriteriaList:
-        return transformers.StoppingCriteriaList([
-            *[
-                MultiTokenEOSCriteria(sequence, tokenizer, batch_size)
-                for sequence in stop_sequences
-            ],
-        ])
-
-except ImportError as e:
-    stop_sequences_criteria = None  # pyright: ignore [reportGeneralTypeIssues]
-    MultiTokenEOSCriteria = None  # pyright: ignore [reportGeneralTypeIssues]
+    ) -> None:
+        self.done_tracker = [False] * batch_size
+        self.stop_sequence = stop_sequence
+        self.stop_sequence_ids = tokenizer.encode(stop_sequence,
+                                                  add_special_tokens=False)
+
+        # sentence piece tokenizers add a superflous underline token before string-initial \n
+        # that throws off our calculation of the stop sequence length
+        # so we remove any token ids that produce empty strings
+        self.stop_sequence_ids = [
+            id for id in self.stop_sequence_ids if tokenizer.decode(id) != ''
+        ]
+
+        # we look back for 1 more token than it takes to encode our stop sequence
+        # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
+        # and we don't want to mistakenly not stop a generation because our
+        # (string) stop sequence was output in a different tokenization
+
+        self.stop_sequence_id_len = len(self.stop_sequence_ids) + 1
+        self.tokenizer = tokenizer
+
+    def __call__(self,
+                 input_ids: torch.LongTensor,
+                 scores: Optional[torch.FloatTensor] = None,
+                 **kwargs: Dict[str, Any]) -> bool:
+        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
+        lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:]
+        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
+        for i, done in enumerate(self.done_tracker):
+            if i >= len(lookback_tokens_batch):
+                # The last batch of a dataset may be smaller than `batch_size`
+                # Automatically set those indices in the done_tracker to True
+                # since those indices don't show up in the current batch
+                self.done_tracker[i] = True
+                break
+            elif not done:
+                self.done_tracker[
+                    i] = self.stop_sequence in lookback_tokens_batch[i]
+        return False not in self.done_tracker
+
+
+def stop_sequences_criteria(
+    tokenizer: transformers.PreTrainedTokenizerBase,
+    stop_sequences: List[str],
+    batch_size: int,
+) -> transformers.StoppingCriteriaList:
+    return transformers.StoppingCriteriaList([
+        *[
+            MultiTokenEOSCriteria(sequence, tokenizer, batch_size)
+            for sequence in stop_sequences
+        ],
+    ])
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index 8d6ae1cad6..f1fe8d176e 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -150,8 +150,11 @@ def update(
                     self.normalize_answer(label) for label in sample_labels
                 }
             else:
-                cleaned_final_answer = final_answer
-                cleaned_sample_labels = set(sample_labels)
+                # even if normalization is off, we should still strip leading/trailing whitespaces
+                cleaned_final_answer = final_answer.strip()
+                cleaned_sample_labels = {
+                    sample_label.strip() for sample_label in sample_labels
+                }
 
             if any(
                     cleaned_final_answer.startswith(label)
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 6800319df2..80f9435360 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.5.0
+  git_branch: migrate_subclasses_to_foundry # v0.5.0
   # git_commit:  # OR use your commit hash
   pip_install: -e ".[gpu-flash2]"
   ssh_clone: false  # Should be true if using a private repo
@@ -11,12 +11,12 @@ command: |
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
-run_name: mpt-eval
+name: llama-eval-exp
 gpu_num: 8
-# gpu_type:
-# cluster:  # replace with your cluster here!
+gpu_type: a100_80gb
+cluster: r1z1 # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
+image: mosaicml/llm-foundry:2.2.1_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
@@ -28,19 +28,19 @@ parameters:
 
   models:
   -
-    model_name: mosaicml/mpt-7b-instruct
+    model_name: meta-llama/Llama-2-7b-hf
     # Tokenizer
     tokenizer:
-      name: EleutherAI/gpt-neox-20b
+      name: meta-llama/Llama-2-7b-hf
       kwargs:
         model_max_length: ${max_seq_len}
 
     model:
       name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+      pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf
       init_device: mixed
       pretrained: true
-      use_auth_token: false
+      use_auth_token: true
 
   # FSDP config for model sharding
   fsdp_config:
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 6d2483e67e..2dce2beef4 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -60,7 +60,6 @@ def test_tokenizer_needs_prefix_space_when_space_not_needed(
 
 
 def test_tokenizer_needs_prefix_space_when_space_needed():
-    transformers = pytest.importorskip('transformers')
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m',
         use_fast=False)  # type: ignore reportUnboundVariable
@@ -255,7 +254,6 @@ def test_update_generation_kwargs(
 
 def test_stop_sequences_criteria(
         tiny_gpt2_tokenizer: transformers.AutoTokenizer):
-    pytest.importorskip('transformers')
     eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2)
     seq1 = tiny_gpt2_tokenizer('Dogs are furry')['input_ids']
     seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids']
@@ -274,7 +272,6 @@ def test_stop_sequences_criteria(
 
 def test_stop_sequences_criteria_sentencepiece(
         tiny_llama_tokenizer: transformers.AutoTokenizer):
-    pytest.importorskip('datasets')
 
     tokenizer = tiny_llama_tokenizer
     eos_criteria = MultiTokenEOSCriteria('\n\n', tokenizer, 2)
@@ -328,10 +325,9 @@ def test_update_generation_kwargs_no_kwargs(
 
 
 def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path: Path):
-    pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
-    transformers = pytest.importorskip('transformers')
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
@@ -353,10 +349,9 @@ def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path: Path):
 
 
 def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
-    pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
-    transformers = pytest.importorskip('transformers')
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
@@ -469,7 +464,6 @@ def test_get_answer_from_example(
     r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning'
 )
 def test_fix_eos_on_preamble(tmp_path: Path):
-    transformers = pytest.importorskip('transformers')
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m',
         use_fast=False)  # type: ignore reportUnboundVariable
@@ -594,10 +588,9 @@ def test_tokenize_example_with_no_tokenize_labels(
 
 
 def test_qa_set_cot_no_cot(tmp_path: Path):
-    pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
-    transformers = pytest.importorskip('transformers')
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
@@ -619,10 +612,9 @@ def test_qa_set_cot_no_cot(tmp_path: Path):
 
 
 def test_qa_set_cot_has_cot(tmp_path: Path):
-    pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/gsm8k_small.jsonl'
-    transformers = pytest.importorskip('transformers')
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'facebook/opt-125m')  # type: ignore reportUnboundVariable
 
@@ -670,7 +662,7 @@ def test_qa_get_max_answer_length(
 
 def test_qa_get_answer_from_example_with_no_cot(
         tmp_path: Path, tiny_gpt2_tokenizer: transformers.AutoTokenizer):
-    pytest.importorskip('datasets')
+
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
 
@@ -699,7 +691,7 @@ def test_qa_get_answer_from_example_with_no_cot(
 
 def test_qa_get_answer_from_example_with_cot(
         tmp_path: Path, tiny_gpt2_tokenizer: transformers.AutoTokenizer):
-    pytest.importorskip('datasets')
+
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
 
@@ -729,7 +721,7 @@ def test_qa_get_answer_from_example_with_cot(
 
 def test_qa_tokenize_example(tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                              tmp_path: Path):
-    pytest.importorskip('datasets')
+
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/triviaqa_small.jsonl'
 
@@ -999,7 +991,6 @@ def test_schema_tokenize_example(
 def test_mc_task_dataloader_subcategories(
         dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
         tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1048,7 +1039,6 @@ def test_mc_task_dataloader_subcategories(
 def test_lm_task_dataloader_extra_space(
         dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
         tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1093,7 +1083,6 @@ def test_lm_task_dataloader_extra_space(
 def test_lm_task_dataloader(dataset_uri: str,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                             tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1136,7 +1125,6 @@ def test_lm_task_dataloader(dataset_uri: str,
 def test_schema_task_dataloader(dataset_uri: str, prelimiter: str,
                                 tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                                 tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1186,8 +1174,6 @@ def test_schema_task_dataloader(dataset_uri: str, prelimiter: str,
 @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl'])
 def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri: str,
                                                     tmp_path: Path):
-    pytest.importorskip('datasets')
-    transformers = pytest.importorskip('transformers')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -1239,7 +1225,6 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri: str,
 def test_lm_task_dataloader_opt_tokenizer(
         tiny_opt_tokenizer: transformers.AutoTokenizer, dataset_uri: str,
         num_fewshot: int, tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1284,7 +1269,6 @@ def test_lm_task_dataloader_opt_tokenizer(
 def test_mc_task_dataloader_opt_tokenizer(
         tiny_opt_tokenizer: transformers.AutoTokenizer, dataset_uri: str,
         num_fewshot: int, tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1338,7 +1322,6 @@ def test_mc_task_dataloader_opt_tokenizer(
 @pytest.mark.parametrize('num_fewshot', [0, 1])
 def test_mc_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
                         dataset_uri: str, num_fewshot: int, tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1408,7 +1391,7 @@ def test_mc_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
 @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl'])
 def test_qa_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
                         dataset_uri: str, tmp_path: Path):
-    pytest.importorskip('datasets')
+
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
     tokenizer = tiny_opt_tokenizer
@@ -1464,7 +1447,6 @@ def test_qa_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
 def test_qa_task_dataloader_w_null_eos(
         dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
         tmp_path: Path, num_fewshot: int, prompt_string: str):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1496,7 +1478,6 @@ def test_qa_task_dataloader(dataset_uri: str,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                             tmp_path: Path, num_fewshot: int,
                             prompt_string: str):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1555,7 +1536,6 @@ def test_qa_task_dataloader(dataset_uri: str,
 def test_qa_task_with_cot_dataloader(
         dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
         tmp_path: Path, num_fewshot: int):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1616,7 +1596,6 @@ def test_qa_task_with_cot_dataloader(
 def test_mc_task_dataloader(dataset_uri: str, prelimiter: str,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                             tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1674,10 +1653,9 @@ def test_mc_task_dataloader(dataset_uri: str, prelimiter: str,
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 def test_code_eval_split_batch(dataset_uri: str, tmp_path: Path):
-    pytest.importorskip('datasets')
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     dataset_uri = f'{local_data}/{dataset_uri}'
-    transformers = pytest.importorskip('transformers')
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'EleutherAI/gpt-neox-20b')  # type: ignore reportUnboundVariable
 
@@ -1737,7 +1715,6 @@ def test_code_eval_sentpiece_dataloader(
         dataset_uri: str, tmp_path: Path, num_fewshot: int, prompt_string: str,
         generations_per_sample: int,
         tiny_llama_tokenizer: transformers.AutoTokenizer):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -1820,11 +1797,8 @@ def test_code_eval_sentpiece_dataloader(
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 def test_code_eval_test_cases(dataset_uri: str, tmp_path: Path):
-    pytest.importorskip('datasets')
-
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
-    transformers = pytest.importorskip('transformers')
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
@@ -1873,11 +1847,8 @@ def test_code_eval_test_cases(dataset_uri: str, tmp_path: Path):
 
 @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl'])
 def test_code_eval_pass_at_k_validity(dataset_uri: str, tmp_path: Path):
-    pytest.importorskip('datasets')
-
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
-    transformers = pytest.importorskip('transformers')
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'huggyllama/llama-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
@@ -1908,11 +1879,9 @@ def test_code_eval_pass_at_k_validity(dataset_uri: str, tmp_path: Path):
 def test_code_eval_task_dataloader(dataset_uri: str, tmp_path: Path,
                                    num_fewshot: int, prompt_string: str,
                                    generations_per_sample: int):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
-    transformers = pytest.importorskip('transformers')
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         'mosaicml/mpt-7b')  # type: ignore reportUnboundVariable
     dataset_uri = f'{local_data}/{dataset_uri}'
@@ -1998,7 +1967,6 @@ def test_code_eval_task_dataloader(dataset_uri: str, tmp_path: Path,
 @pytest.mark.parametrize('num_fewshot', [0, 1])
 def test_eval_split_batch(mpt_tokenizer: transformers.AutoTokenizer,
                           dataset_uri: str, num_fewshot: int, tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
     tokenizer = mpt_tokenizer  # type: ignore reportUnboundVariable
@@ -2055,7 +2023,7 @@ def test_lm_task_evaluation(num_fewshot: int, dataset_uri: str,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                             tmp_path: Path,
                             tiny_gpt2_model: transformers.AutoModelForCausalLM):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2102,7 +2070,7 @@ def test_schema_task_evaluation(
         num_fewshot: int, dataset_uri: str,
         tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path,
         tiny_gpt2_model: transformers.AutoModelForCausalLM):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2159,7 +2127,7 @@ def test_mc_task_evaluation_subcategories(
         dataset_uri: str, num_fewshot: int,
         tiny_gpt2_model: transformers.AutoModelForCausalLM,
         tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2222,7 +2190,7 @@ def test_mc_task_evaluation(num_fewshot: int, dataset_uri: str,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                             tmp_path: Path,
                             tiny_gpt2_model: transformers.AutoModelForCausalLM):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2288,7 +2256,7 @@ def test_qa_task_evaluation_opt_tokenizer(
         tiny_opt_tokenizer: transformers.AutoTokenizer,
         tiny_opt_model: transformers.AutoModelForCausalLM, num_fewshot: int,
         dataset_uri: str, tmp_path: Path):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2345,7 +2313,7 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(
         tiny_opt_tokenizer: transformers.AutoTokenizer,
         tiny_opt_model: transformers.AutoModelForCausalLM, num_fewshot: int,
         dataset_uri: str, tmp_path: Path):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2402,7 +2370,7 @@ def test_qa_task_evaluation(num_fewshot: int, dataset_uri: str,
                             tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                             tiny_gpt2_model: transformers.AutoModelForCausalLM,
                             tmp_path: Path):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2458,7 +2426,7 @@ def test_qa_task_with_cot_evaluation(
         num_fewshot: int, dataset_uri: str,
         tiny_gpt2_tokenizer: transformers.AutoTokenizer,
         tiny_gpt2_model: transformers.AutoModelForCausalLM, tmp_path: Path):
-    pytest.importorskip('datasets')
+
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -2533,7 +2501,7 @@ def test_code_eval_microbatching(
         tiny_opt_tokenizer: transformers.AutoTokenizer,
         tiny_opt_model: transformers.AutoModelForCausalLM, num_fewshot: int,
         dataset_uri: str, tmp_path: Path, generations_per_sample: int):
-    pytest.importorskip('datasets')
+
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2593,7 +2561,7 @@ def test_code_eval_sentpiece_evaluation(
         tiny_opt_tokenizer: transformers.AutoTokenizer,
         tiny_opt_model: transformers.AutoModelForCausalLM, tmp_path: Path,
         generations_per_sample: int):
-    pytest.importorskip('datasets')
+
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2652,7 +2620,7 @@ def test_code_eval_task_evaluation(
         tiny_gpt2_tokenizer: transformers.AutoTokenizer,
         tiny_gpt2_model: transformers.AutoModelForCausalLM, tmp_path: Path,
         generations_per_sample: int):
-    pytest.importorskip('datasets')
+
     monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')
     in_memory_logger = InMemoryLogger(
     )  # track the logged metrics in the in_memory_logger
@@ -2701,7 +2669,6 @@ def test_code_eval_task_evaluation(
 def test_lm_spacing_dataloader(dataset_uri: str,
                                tiny_gpt2_tokenizer: transformers.AutoTokenizer,
                                tmp_path: Path):
-    pytest.importorskip('datasets')
 
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
@@ -2762,7 +2729,6 @@ def test_hf_dataloading_lm_dataloader(
         hf_loading_vars: Dict[str,
                               str], hf_parsing_map: Optional[Dict[str,
                                                                   List[str]]]):
-    pytest.importorskip('datasets')
 
     tokenizer = tiny_gpt2_tokenizer
     batch_size = 2
@@ -2826,7 +2792,6 @@ def test_hf_dataloading_custom_parsing(
         dataset_uri: str, tiny_gpt2_tokenizer: transformers.AutoTokenizer,
         tmp_path: Path, num_fewshot: int, prompt_string: str,
         hf_loading_vars: Dict[str, str], hf_parsing_map: Dict[str, List[str]]):
-    pytest.importorskip('datasets')
 
     tokenizer = tiny_gpt2_tokenizer
     batch_size = 2

From f387a73d4a61d1b3d1da3065c0e70c0914659ab1 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 14 Mar 2024 00:03:08 -0400
Subject: [PATCH 39/59] fix bug with seq len

---
 .../eval/datasets/in_context_learning_evaluation.py       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index e7fc8d267b..f4bd0c62ad 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -575,9 +575,9 @@ def __init__(self,
                 'use_cache': True,
                 'eos_token_id': self.tokenizer.eos_token_id,
             },
-            'generation_length':
-                self.
-                max_answer_length,  # TODO: deprecate with next composer udpate
+            'generation_length': max(
+                self.max_answer_length,
+                1),  # TODO: deprecate with next composer udpate
         }
         self.batch_mapping = {
             'input_ids': self.context_key,
@@ -1293,7 +1293,7 @@ def __init__(
             'dataset_size':
                 dataset_size,
             'generation_length':  # TODO: deprecate with next composer release
-                generation_length
+                max(generation_length, 1)
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])

From 2f405d9db96e4b9f49a48568ed37b76db280e258 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 14 Mar 2024 00:05:51 -0400
Subject: [PATCH 40/59] restore mcli

---
 mcli/mcli-hf-eval.yaml         | 18 +++++++++---------
 mcli/mcli-llama2-finetune.yaml |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 80f9435360..6800319df2 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: migrate_subclasses_to_foundry # v0.5.0
+  git_branch: v0.5.0
   # git_commit:  # OR use your commit hash
   pip_install: -e ".[gpu-flash2]"
   ssh_clone: false  # Should be true if using a private repo
@@ -11,12 +11,12 @@ command: |
   composer eval/eval.py /mnt/config/parameters.yaml
 
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
-name: llama-eval-exp
+run_name: mpt-eval
 gpu_num: 8
-gpu_type: a100_80gb
-cluster: r1z1 # replace with your cluster here!
+# gpu_type:
+# cluster:  # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.2.1_cu121_flash2-latest
+image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
@@ -28,19 +28,19 @@ parameters:
 
   models:
   -
-    model_name: meta-llama/Llama-2-7b-hf
+    model_name: mosaicml/mpt-7b-instruct
     # Tokenizer
     tokenizer:
-      name: meta-llama/Llama-2-7b-hf
+      name: EleutherAI/gpt-neox-20b
       kwargs:
         model_max_length: ${max_seq_len}
 
     model:
       name: hf_causal_lm
-      pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf
+      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
       init_device: mixed
       pretrained: true
-      use_auth_token: true
+      use_auth_token: false
 
   # FSDP config for model sharding
   fsdp_config:
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index 091fc5a84e..36de709aed 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -38,7 +38,7 @@ parameters:
     pretrained: true
     # Note: you must have set the HUGGING_FACE_HUB_TOKEN environment variable and have access to the llama2 models
     use_auth_token: true
-    use_flash_attention_2: true
+    attention_patch_type: triton
 
   # Tokenizer
   tokenizer:
@@ -62,7 +62,7 @@ parameters:
       # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
       # # to profile this run's optimal packing_ratio as it depends on GPU count,
       # # batch size, sequence length
-      # packing_ratio: auto
+      # packing_ratio:
     drop_last: true
     num_workers: 8
     pin_memory: false

From 7faeb787a1d005eb3c93a28898dd565878077ab4 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 1 Apr 2024 13:22:13 -0400
Subject: [PATCH 41/59] merge

---
 llmfoundry/metrics/__init__.py                | 20 ++++++++++--------
 llmfoundry/models/hf/hf_causal_lm.py          |  2 --
 .../models/inference_api_wrapper/interface.py |  4 +---
 llmfoundry/models/mpt/modeling_mpt.py         |  9 --------
 llmfoundry/utils/builders.py                  | 21 ++++---------------
 llmfoundry/utils/huggingface_hub_utils.py     |  4 ++--
 scripts/train/train.py                        |  8 +------
 7 files changed, 19 insertions(+), 49 deletions(-)

diff --git a/llmfoundry/metrics/__init__.py b/llmfoundry/metrics/__init__.py
index 6c71a3ea08..e8310687a1 100644
--- a/llmfoundry/metrics/__init__.py
+++ b/llmfoundry/metrics/__init__.py
@@ -1,14 +1,15 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-from composer.metrics import (InContextLearningCodeEvalAccuracy,
-                              InContextLearningLMAccuracy,
-                              InContextLearningLMExpectedCalibrationError,
-                              InContextLearningMCExpectedCalibrationError,
-                              InContextLearningMultipleChoiceAccuracy,
-                              InContextLearningQAAccuracy, MaskedAccuracy)
-from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
+from composer.metrics import (LanguageCrossEntropy, LanguagePerplexity,
+                              MaskedAccuracy)
 
+from llmfoundry.eval.metrics import (
+    InContextLearningCodeEvalAccuracy,
+    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
+    InContextLearningLMExpectedCalibrationError,
+    InContextLearningMCExpectedCalibrationError,
+    InContextLearningMultipleChoiceAccuracy)
 from llmfoundry.metrics.token_acc import TokenAccuracy
 from llmfoundry.registry import metrics
 
@@ -19,7 +20,8 @@
 metrics.register('mc_expected_calibration_error',
                  func=InContextLearningMCExpectedCalibrationError)
 metrics.register('mc_accuracy', func=InContextLearningMultipleChoiceAccuracy)
-metrics.register('qa_accuracy', func=InContextLearningQAAccuracy)
+metrics.register('qa_accuracy',
+                 func=InContextLearningGenerationExactMatchAccuracy)
 metrics.register('code_eval_accuracy', func=InContextLearningCodeEvalAccuracy)
 metrics.register('language_cross_entropy', func=LanguageCrossEntropy)
 metrics.register('language_perplexity', func=LanguagePerplexity)
@@ -54,7 +56,7 @@
     'InContextLearningLMExpectedCalibrationError',
     'InContextLearningMCExpectedCalibrationError',
     'InContextLearningMultipleChoiceAccuracy',
-    'InContextLearningQAAccuracy',
+    'InContextLearningGenerationExactMatchAccuracy',
     'InContextLearningCodeEvalAccuracy',
     'LanguageCrossEntropy',
     'LanguagePerplexity',
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index b13d896ff6..275a142131 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -9,14 +9,12 @@
 from typing import TYPE_CHECKING, Any, Dict, Mapping
 
 # required for loading a python model into composer
-from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
 from composer.models.huggingface import peft_installed
 from composer.utils import dist
 from omegaconf import DictConfig
 from transformers import (AutoConfig, AutoModelForCausalLM, PreTrainedModel,
                           PreTrainedTokenizerBase)
 
-
 from llmfoundry.metrics import (DEFAULT_CAUSAL_LM_EVAL_METRICS,
                                 DEFAULT_CAUSAL_LM_TRAIN_METRICS)
 from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py
index 8172003ee2..91f6fb2600 100644
--- a/llmfoundry/models/inference_api_wrapper/interface.py
+++ b/llmfoundry/models/inference_api_wrapper/interface.py
@@ -5,14 +5,12 @@
 
 import torch
 from composer.core.types import Batch
-from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
-from composer.metrics import InContextLearningMetric
 from composer.models import ComposerModel
 from omegaconf import DictConfig
 from torchmetrics import Metric
 from transformers import AutoTokenizer
 
-
+from llmfoundry.eval.metrics import InContextLearningMetric
 from llmfoundry.metrics import DEFAULT_CAUSAL_LM_EVAL_METRICS
 
 
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index cbf2cf38f6..e0a666f62c 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -16,15 +16,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from composer.metrics.nlp import LanguageCrossEntropy, LanguagePerplexity
-from composer.models import HuggingFaceModel
-from composer.utils import dist
-
-from llmfoundry.eval.metrics import (
-    InContextLearningCodeEvalAccuracy,
-    InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
-    InContextLearningMultipleChoiceAccuracy)
-from llmfoundry.metrics import TokenAccuracy
 from composer.models import HuggingFaceModel
 from composer.utils import dist
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 3eae18454e..39afd608dd 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -12,14 +12,6 @@
 
 import torch
 from composer.core import Algorithm, Callback, Evaluator
-from composer.loggers import (InMemoryLogger, LoggerDestination, MLFlowLogger,
-                              TensorboardLogger, WandBLogger)
-from composer.optim import DecoupledAdamW
-from composer.optim.scheduler import (ComposerScheduler,
-                                      ConstantWithWarmupScheduler,
-                                      CosineAnnealingWithWarmupScheduler,
-                                      LinearWithWarmupScheduler)
-
 from composer.loggers import LoggerDestination
 from composer.models import ComposerModel
 from composer.optim.scheduler import ComposerScheduler
@@ -33,11 +25,8 @@
 from llmfoundry import registry
 from llmfoundry.callbacks import EvalGauntlet
 from llmfoundry.data.dataloader import build_dataloader
-from llmfoundry.eval.datasets import get_icl_task_dataloader
-from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion,
-                              DecoupledLionW)
-from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler
-
+from llmfoundry.eval.datasets.in_context_learning_evaluation import \
+    get_icl_task_dataloader
 from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
 from llmfoundry.utils.registry_utils import construct_from_registry
 
@@ -506,10 +495,8 @@ def _validate_cfg(icl_cfg: DictConfig):
                 icl_cfg.metric_names = [
                     'InContextLearningMultipleChoiceAccuracy'
                 ]
-            elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering':
-                icl_cfg.metric_names = [
-                    'InContextLearningGenerationExactMatchAccuracy'
-                ]
+            elif icl_cfg.icl_task_type == 'question_answering':
+                icl_cfg.metric_names = ['InContextLearningQAAccuracy']
             elif icl_cfg.icl_task_type == 'code_evaluation':
                 icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
             else:
diff --git a/llmfoundry/utils/huggingface_hub_utils.py b/llmfoundry/utils/huggingface_hub_utils.py
index 0bdec97dea..3903a9bed3 100644
--- a/llmfoundry/utils/huggingface_hub_utils.py
+++ b/llmfoundry/utils/huggingface_hub_utils.py
@@ -132,8 +132,8 @@ def edit_files_for_hf_compatibility(
     flatten_imports_prefix: Sequence[str] = ('llmfoundry',),
     remove_imports_prefix: Sequence[str] = ('composer', 'omegaconf',
                                             'llmfoundry.metrics',
-                                            'llmfoundry.eval'),
-                                            'llmfoundry.utils.builders'),
+                                            'llmfoundry.eval',
+                                            'llmfoundry.utils.builders')
 ) -> None:
     """Edit files to be compatible with Hugging Face Hub.
 
diff --git a/scripts/train/train.py b/scripts/train/train.py
index 0e9190ac52..92d595ed7e 100644
--- a/scripts/train/train.py
+++ b/scripts/train/train.py
@@ -12,9 +12,6 @@
 import torch
 from composer import Trainer
 from composer.core.callback import Callback
-from composer.loggers import MosaicMLLogger
-from composer.loggers.mosaicml_logger import (MOSAICML_ACCESS_TOKEN_ENV_VAR,
-                                              MOSAICML_PLATFORM_ENV_VAR)
 from composer.profiler import (JSONTraceHandler, Profiler, TraceHandler,
                                cyclic_schedule)
 from composer.utils import dist, get_device, reproducibility
@@ -23,10 +20,6 @@
 from rich.traceback import install
 
 from llmfoundry.eval.metrics.nlp import InContextLearningMetric
-
-install()
-
-from transformers import PreTrainedTokenizerBase
 from llmfoundry.utils import (find_mosaicml_logger, log_train_analytics,
                               maybe_create_mosaicml_logger)
 
@@ -531,6 +524,7 @@ def main(cfg: DictConfig) -> Trainer:
     # Optimizer
     optimizer_name: str = optimizer_config.pop('name')
     optimizer = build_optimizer(model, optimizer_name, optimizer_config)
+
     # Now add the eval metrics
     try:
         if eval_loader_config is not None and not use_async_eval:

From 343e1158f60e3055337a27e560c7bc802b68ec84 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 1 Apr 2024 13:56:47 -0400
Subject: [PATCH 42/59] fix builder

---
 llmfoundry/utils/builders.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 39afd608dd..c937596d05 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -495,8 +495,12 @@ def _validate_cfg(icl_cfg: DictConfig):
                 icl_cfg.metric_names = [
                     'InContextLearningMultipleChoiceAccuracy'
                 ]
-            elif icl_cfg.icl_task_type == 'question_answering':
-                icl_cfg.metric_names = ['InContextLearningQAAccuracy']
+            elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering':
+                if icl_cfg.icl_task_type == 'question_answering':
+                    log.warning(
+                        ""
+                    )
+                icl_cfg.metric_names = ['InContextLearningGenerationExactMatchAccuracy']
             elif icl_cfg.icl_task_type == 'code_evaluation':
                 icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
             else:

From bf6231edbb75eb9f33ff34b5fb3fd5cd8a50a92f Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 1 Apr 2024 14:08:00 -0400
Subject: [PATCH 43/59] add deprecation warning

---
 .../eval/datasets/in_context_learning_evaluation.py |  9 ++++++---
 llmfoundry/utils/builders.py                        | 13 +++++++++----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index f4bd0c62ad..1398f61ae0 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -12,6 +12,7 @@
 import logging
 import os
 import random
+import warnings
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 import torch
@@ -30,6 +31,7 @@
                                             make_padded_input, strip_data,
                                             tokenizer_needs_prefix_space,
                                             trim_context)
+from llmfoundry.utils import VersionedDeprecationWarning
 
 log = logging.getLogger(__name__)
 
@@ -1474,9 +1476,10 @@ def build_icl_dataloader(
         effective_batchsize = batch_size
     elif icl_task_type == 'generation_task_with_answers' or icl_task_type == 'question_answering':
         if icl_task_type == 'question_answering':
-            log.warning(
-                f'ICL task type `question_answering` has been deprecated, please use `generation_task_with_answers`.'
-            )
+            warnings.warn(
+                VersionedDeprecationWarning(
+                    "ICL task type 'question_answering' is now deprecated. Use identifier 'generation_task_with_answers'",
+                    'v0.7.0'))
         dataset = InContextLearningGenerationTaskWithAnswersDataset(
             dataset_uri=dataset_uri,
             tokenizer=tokenizer,
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index c937596d05..548a0b1e9b 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -6,6 +6,7 @@
 import logging
 import os
 import re
+import warnings
 from collections import OrderedDict
 from typing import (Any, ContextManager, Dict, Iterable, List, Optional, Tuple,
                     Union)
@@ -28,6 +29,7 @@
 from llmfoundry.eval.datasets.in_context_learning_evaluation import \
     get_icl_task_dataloader
 from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
+from llmfoundry.utils import VersionedDeprecationWarning
 from llmfoundry.utils.registry_utils import construct_from_registry
 
 log = logging.getLogger(__name__)
@@ -497,10 +499,13 @@ def _validate_cfg(icl_cfg: DictConfig):
                 ]
             elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering':
                 if icl_cfg.icl_task_type == 'question_answering':
-                    log.warning(
-                        ""
-                    )
-                icl_cfg.metric_names = ['InContextLearningGenerationExactMatchAccuracy']
+                    warnings.warn(
+                        VersionedDeprecationWarning(
+                            "ICL task type 'question_answering' is now deprecated. Use identifier 'generation_task_with_answers'",
+                            'v0.7.0'))
+                icl_cfg.metric_names = [
+                    'InContextLearningGenerationExactMatchAccuracy'
+                ]
             elif icl_cfg.icl_task_type == 'code_evaluation':
                 icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
             else:

From 501d4ccd1c43c78ed1a24f23cca3d3e36bae7c18 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 1 Apr 2024 14:22:08 -0400
Subject: [PATCH 44/59] add deprecation warning

---
 llmfoundry/eval/datasets/in_context_learning_evaluation.py | 2 +-
 llmfoundry/utils/builders.py                               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 1398f61ae0..91c832ea72 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -31,7 +31,7 @@
                                             make_padded_input, strip_data,
                                             tokenizer_needs_prefix_space,
                                             trim_context)
-from llmfoundry.utils import VersionedDeprecationWarning
+from llmfoundry.utils.warnings import VersionedDeprecationWarning
 
 log = logging.getLogger(__name__)
 
diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 548a0b1e9b..9057355adb 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -29,7 +29,7 @@
 from llmfoundry.eval.datasets.in_context_learning_evaluation import \
     get_icl_task_dataloader
 from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
-from llmfoundry.utils import VersionedDeprecationWarning
+from llmfoundry.utils.warnings import VersionedDeprecationWarning
 from llmfoundry.utils.registry_utils import construct_from_registry
 
 log = logging.getLogger(__name__)

From 414467a2830f786576626ad99230dcc438dab8c8 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 1 Apr 2024 14:25:08 -0400
Subject: [PATCH 45/59] merge

---
 llmfoundry/utils/builders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 9057355adb..3fb75979e1 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -29,8 +29,8 @@
 from llmfoundry.eval.datasets.in_context_learning_evaluation import \
     get_icl_task_dataloader
 from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
-from llmfoundry.utils.warnings import VersionedDeprecationWarning
 from llmfoundry.utils.registry_utils import construct_from_registry
+from llmfoundry.utils.warnings import VersionedDeprecationWarning
 
 log = logging.getLogger(__name__)
 

From 65fbbeda0e1c58014feafb66a3af82a118cf74cf Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Mon, 1 Apr 2024 15:30:41 -0400
Subject: [PATCH 46/59] merge

---
 .../eval/datasets/in_context_learning_evaluation.py       | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 91c832ea72..faf84a7cd7 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -576,10 +576,8 @@ def __init__(self,
                 'pad_token_id': self.pad_tok_id,
                 'use_cache': True,
                 'eos_token_id': self.tokenizer.eos_token_id,
+                'max_new_tokens': max(self.max_answer_length, 1)
             },
-            'generation_length': max(
-                self.max_answer_length,
-                1),  # TODO: deprecate with next composer udpate
         }
         self.batch_mapping = {
             'input_ids': self.context_key,
@@ -1286,6 +1284,7 @@ def __init__(
                 'temperature': 0.2,  # good default for code
                 'use_cache': True,
                 'eos_token_id': self.tokenizer.eos_token_id,
+                'max_new_tokens': max(generation_length, 1)
             },
             'sample_id': [],
             'pass_at_k':
@@ -1294,8 +1293,7 @@ def __init__(
                 generations_per_sample,
             'dataset_size':
                 dataset_size,
-            'generation_length':  # TODO: deprecate with next composer release
-                max(generation_length, 1)
+            
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])

From 5696f095fc285d979d12ff1306102673f2c75ee4 Mon Sep 17 00:00:00 2001
From: Max Marion <max.marion@databricks.com>
Date: Mon, 1 Apr 2024 20:39:13 +0000
Subject: [PATCH 47/59] add logging necessities to nlp.py

---
 llmfoundry/eval/metrics/nlp.py | 118 +++++++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 5 deletions(-)

diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index f1fe8d176e..55007682f5 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -1,17 +1,16 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
 """A collection of common torchmetrics for NLP tasks."""
 
+import copy
+import functools
 import logging
 import os
 import re
 import string
 import warnings
-from typing import Any, Dict, List
+from typing import Callable, Any, Dict, List
 
 import numpy as np
 import torch
@@ -41,6 +40,38 @@ class InContextLearningMetric(Metric):
     def __init__(self, *args, **kwargs):  # pyright: ignore
         super().__init__(*args, **kwargs)
         self.needs_batch = True
+    
+    def _wrap_update(self, update: Callable) -> Callable:
+        """Overwrite default _wrap_update to return result of update().
+
+        Torch metrics wraps update with following wrapped_func but explicitly does not return the value.
+        In general, torchmetrics update() does not return a value, but we want to in order to pass it on
+        to state.metric_outputs.
+        """
+
+        @functools.wraps(update)
+        def wrapped_func(*args: Any, **kwargs: Any) -> None:
+            self._computed = None
+            self._update_count += 1
+            with torch.set_grad_enabled(self._enable_grad):
+                try:
+                    update_result = update(*args, **kwargs)
+                except RuntimeError as err:
+                    if 'Expected all tensors to be on' in str(err):
+                        raise RuntimeError(
+                            'Encountered different devices in metric calculation (see stacktrace for details).'
+                            ' This could be due to the metric class not being on the same device as input.'
+                            f' Instead of `metric={self.__class__.__name__}(...)` try to do'
+                            f' `metric={self.__class__.__name__}(...).to(device)` where'
+                            ' device corresponds to the device of the input.',
+                        ) from err
+                    raise err
+
+            if self.compute_on_cpu:
+                self._move_list_states_to_cpu()
+            return update_result
+
+        return wrapped_func
 
     def update(
         self,
@@ -97,6 +128,12 @@ def __init__(self, dist_sync_on_step: bool = False):
                        default=torch.tensor(0.),
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.metric_result_dict = {
+            'cleaned_output': [],
+            'original_label': [],
+            'cleaned_label': [],
+            'result': [],
+        }
 
     def normalize_answer(self, answer: str):
         """Lower text and remove punctuation, articles and extra whitespace.
@@ -134,6 +171,7 @@ def update(
         cot_delimiter = batch.get('cot_delimiter', '')
         do_normalization = batch.get('do_normalization', True)
         stopping_criteria = batch.get('stopping_criteria', None)
+        metric_result_dict = copy.deepcopy(self.metric_result_dict)
         for sample_output, sample_labels in zip(outputs, labels):
             final_answer = sample_output
 
@@ -155,13 +193,23 @@ def update(
                 cleaned_sample_labels = {
                     sample_label.strip() for sample_label in sample_labels
                 }
+            metric_result_dict['original_label'].append(sample_labels)
+            metric_result_dict['cleaned_output'].append(cleaned_final_answer)
+            metric_result_dict['cleaned_label'].append(cleaned_sample_labels)
 
             if any(
                     cleaned_final_answer.startswith(label)
                     for label in cleaned_sample_labels):
                 self.correct += torch.tensor(1.0)
+                metric_result_dict['result'].append(1)
+            else:
+                metric_result_dict['result'].append(0)
+
             self.total += torch.tensor(1.0)
 
+            return metric_result_dict
+
+
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
@@ -199,9 +247,12 @@ def __init__(self, dist_sync_on_step: bool = False):
                        default=torch.tensor(0.),
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
+        self.metric_result_dict = {'context': [], 'label': [], 'output': [], 'result': []}
+
 
     def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
+        metric_result_dict = copy.deepcopy(self.metric_result_dict)
         for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
             cont_tok_pred = outputs[batch_idx].index_select(dim=0,
                                                             index=cont_idx -
@@ -209,9 +260,18 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
             cont_tok_targ = labels[batch_idx].index_select(dim=0,
                                                            index=cont_idx - 1)
 
-            self.correct += (cont_tok_pred == cont_tok_targ).all().int()
+            metric_result_dict['context'].append(batch['input_ids'][batch_idx][:cont_idx[0]])
+            metric_result_dict['label'].append(cont_tok_targ)
+            metric_result_dict['output'].append(cont_tok_pred)
+
+            correct = (cont_tok_pred == cont_tok_targ).all().int()
+            self.correct += correct
+            metric_result_dict['result'].append(int(correct))
+
             self.total += torch.tensor(1.0)
 
+        return metric_result_dict
+
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
@@ -247,6 +307,15 @@ def __init__(self, dist_sync_on_step: bool = False):
                        default=torch.tensor(0.0),
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum')
+        self.metric_result_dict = {
+            'context': [],
+            'correct_choice': [],
+            'correct_choice_idx': [],
+            'selected_choice': [],
+            'selected_choice_idx': [],
+            'all_choices': [],
+            'result': [],
+        }
 
     def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
@@ -263,6 +332,7 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
             perplexity = torch.exp(cross_entropy)
             perplexities.append(perplexity)
 
+        metric_result_dict = copy.deepcopy(self.metric_result_dict)
         for (start, end), gold_idx in zip(batch['choice_groupings'],
                                           batch['gold_indices']):
             subset = perplexities[start:end]
@@ -270,8 +340,36 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
             if idx_min == gold_idx:
                 self.correct += torch.tensor(1.0)
+                metric_result_dict['result'].append(1)
+            else:
+                metric_result_dict['result'].append(0)
+
+            question = batch['input_ids'][start][:batch['continuation_indices'][start][0]]
+
+            correct_choice = batch['input_ids'][start:end][gold_idx][batch['continuation_indices'][start:end][gold_idx][
+                0]:batch['continuation_indices'][start:end][gold_idx][-1] + 1]
+            selected_choice = batch['input_ids'][start:end][idx_min][batch['continuation_indices'][start:end][idx_min][
+                0]:batch['continuation_indices'][start:end][idx_min][-1] + 1]
+            metric_result_dict['context'].append(question)
+            metric_result_dict['correct_choice'].append(correct_choice)
+            metric_result_dict['correct_choice_idx'].append(gold_idx)
+            metric_result_dict['selected_choice'].append(selected_choice)
+            metric_result_dict['selected_choice_idx'].append(idx_min)
+            all_choices = batch['input_ids'][start:end]
+            # Unpads the choices. Necessary in case different choices have different token lengths.
+            if 'attention_mask' in batch:
+                all_choices_list = [choice[batch['attention_mask'][i]] for i, choice in enumerate(all_choices)]
+                metric_result_dict['all_choices'].append(all_choices_list)
+
             self.total += torch.tensor(1.0)
 
+        # Don't return all_choices if we didn't fill it up (i.e. didn't use causal lms)
+        if metric_result_dict['all_choices'] == []:
+            metric_result_dict.pop('all_choices')
+
+        return metric_result_dict
+
+
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
@@ -312,6 +410,8 @@ def __init__(self, dist_sync_on_step: bool = False):
         self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
         if self.eval_device is not None:
             self.eval_device = self.eval_device.upper()
+        self.metric_result_dict = {'context': [], 'output': [], 'result': [], 'sample_id': []}
+        
 
     def get_client(self) -> EvalClient:
         """Returns a client for the appropriate remote platform."""
@@ -402,6 +502,7 @@ def update(self, batch: Dict[str, Any], outputs: List[str],
         del labels  # never used
         client = self.get_client()
 
+        metric_result_dict = copy.deepcopy(self.metric_result_dict)
         for sample_id, code_gen, sample_prompt, test_inputs, test_outputs, entry_point, language in zip(
                 batch['sample_id'], outputs, batch['prompts'],
                 batch['test_inputs'], batch['test_outputs'],
@@ -409,11 +510,14 @@ def update(self, batch: Dict[str, Any], outputs: List[str],
 
             idx = sample_id
             self.total[idx] += 1.0
+            metric_result_dict['sample_id'].append(sample_id)
 
             code_gen = re.split(
                 r'\n[A-Za-z0-9#`]',
                 code_gen)[0]  # remove everything after function ends
             final_code = sample_prompt + code_gen  # combine prompt with the code generation
+            metric_result_dict['context'].append(sample_prompt)
+            metric_result_dict['output'].append(code_gen)
 
             test_results = []
             for test_input, test_output in zip(test_inputs, test_outputs):
@@ -430,8 +534,12 @@ def update(self, batch: Dict[str, Any], outputs: List[str],
 
             if all(test_results):
                 self.correct[idx] += 1.0
+                metric_result_dict['result'].append(1)
+            else:
+                metric_result_dict['result'].append(0)
 
         client.close()  # pyright: ignore [reportOptionalMemberAccess]
+        return metric_result_dict
 
     def compute(self):
         assert isinstance(self.correct, Tensor)

From 91a2b18a7179e3c0763f2dcb386586a288f7a8c1 Mon Sep 17 00:00:00 2001
From: Max Marion <max.marion@databricks.com>
Date: Mon, 1 Apr 2024 20:42:17 +0000
Subject: [PATCH 48/59] add attention_mask test update

---
 tests/eval/test_nlp_metrics.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 24a2078795..911f40c861 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -164,6 +164,7 @@ def test_in_context_learning_mc_accuracy(
     ]
     inputs = torch.tensor(
         [input + [pad] * (2048 - len(input)) for input in inputs])
+    attention_mask = ~(inputs == pad)
 
     cont_idxs = []
     for context, continuation in zip(contexts, continuations):
@@ -175,6 +176,7 @@ def test_in_context_learning_mc_accuracy(
         'continuation_indices': cont_idxs,
         'labels': inputs.roll(-1),
         'input_ids': inputs,
+        'attention_mask': attention_mask,
         'gold_indices': gold_indices,
         'choice_groupings': choice_groupings
     }

From 79877ee650843d54cf7b22e0e76f284814b8d1c1 Mon Sep 17 00:00:00 2001
From: Max Marion <max.marion@databricks.com>
Date: Mon, 1 Apr 2024 21:17:49 +0000
Subject: [PATCH 49/59] fix generation_length in tests

---
 .../in_context_learning_evaluation.py         | 25 +++-----
 llmfoundry/eval/metrics/nlp.py                | 58 ++++++++++++-------
 .../eval/test_in_context_learning_datasets.py | 29 +++++-----
 3 files changed, 59 insertions(+), 53 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index faf84a7cd7..ecb47ca504 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -552,7 +552,7 @@ def __init__(self,
         self.max_answer_length = 0
         static_keys = [
             'mode', 'cot_delimiter', 'generation_kwargs', 'do_normalization',
-            'generation_length', 'stopping_criteria'
+            'stopping_criteria'
         ]
         tensor_keys = ['input_ids', 'attention_mask']
         list_keys = ['labels']
@@ -1223,7 +1223,6 @@ def __init__(
             'pass_at_k',
             'generation_kwargs',
             'generations_per_sample',
-            'generation_length',
             'dataset_size',
         ]
         list_keys = [
@@ -1260,14 +1259,13 @@ def __init__(
         self.dataset = self.repeat_dataset(self.dataset, generations_per_sample)
 
         if self.max_answer_length < self.max_seq_len - self.max_prompt_length:
-            generation_length = self.max_answer_length
+            max_new_tokens = self.max_answer_length
         else:
-            generation_length = self.max_seq_len - self.max_prompt_length
+            max_new_tokens = self.max_seq_len - self.max_prompt_length
 
         self.base_batch = {
             'input_ids': [],
-            'mode':
-                'generate',
+            'mode': 'generate',
             'labels': [],
             'prompts': [],
             'tests': [],
@@ -1275,8 +1273,7 @@ def __init__(
             'test_inputs': [],
             'test_outputs': [],
             'languages': [],
-            'pass_at_k':
-                pass_at_k,
+            'pass_at_k': pass_at_k,
             'generation_kwargs': {
                 'pad_token_id': self.pad_tok_id,
                 'num_beams': 1,  # single beam
@@ -1284,16 +1281,12 @@ def __init__(
                 'temperature': 0.2,  # good default for code
                 'use_cache': True,
                 'eos_token_id': self.tokenizer.eos_token_id,
-                'max_new_tokens': max(generation_length, 1)
+                'max_new_tokens': max(max_new_tokens, 1)
             },
             'sample_id': [],
-            'pass_at_k':
-                list(pass_at_k),
-            'generations_per_sample':
-                generations_per_sample,
-            'dataset_size':
-                dataset_size,
-            
+            'pass_at_k': list(pass_at_k),
+            'generations_per_sample': generations_per_sample,
+            'dataset_size': dataset_size,
         }
         if 'generation_kwargs' in kwargs:
             self.update_generation_kwargs(kwargs['generation_kwargs'])
diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index 55007682f5..168d20f416 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -10,7 +10,7 @@
 import re
 import string
 import warnings
-from typing import Callable, Any, Dict, List
+from typing import Any, Callable, Dict, List
 
 import numpy as np
 import torch
@@ -40,13 +40,14 @@ class InContextLearningMetric(Metric):
     def __init__(self, *args, **kwargs):  # pyright: ignore
         super().__init__(*args, **kwargs)
         self.needs_batch = True
-    
+
     def _wrap_update(self, update: Callable) -> Callable:
         """Overwrite default _wrap_update to return result of update().
 
-        Torch metrics wraps update with following wrapped_func but explicitly does not return the value.
-        In general, torchmetrics update() does not return a value, but we want to in order to pass it on
-        to state.metric_outputs.
+        Torch metrics wraps update with following wrapped_func but explicitly
+        does not return the value. In general, torchmetrics update() does not
+        return a value, but we want to in order to pass it on to
+        state.metric_outputs.
         """
 
         @functools.wraps(update)
@@ -59,10 +60,10 @@ def wrapped_func(*args: Any, **kwargs: Any) -> None:
                 except RuntimeError as err:
                     if 'Expected all tensors to be on' in str(err):
                         raise RuntimeError(
-                            'Encountered different devices in metric calculation (see stacktrace for details).'
-                            ' This could be due to the metric class not being on the same device as input.'
-                            f' Instead of `metric={self.__class__.__name__}(...)` try to do'
-                            f' `metric={self.__class__.__name__}(...).to(device)` where'
+                            'Encountered different devices in metric calculation (see stacktrace for details).' + \
+                            ' This could be due to the metric class not being on the same device as input.' + \
+                            f' Instead of `metric={self.__class__.__name__}(...)` try to do' + \
+                            f' `metric={self.__class__.__name__}(...).to(device)` where' + \
                             ' device corresponds to the device of the input.',
                         ) from err
                     raise err
@@ -209,7 +210,6 @@ def update(
 
             return metric_result_dict
 
-
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
@@ -247,8 +247,12 @@ def __init__(self, dist_sync_on_step: bool = False):
                        default=torch.tensor(0.),
                        dist_reduce_fx='sum')
         self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum')
-        self.metric_result_dict = {'context': [], 'label': [], 'output': [], 'result': []}
-
+        self.metric_result_dict = {
+            'context': [],
+            'label': [],
+            'output': [],
+            'result': []
+        }
 
     def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
@@ -260,7 +264,8 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
             cont_tok_targ = labels[batch_idx].index_select(dim=0,
                                                            index=cont_idx - 1)
 
-            metric_result_dict['context'].append(batch['input_ids'][batch_idx][:cont_idx[0]])
+            metric_result_dict['context'].append(
+                batch['input_ids'][batch_idx][:cont_idx[0]])
             metric_result_dict['label'].append(cont_tok_targ)
             metric_result_dict['output'].append(cont_tok_pred)
 
@@ -344,12 +349,15 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
             else:
                 metric_result_dict['result'].append(0)
 
-            question = batch['input_ids'][start][:batch['continuation_indices'][start][0]]
+            question = batch['input_ids'][
+                start][:batch['continuation_indices'][start][0]]
 
-            correct_choice = batch['input_ids'][start:end][gold_idx][batch['continuation_indices'][start:end][gold_idx][
-                0]:batch['continuation_indices'][start:end][gold_idx][-1] + 1]
-            selected_choice = batch['input_ids'][start:end][idx_min][batch['continuation_indices'][start:end][idx_min][
-                0]:batch['continuation_indices'][start:end][idx_min][-1] + 1]
+            correct_choice = batch['input_ids'][start:end][gold_idx][
+                batch['continuation_indices'][start:end][gold_idx][0]:
+                batch['continuation_indices'][start:end][gold_idx][-1] + 1]
+            selected_choice = batch['input_ids'][start:end][idx_min][
+                batch['continuation_indices'][start:end][idx_min][0]:
+                batch['continuation_indices'][start:end][idx_min][-1] + 1]
             metric_result_dict['context'].append(question)
             metric_result_dict['correct_choice'].append(correct_choice)
             metric_result_dict['correct_choice_idx'].append(gold_idx)
@@ -358,7 +366,10 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
             all_choices = batch['input_ids'][start:end]
             # Unpads the choices. Necessary in case different choices have different token lengths.
             if 'attention_mask' in batch:
-                all_choices_list = [choice[batch['attention_mask'][i]] for i, choice in enumerate(all_choices)]
+                all_choices_list = [
+                    choice[batch['attention_mask'][i]]
+                    for i, choice in enumerate(all_choices)
+                ]
                 metric_result_dict['all_choices'].append(all_choices_list)
 
             self.total += torch.tensor(1.0)
@@ -369,7 +380,6 @@ def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
 
         return metric_result_dict
 
-
     def compute(self):
         assert isinstance(self.correct, Tensor)
         assert isinstance(self.total, Tensor)
@@ -410,8 +420,12 @@ def __init__(self, dist_sync_on_step: bool = False):
         self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None)
         if self.eval_device is not None:
             self.eval_device = self.eval_device.upper()
-        self.metric_result_dict = {'context': [], 'output': [], 'result': [], 'sample_id': []}
-        
+        self.metric_result_dict = {
+            'context': [],
+            'output': [],
+            'result': [],
+            'sample_id': []
+        }
 
     def get_client(self) -> EvalClient:
         """Returns a client for the appropriate remote platform."""
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 2dce2beef4..82ef4ec259 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -345,7 +345,7 @@ def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path: Path):
         continuation_delimiter=': ',
         destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'),
         generation_kwargs=None)
-    assert len(dl.base_batch['generation_kwargs']) == 3
+    assert len(dl.base_batch['generation_kwargs']) == 4
 
 
 def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
@@ -371,7 +371,7 @@ def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path: Path):
         generation_kwargs={'temperature': 0.9})
     assert 'generation_kwargs' in dl.base_batch
     assert dl.base_batch['generation_kwargs']['temperature'] == 0.9
-    assert len(dl.base_batch['generation_kwargs']) == 4
+    assert len(dl.base_batch['generation_kwargs']) == 5
 
 
 @pytest.mark.filterwarnings(
@@ -1434,8 +1434,8 @@ def test_qa_split_batch(tiny_opt_tokenizer: transformers.AutoTokenizer,
     assert len(split2['labels']) == 1
     assert all(isinstance(v, list) for v in split1['labels'] + split2['labels'])
 
-    assert isinstance(split1['generation_length'], int)
-    assert isinstance(split2['generation_length'], int)
+    assert isinstance(split1['generation_kwargs']['max_new_tokens'], int)
+    assert isinstance(split2['generation_kwargs']['max_new_tokens'], int)
 
     assert isinstance(split1['generation_kwargs'], dict)
     assert isinstance(split2['generation_kwargs'], dict)
@@ -1512,7 +1512,7 @@ def test_qa_task_dataloader(dataset_uri: str,
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
 
-    assert batch['generation_length'] == maximum_answer_length
+    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
@@ -1568,7 +1568,7 @@ def test_qa_task_with_cot_dataloader(
                                                     maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_length'] == maximum_answer_length
+    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])
     assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch)
@@ -1697,12 +1697,11 @@ def test_code_eval_split_batch(dataset_uri: str, tmp_path: Path):
             assert len(batch[field]) == size
             assert all(isinstance(val, type_) for val in batch[field])
 
-    static_keys = {
-        'pass_at_k': (int, list),
-        'generation_length': int,
-        'generation_kwargs': dict
-    }
+    static_keys = {'pass_at_k': (int, list), 'generation_kwargs': dict}
     for batch in batches:
+        assert 'generation_kwargs' in batch
+        assert 'max_new_tokens' in batch['generation_kwargs']
+        assert isinstance(batch['generation_kwargs']['max_new_tokens'], int)
         for field, type_ in static_keys.items():
             assert isinstance(batch[field], type_)
 
@@ -1756,7 +1755,7 @@ def test_code_eval_sentpiece_dataloader(
         assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
-        assert batch['generation_length'] == 129
+        assert batch['generation_kwargs']['max_new_tokens'] == 129
         has_left_padding.extend(
             [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left
@@ -1831,7 +1830,7 @@ def test_code_eval_test_cases(dataset_uri: str, tmp_path: Path):
                                                     max_prompt_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_length'] == 129
+    assert batch['generation_kwargs']['max_new_tokens'] == 129
     assert any(item[0] != tokenizer.eos_token_id
                for item in batch['input_ids'])  # longest should be pushed left
 
@@ -1924,7 +1923,7 @@ def test_code_eval_task_dataloader(dataset_uri: str, tmp_path: Path,
         assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length)
         assert batch['mode'] == 'generate'
         # the maximum generation length from the small test data
-        assert batch['generation_length'] == 122
+        assert batch['generation_kwargs']['max_new_tokens'] == 122
         has_left_padding.extend(
             [item[0] == tokenizer.eos_token_id for item in batch['input_ids']])
     assert not all(has_left_padding)  # longest should be pushed left
@@ -2825,7 +2824,7 @@ def test_hf_dataloading_custom_parsing(
                                                     maximum_answer_length)
     assert batch['mode'] == 'generate'
     # the maximum generation length from the small test data
-    assert batch['generation_length'] == maximum_answer_length
+    assert batch['generation_kwargs']['max_new_tokens'] == maximum_answer_length
     assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids'])
 
     decoded_batch = tokenizer.batch_decode(batch['input_ids'])

From eac919a2cb54254701adc2974b80ba23c0a025ea Mon Sep 17 00:00:00 2001
From: Max Marion <max.marion@databricks.com>
Date: Mon, 1 Apr 2024 22:47:25 +0000
Subject: [PATCH 50/59] fix bug

---
 llmfoundry/eval/metrics/nlp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py
index 168d20f416..55922e28d2 100644
--- a/llmfoundry/eval/metrics/nlp.py
+++ b/llmfoundry/eval/metrics/nlp.py
@@ -208,7 +208,7 @@ def update(
 
             self.total += torch.tensor(1.0)
 
-            return metric_result_dict
+        return metric_result_dict
 
     def compute(self):
         assert isinstance(self.correct, Tensor)

From e10086f9769236c681032b036c2dea541c49767b Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Wed, 3 Apr 2024 11:45:32 -0400
Subject: [PATCH 51/59] restore yamls

---
 mcli/mcli-llama2-finetune.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index d39fe73ba2..4f4d994eb1 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -38,7 +38,7 @@ parameters:
     pretrained: true
     # Note: you must have set the HUGGING_FACE_HUB_TOKEN environment variable and have access to the llama2 models
     use_auth_token: true
-    attention_patch_type: triton
+    use_flash_attention_2: true
 
   # Tokenizer
   tokenizer:
@@ -62,7 +62,7 @@ parameters:
       # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
       # # to profile this run's optimal packing_ratio as it depends on GPU count,
       # # batch size, sequence length
-      # packing_ratio:
+      # packing_ratio: auto
     drop_last: true
     num_workers: 8
     pin_memory: false

From d5aebc8741b105194a7d6c09019d7e3625794d6c Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Wed, 10 Apr 2024 12:51:16 -0400
Subject: [PATCH 52/59] fix typos

---
 llmfoundry/eval/datasets/__init__.py              |  5 +----
 .../datasets/in_context_learning_evaluation.py    | 15 +++++++--------
 llmfoundry/eval/datasets/utils.py                 |  3 ---
 llmfoundry/eval/metrics/__init__.py               |  3 ---
 llmfoundry/models/hf/hf_causal_lm.py              |  1 -
 scripts/eval/README.md                            |  2 +-
 tests/eval/test_in_context_learning_datasets.py   |  3 ---
 tests/eval/test_nlp_metrics.py                    |  3 ---
 8 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py
index 8d792f2f99..e6a8b5222d 100644
--- a/llmfoundry/eval/datasets/__init__.py
+++ b/llmfoundry/eval/datasets/__init__.py
@@ -1,10 +1,7 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Natively supported datasets."""
+"""Natively supported in-context learning evaluation datasets."""
 
 from llmfoundry.eval.datasets.in_context_learning_evaluation import (
     InContextLearningCodeEvalDataset, InContextLearningDataset,
diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index ecb47ca504..c3f1a71159 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -1,10 +1,6 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-# This code is based on the implementation in https://github.com/EleutherAI/lm-evaluation-harness/blob/8c048e266a22a1c85ccbdb0c209ac712e4f39989/lm_eval/base.py#L221-L330
-
 from __future__ import annotations
 
 import copy
@@ -97,14 +93,17 @@ class InContextLearningDataset(Dataset):
         strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs,
             so unless whitespace should be preserved (for example in code), this should be set to True.
         padding_side (str): Side of the content and answer on which to apply padding. Can be either 'right' or 'left'.
+        tokenize_labels (bool): Whether or not the labels should be tokenized. Generally determined by which metric a dataset uses.
         padding_size (int): The final size of the tensor after padding. Defaults to max_sequence_length.
         base_batch (Dict): The base dictionary upon which a batch is created. See above for more details.
         base_mapping (Dict): A mapping of batch keys to dataset columns, used to create batches. See above for more details.
         hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF.
         hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}.
             Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset.
-        tokenize_labels (bool): Whether or not the labels should be tokenized. Generally determined by which metric a dataset uses.
         generation_kwargs (Dict): A dictionary containing keyword arguments to be passed along to the model's generate function.
+        static_keys (List): A list of the key values which will be broadcast across a batch (e.g. it is the same for each batch element).
+        list_keys (List): A list of the batch keys whose values are lists which will be split using list methods during calls to split_batch.
+        tensor_keys (List): A list of the batch keys whose values are tensors which will be split using tensor methods during calls to split_batch.
     """
 
     def __init__(
@@ -125,15 +124,15 @@ def __init__(
         strip_dataset: bool = True,
         padding_side: str = 'right',
         tokenize_labels: bool = True,
-        static_keys: Optional[List] = None,
-        list_keys: Optional[List] = None,
-        tensor_keys: Optional[List] = None,
         padding_size: Optional[int] = None,
         base_batch: Optional[Dict] = None,
         batch_mapping: Optional[Dict] = None,
         hf_loading_vars: Optional[Dict] = None,
         hf_parsing_map: Optional[Dict] = None,
         generation_kwargs: Optional[Dict] = None,
+        static_keys: Optional[List] = None,
+        list_keys: Optional[List] = None,
+        tensor_keys: Optional[List] = None,
     ):
         self.tokenizer = tokenizer
         self.prefix_space = tokenizer_needs_prefix_space(self.tokenizer)
diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py
index ac17e71774..7ea7f9fae2 100644
--- a/llmfoundry/eval/datasets/utils.py
+++ b/llmfoundry/eval/datasets/utils.py
@@ -1,9 +1,6 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
 """Utility and helper functions for datasets."""
 from __future__ import annotations
 
diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py
index 5401fb87d4..6e70e2ece3 100644
--- a/llmfoundry/eval/metrics/__init__.py
+++ b/llmfoundry/eval/metrics/__init__.py
@@ -1,9 +1,6 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
 """A collection of common torchmetrics."""
 
 from llmfoundry.eval.metrics.nlp import (
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index 275a142131..38ed7a7e70 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -8,7 +8,6 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Dict, Mapping
 
-# required for loading a python model into composer
 from composer.models.huggingface import peft_installed
 from composer.utils import dist
 from omegaconf import DictConfig
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index 3fcace6289..ba2e9e2c79 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -145,7 +145,7 @@ You can use the default `icl_tasks` and `eval_gauntlet` configs or specify your
 
 ICL evaluation measures a model’s ability to solve novel problems by being provided examples in-context without ever being specifically trained to answer such questions.
 
-We supports a number of standard ICL formats and allow users to upload their own datasets that correspond to these formats. All of our ICL task types are implemented in `llm-foundry/llmfoundry/eval/datasets/in_context_learning_evaluation.py` while all of our ICL
+We support a number of standard ICL formats and allow users to upload their own datasets that correspond to these formats. All of our ICL task types are implemented in `llm-foundry/llmfoundry/eval/datasets/in_context_learning_evaluation.py` while all of our ICL
 metrics are implemented in `llm-foundry/llmfoundry/eval/metrics/nlp.py`. You can see which metrics work with which task types in the `llmfoundry.utils.builders.build_icl_evaluators` helper function.
 
 This document explains the ICL formats compatible with [Composer](https://github.com/mosaicml/composer), summarizes how to add new datasets in those formats, and catalogs the datasets currently used by the research team to evaluate models.
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index 82ef4ec259..33a041aaea 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -1,9 +1,6 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
 import contextlib
 import os
 import random
diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py
index 911f40c861..344d642715 100644
--- a/tests/eval/test_nlp_metrics.py
+++ b/tests/eval/test_nlp_metrics.py
@@ -1,9 +1,6 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Copyright 2022 MosaicML Composer authors
-# SPDX-License-Identifier: Apache-2.0
-
 from typing import Any, List
 
 import pytest

From a5082b07f17a2c5fc7d287b6f3d453c2061a1559 Mon Sep 17 00:00:00 2001
From: Max Marion <max.marion@databricks.com>
Date: Wed, 10 Apr 2024 17:04:30 +0000
Subject: [PATCH 53/59] add deprecation warning for code

---
 llmfoundry/eval/datasets/in_context_learning_evaluation.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index ecb47ca504..8a1a549429 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -1182,7 +1182,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset):
       for more details):
 
         - pad_token_id: ID for padding token, derived automatically
-        - num_beams: How many beams to search for generations, set to 1
+        - num_beams: How many beams to search for generations, default set to 1
         - do_sample: Determines whether model is sampling or greedily decoding. Always set to True
         - use_cache: Whether or not to use past key values to speed up sampling. Always set to True
 
@@ -1492,6 +1492,10 @@ def build_icl_dataloader(
         )
         effective_batchsize = batch_size
     elif icl_task_type == 'code_evaluation':
+        warnings.warn(
+            VersionedDeprecationWarning(
+                "ICL task type 'code_evaluation' is deprecated and will no longer be supported. ",
+                'v0.7.0'))
         dataset = InContextLearningCodeEvalDataset(
             dataset_uri=dataset_uri,
             tokenizer=tokenizer,

From 642ad40a1dbed0bb17a100528191f2cb80be3e98 Mon Sep 17 00:00:00 2001
From: Max Marion <max.marion@databricks.com>
Date: Wed, 10 Apr 2024 22:32:38 +0000
Subject: [PATCH 54/59] pyright wip

---
 .../eval/datasets/in_context_learning_evaluation.py       | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index c582be798e..bd5c7dc30c 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -9,7 +9,7 @@
 import os
 import random
 import warnings
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
 
 import torch
 import transformers
@@ -478,8 +478,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
         return batch
 
-    def split_batch(self, batch: Any,
-                    microbatch_size: int) -> List[Dict[str, Any]]:
+    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence:
         """Handling for certain specialty columns that must be split into.
 
         batches in different formats.
@@ -906,8 +905,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
     def get_num_samples_in_batch(self, batch: Dict[str, torch.Tensor]) -> int:
         return batch['input_ids'].shape[0] // self.num_choices
 
-    def split_batch(self, batch: Any,
-                    microbatch_size: int) -> List[Dict[str, Any]]:
+    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence:
         """Split batch while ensuring all continuations are in the same.
 
         microbatch.

From de321b215c29719e55a0ca38920812eb90309548 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 11 Apr 2024 19:19:00 -0400
Subject: [PATCH 55/59] fix pyright

---
 llmfoundry/eval/datasets/in_context_learning_evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index bd5c7dc30c..55b21a1fa0 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -478,7 +478,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
         return batch
 
-    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence:
+    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence[Any]:
         """Handling for certain specialty columns that must be split into.
 
         batches in different formats.
@@ -905,7 +905,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
     def get_num_samples_in_batch(self, batch: Dict[str, torch.Tensor]) -> int:
         return batch['input_ids'].shape[0] // self.num_choices
 
-    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence:
+    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence[Any]:
         """Split batch while ensuring all continuations are in the same.
 
         microbatch.

From 019c58a44c44bb2abb4751c7f4cdb9ac00702280 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 11 Apr 2024 19:32:09 -0400
Subject: [PATCH 56/59] fix pyright error again

---
 .../eval/datasets/in_context_learning_evaluation.py       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 55b21a1fa0..30502d2d92 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -478,14 +478,14 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
         return batch
 
-    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence[Any]:
+    def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequence[Any]:
         """Handling for certain specialty columns that must be split into.
 
         batches in different formats.
 
         Args:
             batch (Dict): Batch of data
-            microbatch_size (int): Size of microbatches
+            microbatch_size (int | float): Size of microbatches
 
         Returns:
             List: List of chunked batches
@@ -905,7 +905,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
     def get_num_samples_in_batch(self, batch: Dict[str, torch.Tensor]) -> int:
         return batch['input_ids'].shape[0] // self.num_choices
 
-    def split_batch(self, batch: Any, microbatch_size: int) -> Sequence[Any]:
+    def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequence[Any]:
         """Split batch while ensuring all continuations are in the same.
 
         microbatch.
@@ -917,7 +917,7 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Sequence[Any]:
         microbatch_size and real attributes by microbatch_size * num_choices.
         Args:
             batch (Dict): Batch of data
-            microbatch_size (int): Size of microbatches
+            microbatch_size (int | float): Size of microbatches
 
         Returns:
             list: List of chunked batches

From 779f4908d8e659123098707af1bbd7ab9856cf62 Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 11 Apr 2024 19:39:17 -0400
Subject: [PATCH 57/59] fix pyright

---
 llmfoundry/eval/datasets/in_context_learning_evaluation.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index 30502d2d92..df5799df2b 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -493,6 +493,8 @@ def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequen
         # Don't split kwargs that don't change
         # Normally split torch tensors
         # List split lists of strings
+        if isinstance(microbatch_size, float):
+            raise ValueError('split_batch does not support floating point microbatch_size.')
         chunked = {}
         for k, v in batch.items():
             if k in self.static_keys:
@@ -922,6 +924,8 @@ def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequen
         Returns:
             list: List of chunked batches
         """
+        if isinstance(microbatch_size, float):
+            raise ValueError('split_batch does not support floating point microbatch_size.')
         chunked = {}
         for k, v in batch.items():
             if k in self.static_keys:

From 03f7e91e7d30e36334e365d019ce9204371016ef Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann <jeremy@mosaicml.com>
Date: Thu, 11 Apr 2024 19:50:32 -0400
Subject: [PATCH 58/59] fix pyright

---
 .../eval/datasets/in_context_learning_evaluation.py  | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
index df5799df2b..8f317f60b8 100644
--- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py
+++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -478,7 +478,8 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
         batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id)
         return batch
 
-    def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequence[Any]:
+    def split_batch(self, batch: Any,
+                    microbatch_size: Union[int, float]) -> Sequence[Any]:
         """Handling for certain specialty columns that must be split into.
 
         batches in different formats.
@@ -494,7 +495,8 @@ def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequen
         # Normally split torch tensors
         # List split lists of strings
         if isinstance(microbatch_size, float):
-            raise ValueError('split_batch does not support floating point microbatch_size.')
+            raise ValueError(
+                'split_batch does not support floating point microbatch_size.')
         chunked = {}
         for k, v in batch.items():
             if k in self.static_keys:
@@ -907,7 +909,8 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
     def get_num_samples_in_batch(self, batch: Dict[str, torch.Tensor]) -> int:
         return batch['input_ids'].shape[0] // self.num_choices
 
-    def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequence[Any]:
+    def split_batch(self, batch: Any,
+                    microbatch_size: Union[int, float]) -> Sequence[Any]:
         """Split batch while ensuring all continuations are in the same.
 
         microbatch.
@@ -925,7 +928,8 @@ def split_batch(self, batch: Any, microbatch_size: Union[int , float]) -> Sequen
             list: List of chunked batches
         """
         if isinstance(microbatch_size, float):
-            raise ValueError('split_batch does not support floating point microbatch_size.')
+            raise ValueError(
+                'split_batch does not support floating point microbatch_size.')
         chunked = {}
         for k, v in batch.items():
             if k in self.static_keys:

From 02308df80c0623ba86dce92d76822d82a3a7e4c3 Mon Sep 17 00:00:00 2001
From: Max Marion <max.marion@databricks.com>
Date: Fri, 12 Apr 2024 20:24:35 +0000
Subject: [PATCH 59/59] update version

---
 llmfoundry/utils/builders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index 04904962de..5d60cb0a1f 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -503,7 +503,7 @@ def _validate_cfg(icl_cfg: DictConfig):
                     warnings.warn(
                         VersionedDeprecationWarning(
                             "ICL task type 'question_answering' is now deprecated. Use identifier 'generation_task_with_answers'",
-                            'v0.7.0'))
+                            'v0.9.0'))
                 icl_cfg.metric_names = [
                     'InContextLearningGenerationExactMatchAccuracy'
                 ]