From e62a85cf835d15614df80981a2c2048d6e2b6ffd Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Fri, 26 Jan 2024 16:06:36 -0500 Subject: [PATCH 1/2] start --- .../in_context_learning_evaluation.py | 1109 +-------- composer/metrics/nlp.py | 459 +--- composer/models/huggingface.py | 7 +- .../test_in_context_learning_datasets.py | 1981 ----------------- tests/metrics/test_nlp_metrics.py | 245 +- 5 files changed, 8 insertions(+), 3793 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 4e0e30f1ff..25a3cfa5b1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -5,32 +5,22 @@ from __future__ import annotations import copy -import json -import os import random -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set import torch -from torch.utils.data import DataLoader, Dataset +from torch.utils.data import Dataset -from composer.core import DataSpec from composer.core.data_spec import _default_split_batch, _split_list -from composer.datasets.utils import stop_sequences_criteria from composer.utils import MissingConditionalImportError, dist, get_file if TYPE_CHECKING: import transformers from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] -# Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset -_MAX_ANSWER_BUFFER_LENGTH = 10 __all__ = [ - 'InContextLearningLMTaskDataset', - 'InContextLearningMultipleChoiceTaskDataset', - 'InContextLearningCodeEvalDataset', - 'InContextLearningQATaskDataset', - 'get_icl_task_dataloader', + 'InContextLearningDataset', ] @@ -641,1096 +631,3 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: batched_list = [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] return batched_list - - -class InContextLearningQATaskDataset(InContextLearningDataset): - """ - A dataset that constructs batches for in-context learning question answering evaluation. - QA tasks evaluate a model's ability to answer questions using a consistent format. - - The input format is expected to be a jsonl file with the following fields: - - context: The question - - answer: The preferred answer to the question - - aliases: A list of aliases for the answer - - See InContextLearningDataset for more details. - - Additional Args: - cot_delimiter (str): Delimiter to place between the chain of thought and continuations. 
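        For illustration only (this example is not taken from any shipped dataset), a single
        line of such a jsonl file might look like:

            {"context": "Who was president of the United States in 2012?", "answer": "Barack Obama", "aliases": ["Obama", "Barack H. Obama"]}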
- """ - - def __init__(self, - cot_delimiter: str = '', - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True, - *args, - **kwargs): - if kwargs['tokenizer'].eos_token_id is None: - raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`') - self.cot_delimiter = cot_delimiter - self.has_cot = False - self.max_answer_length = 0 - static_keys = [ - 'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria' - ] - tensor_keys = ['input_ids', 'attention_mask'] - list_keys = ['labels'] - super().__init__(padding_side='left', - tokenize_labels=False, - static_keys=static_keys, - list_keys=list_keys, - tensor_keys=tensor_keys, - *args, - **kwargs) - # NOTE: set these after init call because they take class vars - self.early_stopping_criteria = early_stopping_criteria - self.base_batch = { - 'input_ids': [], - 'mode': 'generate', - 'labels': [], - 'cot_delimiter': self.cot_delimiter, - 'generation_length': self.max_answer_length, - 'stopping_criteria': early_stopping_criteria, - 'do_normalization': do_normalization, - 'generation_kwargs': { - 'pad_token_id': self.pad_tok_id, - 'use_cache': True, - 'eos_token_id': self.tokenizer.eos_token_id, - } - } - self.batch_mapping = { - 'input_ids': self.context_key, - 'labels': 'aliases', - } - self.update_generation_kwargs(kwargs.get('generation_kwargs', {})) - - def read_dataset( - self, - dataset_uri: str, - destination_path: str, - hf_loading_vars: Dict, - hf_parsing_map: Dict, - ) -> 'HFDataset': - dataset = super().read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) - self.has_cot = 'chain_of_thought' in dataset.features - dataset = dataset.map( - lambda examples: { - 'context': examples['context'], - 'answer': examples['answer'], - 'aliases': set([examples['answer']] + examples.get('aliases', [])), - 'chain_of_thought': examples.get('chain_of_thought', ''), - }) - self.max_answer_length = self._get_max_answer_length(dataset) - # NOTE: This is the only time we use the class variable padding_size. - self.padding_size = self.max_seq_len - self.max_answer_length - return dataset - - def get_answer_from_example(self, example: Dict, in_context=False) -> str: - """ - Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true. - Args: - example (Dict): The example from which to retrieve the answer - - Returns: - str: The answer in from the example with chain of thought and delimiter if needed - """ - if self.has_cot: - return f'{example["chain_of_thought"]}{self.cot_delimiter}{example[self.answer_key]}' - else: - return example[self.answer_key] - - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: - """ - Run text through the tokenizer and handle special cases. - Args: - prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derrived context - example (Dict): The example as a dictionary. - - Returns: - Dict: Dictionary with the tokenized data - """ - tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) - tokenized_example['aliases'] = list(example.get('aliases', [])) - return tokenized_example - - def _get_max_answer_length(self, dataset) -> int: - f""" - Loops over the dataset and finds the longest answer length. 
- - Returns: - int: The maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present - """ - max_answer_length = 0 - for example in dataset: - all_answers = [example[self.answer_key]] + list(example.get('aliases', [])) - for answer in all_answers: - if self.has_cot: - response = (f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}') - else: - response = answer - tokenized_repsonse = self.tokenizer(response)['input_ids'] - assert isinstance(tokenized_repsonse, list) - max_answer_length = max(max_answer_length, len(tokenized_repsonse)) - max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) - return max_answer_length - - def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: - batch = super().collate_fn(data) - batch_size = batch['input_ids'].shape[0] - stopping_criteria = None - if self.early_stopping_criteria: - if stop_sequences_criteria is None: # pyright: ignore [reportUnnecessaryComparison] - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='transformers', - conda_channel='conda-forge') - stopping_criteria = stop_sequences_criteria(self.tokenizer, self.early_stopping_criteria, batch_size) - batch['generation_kwargs']['stopping_criteria'] = stopping_criteria - return batch - - -class InContextLearningLMTaskDataset(InContextLearningDataset): - """ - A dataset that constructs batches for in-context learning language modeling evaluation. - Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. - - The input format is expected to be a jsonl file with the following fields: - - context: Preceding text - - continuation: The expected continuation - - See InContextLearningDataset for more details. - """ - - def __init__(self, *args, **kwargs): - super().__init__(answer_key='continuation', - static_keys=['mode'], - tensor_keys=['input_ids', 'continuation_indices', 'labels', 'attention_mask'], - base_batch={ - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [] - }, - batch_mapping={ - 'input_ids': 'context', - 'labels': 'context' - }, - padding_side='right', - *args, - **kwargs) - - -class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): - """ - A dataset that construct batches for in-context learning multiple choice evaluation. - - If each question has N answer choices, we construct N distinct inputs per question. In order to ensure - consistency across multi-GPU, we set the batch size to be `min(N, batch_size)` so that all N - inputs per question can stored in the same batch. - - The default input format is a jsonl file with the following fields: - - query: The preceding text, question, or document relevant to the choices - - gold: Index of the correct choice under 'choices' - - choices: A list of strings, each being one of the potential choices - - Each batch then consists of ``|batch_size // N|`` distinct questions and has the following the structure. 
- - input_ids: Input tensor ``|batch x seqlen x # tokens|`` - - continuation_indices: List of ``|batch|`` consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) - - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - - labels: Identical to the input, used by the model to calculate loss/metrics - - gold_indices: List of length ``|batch_size // N|`` indicating for each question, which of the answers is correct (via an integer [0, N-1]) - - choice_groupings: Indicates which indices of the batch correspond to which questions - - Additional Args: - choices_key (str): The key under which the choices are stored in the saved dataset. Defaults to 'choices'. - """ - - def __init__(self, - choices_key: str = 'choices', - static_keys: Optional[List] = None, - list_of_tensors_keys: Optional[List] = None, - list_of_tuples_keys: Optional[List] = None, - list_of_primitives: Optional[List] = None, - *args, - **kwargs): - self.choices_key = choices_key - base_batch = { - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [], - 'gold_indices': [], - 'choice_groupings': [], - } - context_key = kwargs.pop('context_key', 'query') - static_keys = kwargs.pop('static_keys', ['mode', 'generation_kwargs']) - tensor_keys = kwargs.pop('tensor_keys', ['input_ids', 'labels', 'attention_mask']) - self.list_of_tensors_keys = list_of_tensors_keys or ['continuation_indices'] - self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings'] - self.list_of_primitives = list_of_primitives or ['gold_indices'] - super().__init__(context_key=context_key, - base_batch=base_batch, - static_keys=static_keys, - tensor_keys=tensor_keys, - padding_side='right', - *args, - **kwargs) - self.num_choices = len(self.dataset[0][self.choices_key]) - self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'} - self.batch_map_per_example = {'gold_indices': 'gold'} - - def get_answer_from_example(self, example: Dict, in_context=False) -> str: - """ - Returns the correct answer from the example's choices. - Args: - example (Dict): The example from which to retrieve the answer - - Returns: - str: The full string of the correct answer based on the 'gold' key - """ - choices = example[self.choices_key] - gold_idx = example['gold'] - return choices[gold_idx] - - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: - """ - Runs text through the tokenizer and handle special cases. - Args: - prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derrived context - example (Dict): The example as a dictionary. 
- - Returns: - Dict: Dictionary with the tokenized data - """ - # NOTE: some of this is repeated from super class but for loop makes things considerably different - tokenized_example = {} - # Always add special tokens to preamble - preamble = self.tokenizer(prompt_and_fewshot)['input_ids'] - assert isinstance(preamble, list) - preamble = self._fix_eos_on_preamble(preamble) - if self.strip_data: - # rstrip context because a prompt ending in a space results in degenerate output - ctxt = ctxt.rstrip() - # Never add special tokens to context - tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids'] - assert isinstance(tokenized_context, list) - tokenized_context = preamble + tokenized_context - - tokenized_example[self.context_key] = [] - tokenized_example[self.answer_key] = [] - tokenized_example['continuation_indices'] = [] - # NOTE: Treating tokenize_labels as True for all MC datasets (required for our MC accuracy metric) - for choice in example[self.choices_key]: - if self.prefix_space: - choice = f' {choice}' if not choice.startswith(' ') else choice - - # Never add special tokens to answer - tokenized_answer = self.tokenizer(choice, add_special_tokens=False)['input_ids'] - assert isinstance(tokenized_context, list) - assert isinstance(tokenized_answer, list) - trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size) - assert isinstance(trimmed_context, list) - continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer) - padded_context = _make_padded_input( - trimmed_context, - tokenized_answer, - self.padding_size, - self.pad_tok_id, - self.padding_side, - ) - - tokenized_example[self.context_key].append(padded_context) - tokenized_example[self.answer_key].append(tokenized_answer) - tokenized_example['continuation_indices'].append(continuation_indices) - - tokenized_example['gold'] = example['gold'] - return tokenized_example - - def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: - """ - The function that the dataloader uses to accumulate data into batches. - We run each distinct query + answer choice through the model separately and determine which - answer has the lowest per-token-perplexity. - - If each question has N possible choices, all N must be grouped together as distinct elements of the batch - since the batch may consist of multiple questions, the choice_groupings indicates - which contiguous sequences of elements in the batch correspond to which question - gold_indices indicates which of the [0, N-1] choices is the correct one for each question. 
- Args: - data (List): List of tokenized datapoints (dicts returned by self._tokenize_example) - - Returns: - Dict: Dictionary for a single batch - """ - batch = copy.deepcopy(self.base_batch) - for data_pair in data: - choice_start_idx = len(batch['continuation_indices']) - # NOTE: not using batch_mapping - for i, context_enc in enumerate(data_pair[self.context_key]): - batch['input_ids'].append(context_enc) - batch['continuation_indices'].append(data_pair['continuation_indices'][i]) - batch['labels'].append(context_enc) - - batch['gold_indices'].append(data_pair['gold']) - choice_end_idx = len(batch['continuation_indices']) - batch['choice_groupings'].append((choice_start_idx, choice_end_idx)) - - batch = convert_tokens_to_tensors(batch, self.tokenize_labels) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) - return batch - - def get_num_samples_in_batch(self, batch) -> int: - return batch['input_ids'].shape[0] // self.num_choices - - def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: - """ - Split batch while ensuring all continuations are in the same microbatch. - - In ICL Multiple Choice, we duplicate each data point for each possible continuation. - When splitting a batch, we have logical example, which refer to one possible question, - and real example, which refers to one possible continuation. As example count and - microbatch_size are tracked in logical example, we split logical attributes by - microbatch_size and real attributes by microbatch_size * num_choices. - Args: - batch (Dict): Batch of data - microbatch_size (int): Size of microbatches - - Returns: - list: List of chunked batches - """ - chunked = {} - for k, v in batch.items(): - if k in self.static_keys: - # Defer broadcasting primitives until we know num_chunks - pass - elif type(v) == list: - # list of tensors - 'continuation_indices' - if k in self.list_of_tensors_keys: - chunked[k] = _split_list(v, microbatch_size * self.num_choices) - # list of tuples - 'choice_groupings' - elif k in self.list_of_tuples_keys: - chunked[k] = _split_list(v, microbatch_size) - # list - 'gold_indices' - elif k in self.list_of_primitives: - chunked[k] = _default_split_batch(v, microbatch_size) - else: - raise ValueError(f'Unexpected key {k} in list splitting') - elif k in self.tensor_keys: - chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices) - else: - raise ValueError(f'Unexpected key {k} in batch splitting') - num_chunks = len(chunked['input_ids']) - # Broadcast primitives to all chunks - for k, v in batch.items(): - if k in self.static_keys: - chunked[k] = [v] * num_chunks - - return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] - - -class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset): - """A dataset that constructs batches for in-context learning schema evaluation. - A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word - to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847 - to determine the model's choice of fill-in word. 
- - The default input format is a jsonl file with the following fields: - - context_options: List of strings corresponding to possible preceding context options for the continuation - - gold: Index of the correct context from 'context_options' - - continuation: The finishing continuation - - Each batch then consists of ``batch_size // N`` distinct tasks and has the following the structure - - input_ids: Input tensor ``batch x seqlen x # of tokens`` - - continuation_indices: List of ``batch`` consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) - - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - - labels: Identical to the input, used by the model to calculate loss/metrics - - gold_indices: List of length ``batch_size // N`` indicating for each question, which of the answers is correct (via an integer [0, N-1]) - - choice_groupings: Indicates which indices of the batch correspond to which questions - - """ - - def __init__(self, choices_key='context_options', *args, **kwargs): - static_keys = ['mode'] - tensor_keys = ['input_ids', 'labels', 'attention_mask'] - list_of_tensors_keys = ['continuation_indices'] - super().__init__(choices_key=choices_key, - context_key=choices_key, - static_keys=static_keys, - tensor_keys=tensor_keys, - list_of_tensors_keys=list_of_tensors_keys, - *args, - **kwargs) - self.base_batch = { - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [], - 'gold_indices': [], - 'choice_groupings': [], - } - - def construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str: - """ - Takes a example and constructs a context with the correct context for the example's continuation. - - Args: - example (Dict): The example from which to construct the context - preceding_text (str): Any preceding text, needed to if self.example_delimiter is needed at the beginning - add_answer (bool): This will always be true when calling this function for SchemaTaskDataset - - Returns: - str: The single correct context for a given continuation - """ - context_options = example[self.choices_key] - gold_idx = example['gold'] - continuation = example['continuation'] - context = context_options[gold_idx] - if len(preceding_text) > 0: - context = f'{self.example_delimiter}{context}' - context = f'{context}{self.continuation_delimiter}{continuation}' - return context - - def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') -> List[str]: - """ - Takes a example and constructs all contexts. Optionally, appends this to preceeding text (such as a - prompt or fewshot examples). 
- - Args: - example (Dict): The example from which to construct the context - preceding_text (str): Any preceding text, needed to if self.example_delimiter is needed at the beginning - - Returns: - list: All context options for the selected example with formatting - """ - context_options = example[self.choices_key] - if len(preceding_text) > 0: - if self.strip_data: - cont_del = self.continuation_delimiter.rstrip() - else: - cont_del = self.continuation_delimiter - context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] - return context_options - - def _prep_example( - self, - example: Dict, - example_idx: int, - num_fewshot: int, - prompt_string: str, - fewshot_rng: random.Random, - ) -> Dict[str, Any]: - """ - Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples. - - Each task consists of multiple contexts and a single, correct continuation. Will preprend fewshot examples and - prompt if present. - - Args: - example (Dict): A dictionary from the hf dataset - example_idx (int): The index of example - num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair - prompt_string (str): The prompt to prepend to all inputs - fewshot_rng (random.Random): Random number generator to use for fewshot sampling - - Returns: - Dict: Contains a dictionary with the tokenized data - """ - prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng) - ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot) - tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example) - return tokenized_example - - def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]: - """ - Runs text through the tokenizer and handle special cases. - - Args: - prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): The specific example's derrived context - example (Dict): The example as a dictionary. 
- - Returns: - Dict: Dictionary with the tokenized data - """ - tokenized_example = {} - preamble = self.tokenizer(prompt_and_fewshot)['input_ids'] - assert isinstance(preamble, list) - preamble = self._fix_eos_on_preamble(preamble) - encoded_contexts = [ - preamble + # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues] - self.tokenizer(c, add_special_tokens=False)['input_ids'] # pyright: ignore[reportOperatorIssue, ] - for c in context_options - ] - continuation = example['continuation'] - if self.prefix_space: - continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation) - tokenized_continuation = self.tokenizer(continuation, add_special_tokens=False)['input_ids'] - - tokenized_example[self.context_key] = [] - tokenized_example['continuation_indices'] = [] - tokenized_example[self.answer_key] = [] - for context in encoded_contexts: - assert isinstance(context, list) - assert isinstance(tokenized_continuation, list) - trimmed_context = _trim_context(context, tokenized_continuation, self.padding_size) - assert isinstance(trimmed_context, list) - continuation_indices = _get_continuation_span(trimmed_context, tokenized_continuation) - padded_context = _make_padded_input(trimmed_context, tokenized_continuation, self.padding_size, - self.pad_tok_id, self.padding_side) - tokenized_example[self.context_key].append(padded_context) - tokenized_example['continuation_indices'].append(continuation_indices) - tokenized_example[self.answer_key].append(tokenized_continuation) - - tokenized_example['gold'] = example['gold'] - return tokenized_example - - -class InContextLearningCodeEvalDataset(InContextLearningDataset): - """ - A dataset that constructs batches for in-context learning code evaluation. - - The input format is expected to be a jsonl file with the following fields: - - - task_id: Label of given task - - prompt: The code snippet that must be completed - - entry_point: The entry to the function/code snippet to generate - - canonical_solution: Working solution - - test: The checker code that will run to completion if the code generation is valid and otherwise throw assertion - - test_inputs: List of test inputs - - test_outputs: List of test outputs - - language: The language of the code snippet - - Each batch then consists of the following the structure - - - input_ids: Input tensor batch x seqlen x num tokens - - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - - mode: Always set to 'generate' - - labels: Exact solution for the coding problem - - prompts: Prompt for the task - - entry_points: List of entry points - - test_inputs: List of test inputs - - test_outputs: List of test outputs - - languages: List of languages - - pass_at_k: Passed value for pass_at_k - - generation_length: Derrived maximum generation length - - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten - by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig - for more details): - - - pad_token_id: ID for padding token, derived automatically - - num_beams: How many beams to search for generations, set to 1 - - num_return_sequences: Value passed for 'generations_per_sample', how many generations per prompt - - do_sample: Determines whether model is sampling or greedily decoding. 
Always set to True - - use_cache: Whether or not to use past key values to speed up sampling. Always set to True - - Additional Args: - generations_per_sample (int) (defaults to 1): The number of independently computed returned sequences for each element in the batch - pass_at_k (int) (defaults to 1): k for how many chances the model gets to write passing code - """ - - def __init__( - self, - generations_per_sample: int, - pass_at_k: int = 1, - *args, - **kwargs, - ): - if generations_per_sample < pass_at_k: - raise ValueError( - f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' - ) - batch_mapping = { - 'input_ids': 'prompt', - 'prompts': 'prompt_text', - 'tests': 'test', - 'labels': 'canonical_solution', - 'entry_points': 'entry_point', - 'test_inputs': 'test_inputs', - 'test_outputs': 'test_outputs', - 'languages': 'language' - } - # Linting complains if these are not set in init - self.max_prompt_length = 0 - self.max_answer_length = 0 - static_keys = ['mode', 'pass_at_k', 'generation_length', 'generation_kwargs'] - list_keys = ['prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'languages', 'labels'] - tensor_keys = ['input_ids', 'attention_mask'] - super().__init__( - context_key='prompt', - answer_key='canonical_solution', - strip_dataset=False, - static_keys=static_keys, - list_keys=list_keys, - tensor_keys=tensor_keys, - tokenize_labels=False, - padding_side='left', - batch_mapping=batch_mapping, - *args, - **kwargs, - ) - self._set_max_prompt_and_answer_lengths() - self.dataset = self.dataset.map(self._trim_padding) - self.base_batch = { - 'input_ids': [], - 'mode': 'generate', - 'labels': [], - 'prompts': [], - 'tests': [], - 'entry_points': [], - 'test_inputs': [], - 'test_outputs': [], - 'languages': [], - 'pass_at_k': pass_at_k, - 'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length), - 'generation_kwargs': { - 'pad_token_id': self.pad_tok_id, - 'num_beams': 1, # single beam - 'num_return_sequences': generations_per_sample, - 'do_sample': True, - 'use_cache': True, - 'eos_token_id': self.tokenizer.eos_token_id - } - } - self.update_generation_kwargs(kwargs.get('generation_kwargs', {})) - - def _set_max_prompt_and_answer_lengths(self): - """ - Iterates through the dataset and finds the maximum prompt length and sequence lengths - - Returns: - None - """ - max_prompt_length = 0 - max_answer_length = 0 - for example in self.dataset: - assert isinstance(example, Dict) - unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id] - max_prompt_length = max(max_prompt_length, len(unpadded_example)) - - tokenized_answer = self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids'] - assert isinstance(tokenized_answer, list) - len_tokenized_answer = len(tokenized_answer) - max_answer_length = max(max_answer_length, len_tokenized_answer) - - self.max_prompt_length = max_prompt_length - self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH - - def _trim_padding(self, example: Dict): - """ - Adjusts padding to the maximum prompt length rather than max_seq_len. - Needs to be done after the dataset has been processed because we don't know the maximum - prompt length until after we've tokenized it. 
- - Returns: - dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key] - """ - # Remove padding tokens applied during tokenization - unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] - # Reapply padding only to max_prompt_length - full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length) - padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side) - - example[self.context_key] = padded_context - return example - - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: - """ - Adds extra code task details to the example dictionary. - See InContextLearningDataset for more details - """ - tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) - tokenized_example['prompt_text'] = example['prompt'] - tokenized_example['task_id'] = example['task_id'] - tokenized_example['canonical_solution'] = example['canonical_solution'] - tokenized_example['test'] = example['test'] - tokenized_example['entry_point'] = example['entry_point'] - tokenized_example['test_inputs'] = example['test_inputs'] - tokenized_example['test_outputs'] = example['test_outputs'] - tokenized_example['language'] = example['language'] - return tokenized_example - - -def build_icl_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: transformers.PreTrainedTokenizerBase, - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. '' - hf_loading_vars: Dict, - hf_parsing_map: Dict, - destination_path: str, - prelimiter: str, # e.g. 'Question: ' - cot_delimiter: str, # e.g. ' ### ' - fewshot_random_seed: int, - pass_at_k: int, - generations_per_sample: int, - generation_kwargs: Dict, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True) -> DataSpec: - """ - Factory method that builds the specific dataset for the specified icl_task_type. - See documentation for `get_icl_task_dataloader` for arugment documentation. - - When writing a dataset for a new task, here you will need to: - 1. add the dataset to the factory and choose an appropriate string - 2. set the batch size for that task (see InContextLearningMultipleChoiceTaskDataset for why - this might be different) - 3. 
set the `split_batch` funciton if necessary - """ - if icl_task_type == 'multiple_choice': - dataset = InContextLearningMultipleChoiceTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - batch_size = max(dataset.num_choices, batch_size) - effective_batchsize = batch_size // dataset.num_choices - elif icl_task_type == 'schema': - dataset = InContextLearningSchemaTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - batch_size = max(dataset.num_choices, batch_size) - effective_batchsize = batch_size // dataset.num_choices - elif icl_task_type == 'language_modeling': - dataset = InContextLearningLMTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size - elif icl_task_type == 'question_answering': - dataset = InContextLearningQATaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - cot_delimiter=cot_delimiter, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size - elif icl_task_type == 'code_evaluation': - dataset = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size - else: - raise Exception(f'Unrecognized ICL task type: {icl_task_type}') - - sampler = dist.get_sampler(dataset, drop_last=False, shuffle=False) - - split_batch = None - if isinstance( - dataset, - ( - InContextLearningMultipleChoiceTaskDataset, - InContextLearningQATaskDataset, - 
InContextLearningCodeEvalDataset, - ), - ): - split_batch = dataset.split_batch - - return DataSpec( - DataLoader( - dataset, - batch_size=effective_batchsize, - sampler=sampler, - collate_fn=dataset.collate_fn, - ), - device_transforms=None, - get_num_samples_in_batch=dataset.get_num_samples_in_batch, - split_batch=split_batch, - ) - - -def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: Dict, - hf_parsing_map: Dict) -> Dict[str, str]: - """ - If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file. - - Args: - dataset_uri (str): Location of dataset. - destination_path (str): Base destination path, we will write a separate partition off this URI for each category. - - Raises: - MissingConditionalImportError: If datasets not installed raise exception. - Exception: If 'category' key missing from dataset, raise exception. - Returns: - Dict[str, str]: Mapping of category names to partitioned dataset local files names. - """ - try: - from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] - from datasets import IterableDataset, load_dataset # pyright: ignore[reportGeneralTypeIssues] - except ImportError as e: - raise MissingConditionalImportError( - extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge', - ) from e - if dataset_uri.startswith('hf://'): - dataset_uri = dataset_uri.replace('hf://', '') - dataset = load_dataset(dataset_uri, **hf_loading_vars) - assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset) - if hf_parsing_map: - dataset_parsing_func = lambda example: { - k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() - } - assert hasattr(dataset, 'column_names') - dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names) - else: - with dist.local_rank_zero_download_and_wait(destination_path): - if dist.get_local_rank() == 0: - get_file(dataset_uri, destination_path, overwrite=True) - dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset) - assert hasattr(dataset, 'features') - assert dataset.features is not None - if 'category' not in dataset.features.keys(): - raise Exception(f"""Attempted to partition dataset by `category` \ - but it doesn't have a `category` key. 
\ - Got keys: {str(list(dataset.features.keys()))}""") - categories = sorted(set(dataset['category'])) # pyright: ignore[reportIndexIssue, reportGeneralTypeIssues] - output_files = {} - for cat in categories: - path = destination_path.split('/') - cat_dest = '/'.join(path[:-1]) + f'/{cat}_{path[-1]}' - tmp_path_to_broadcast = str(os.path.abspath(cat_dest)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - if dist.get_local_rank() == 0: - subset = [ - l for l in dataset if l['category'] == cat # pyright: ignore[reportGeneralTypeIssues] - ] # pyright: ignore[reportArgumentType, reportCallIssue] - with open(gathered_paths[0], 'w', encoding='utf8') as f: - for l in subset: - f.write(json.dumps(l, ensure_ascii=False) + '\n') - output_files[cat] = cat_dest - return output_files - - -def get_icl_task_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str = '', - destination_path: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, - cot_delimiter: str = '', - has_categories: bool = False, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]: - """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: - - .. testsetup:: - - import transformers - from composer.models import HuggingFaceModel - from composer.trainer import Trainer - dataset_uri = "/tmp/dataset_uri.jsonl" - dataset = RandomTextClassificationDataset(size=16, use_keys=True) - train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8) - hf_model, tokenizer = HuggingFaceModel.hf_from_composer_checkpoint('composer-hf-checkpoint.pt') - # At this point, hf_model is randomly initialized - composer_model = HuggingFaceModel(hf_model, hf_tokenizer) - - Example: - - .. testcode:: - - - dl = get_icl_task_dataloader( - 'language_modeling', - dataset_uri, - tokenizer, - batch_size=2, - max_seq_len=2048, - pad_tok_id=tokenizer.pad_token_id, - num_fewshot=10, - prompt_string='translate english to french', - example_delimiter='\\n', - continuation_delimiter='' - ) - eval_evaluator = Evaluator( - label="lambada", - dataloader=dl, - metric_names=['InContextLearningLMAccuracy'] - ) - trainer = Trainer( - model=model, - train_dataloader=train_dataloader, - eval_dataloader=eval_evaluator, - optimizers=optimizer, - max_duration="1ep", - ) - - Args: - icl_task_type (str): Name of icl_task type. One of ['multiple_choice', 'schema', 'language_modeling', 'question_answering', 'code_evaluation'] - dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``. - Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. - A local dataset must consist of rows of JSON data points with task dependant fields. - The default keys expected are "context" and "answer". 
- tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids. - batch_size (int): Size of a batch used for eval - max_seq_len (int): The maximum sequence length supported by the model. - pad_tok_id (int): The special token used for padding batches. - num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples. - prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.'). - example_delimiter (str, default = '\\n'): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting. - continuation_delimiter: (str, default = ' '): Separator inserted between context and answer in each example (e.g. '\\nA: '). - destination_path: (str, default = ''): This is the local file where remote datasets will be saved. - question_prelimiter: (str, default = ''): Text to be prepended before each context, including few shot examples (e.g. "Question: "). - fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling - pass_at_k (int): k for how many chances the model gets to write passing code. - generations_per_sample (int): How many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict. - cot_delimiter (str): Delimiter to place between chain of thoughts and continuations. - has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. - hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. - hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. - Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset. - generation_kwargs (Dict, default = None): A dictionary containing keyword arguments to be passed along to the model's generate function. Overwrites any previously specified generation - keyword args in this fucntion (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig - for more details) - early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time. - Used in QA tasks with CoT - do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningQAAccuracy. Only used in QA tasks. - - Returns: - DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. 
- """ - if hf_loading_vars is None: - hf_loading_vars = {} - if hf_parsing_map is None: - hf_parsing_map = {} - if generation_kwargs is None: - generation_kwargs = {} - if early_stopping_criteria is None: - early_stopping_criteria = [] - - if has_categories: - result_dls = {} - output_files = partition_dataset_by_category(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) - categories = sorted(output_files.keys()) - for category in categories: - partition_uri = output_files[category] - result_dls[category] = build_icl_dataloader( - icl_task_type=icl_task_type, - dataset_uri=partition_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=partition_uri + '_tmp', - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, - ) - return result_dls - else: - return build_icl_dataloader( - icl_task_type=icl_task_type, - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - generation_kwargs=generation_kwargs, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, - ) diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index dd4d665678..90ce30d948 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -4,33 +4,21 @@ """A collection of common torchmetrics for NLP tasks.""" import logging -import os -import re -import string -import warnings -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Mapping, Union -import numpy as np import torch from torch import Tensor -from torch.nn import functional as F from torchmetrics import Metric -from composer.utils.eval_client import EvalClient, LambdaEvalClient, LocalEvalClient, MosaicMLLambdaEvalClient log = logging.getLogger(__name__) __all__ = [ - 'InContextLearningLMAccuracy', - 'InContextLearningMultipleChoiceAccuracy', - 'InContextLearningQAAccuracy', - 'InContextLearningCodeEvalAccuracy', 'BinaryF1Score', 'LanguageCrossEntropy', 'MaskedAccuracy', 'LanguagePerplexity', - 'InContextLearningLMExpectedCalibrationError', - 'InContextLearningMCExpectedCalibrationError', + 'InContextLearningMetric', ] @@ -209,446 +197,3 @@ def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor) NotImplementedError: Abstract method must be implemented by subclasses """ raise NotImplementedError - - -class InContextLearningQAAccuracy(InContextLearningMetric): - r"""Computes accuracy for In-context learning (ICL) question answering (QA) tasks. 
- - ICL QA tasks consist of some number of example question answering tasks (referred to as the 'context'), followed by a test task where the model must - match one of the possible answer aliases (referred to as the 'continuation'). - - For example, the model may be provided the context below and evaluated on its ability to correctly predict the continuation. - - Context: `Question: Who was president of the United States in 2012?\nAnswer: Barack Obama\nQuestion: Is water wet?\nAnswer: ` - Continuation: [`yes`, `no`] - - Both predictions and answers will be normalized before comparison. - - Adds metric state variables: - correct (float): The number of instances where the prediction was a prefix for any of the answer aliases. - total (float): The number of total instances that were predicted. - - Args: - dist_sync_on_step (bool, optional): Synchronize metric state across processes at - each forward() before returning the value at the step. Default: ``False``. - """ - - # Make torchmetrics call update only once - full_state_update = False - - def __init__(self, dist_sync_on_step: bool = False): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') - self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') - - def normalize_answer(self, answer: str): - """Lower text and remove punctuation, articles and extra whitespace. - - Copied from https://github.com/mandarjoshi90/triviaqa/blob/master/evaluation/triviaqa_evaluation.py - """ - - def remove_articles(text: str) -> str: - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text: str) -> str: - return ' '.join(text.split()) - - def handle_punc(text: str) -> str: - exclude = set(string.punctuation + ''.join([u'‘', u'’', u'´', u'`'])) - return ''.join(ch if ch not in exclude else ' ' for ch in text) - - def lower(text: str) -> str: - return text.lower() - - def replace_underscore(text: str) -> str: - return text.replace('_', ' ') - - return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(answer))))).strip() - - def update(self, outputs: List[str], labels: List[List[str]], batch: Optional[Dict[str, Any]] = None): - if batch is None: - batch = {} - cot_delimiter = batch.get('cot_delimiter', '') - do_normalization = batch.get('do_normalization', True) - stopping_criteria = batch.get('stopping_criteria', None) - for sample_output, sample_labels in zip(outputs, labels): - final_answer = sample_output - - if stopping_criteria is not None and len(stopping_criteria) > 0: - final_answer = re.split('|'.join(stopping_criteria), final_answer)[0] - - if cot_delimiter is not None and len(cot_delimiter) > 0: - final_answer = final_answer.split(cot_delimiter)[-1] - - if do_normalization: - cleaned_final_answer = self.normalize_answer(final_answer) - cleaned_sample_labels = {self.normalize_answer(label) for label in sample_labels} - else: - cleaned_final_answer = final_answer - cleaned_sample_labels = set(sample_labels) - - if any(cleaned_final_answer.startswith(label) for label in cleaned_sample_labels): - self.correct += torch.tensor(1.0) - self.total += torch.tensor(1.0) - - def compute(self): - assert isinstance(self.correct, Tensor) - assert isinstance(self.total, Tensor) - return self.correct / self.total - - -class InContextLearningLMAccuracy(InContextLearningMetric): - r"""Computes accuracy for In-context learning (ICL) language modeling (LM) tasks. 
- - ICL LM tasks consist of some number of example language modeling tasks (referred to as the 'context'), followed by a test task where the model must correctly predict all the tokens - following tokens in some passage (referred to as the 'continuation'). - - For example, the model may be provided the context below and evaluated on its ability to correctly predict the continuation. Note: it doesn't matter - whether the model correctly predicts the context tokens. - - Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->` - Continuation: `green` - - Adds metric state variables: - correct (float): The number of instances where the prediction masked the target. - total (float): The number of total instances that were predicted. - - Args: - dist_sync_on_step (bool, optional): Synchronize metric state across processes at - each forward() before returning the value at the step. Default: ``False``. - """ - - # Make torchmetrics call update only once - full_state_update = False - - def __init__(self, dist_sync_on_step: bool = False): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') - self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') - - def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): - for batch_idx, cont_idx in enumerate(batch['continuation_indices']): - cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) - cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - - self.correct += (cont_tok_pred == cont_tok_targ).all().int() - self.total += torch.tensor(1.0) - - def compute(self): - assert isinstance(self.correct, Tensor) - assert isinstance(self.total, Tensor) - return self.correct / self.total - - -class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric): - r"""Computes accuracy for In-context learning (ICL) multiple choice (MC) tasks. - - ICL MC tasks consists of a series of questions with some number of possible choices (only one of which can be correct). - At inference time each possible choice is given to the model as a separate input and the one for which the model assigns - the lowest perplexity to the choice is considered the model's choice. The model is correct if it "chooses" the right answer. - - Context: `The dog is->fuzzy\nthe water is->hot\nthe tree is->` - Continuation: `green` - - Adds metric state variables: - correct (float): The number of instances where the prediction masked the target. - total (float): The number of total instances that were predicted. - - Args: - dist_sync_on_step (bool, optional): Synchronize metric state across processes at - each forward() before returning the value at the step. Default: ``False``. 
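        A minimal sketch of the selection rule applied in ``update`` (the numbers below are
        illustrative, not real model outputs):

            # three scored answer choices belonging to one question
            perplexities = [12.7, 3.4, 9.1]        # lower is better
            choice_groupings = [(0, 3)]            # choices 0..2 belong to question 0
            gold_indices = [1]                     # index of the correct choice
            subset = perplexities[0:3]
            predicted = subset.index(min(subset))  # -> 1, so this question counts as correct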
- """ - - # Make torchmetrics call update only once - full_state_update = False - - def __init__(self, dist_sync_on_step: bool = False): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state('correct', default=torch.tensor(0.0), dist_reduce_fx='sum') - self.add_state('total', default=torch.tensor(0.0), dist_reduce_fx='sum') - - def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): - perplexities = [] - for batch_idx, cont_idx in enumerate(batch['continuation_indices']): - # continuation indices refer to indices in the original input's token space - cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1) - # labels have been shifted left by one index, so the cont_idx needs to be shifted as well. - cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ) - perplexity = torch.exp(cross_entropy) - perplexities.append(perplexity) - - for (start, end), gold_idx in zip(batch['choice_groupings'], batch['gold_indices']): - subset = perplexities[start:end] - idx_min = subset.index(min(subset)) - - if idx_min == gold_idx: - self.correct += torch.tensor(1.0) - self.total += torch.tensor(1.0) - - def compute(self): - assert isinstance(self.correct, Tensor) - assert isinstance(self.total, Tensor) - return self.correct.float() / self.total - - -class InContextLearningExpectedCalibrationError(InContextLearningMetric): - """Generic class for Expected Calibration Error (ECE) (cite: https://arxiv.org/pdf/1706.04599.pdf). - - Expected calibration error is calculated by dividing predictions into buckets based on the model's confidence (a probability value between 0 and 1). - We then calculate the accuracy within each bucket and calculate the average gap between confidence and accuracy - across buckets, weighted by the number of samples in each bucket. - - Each task must implement its own definition of "confidence" to be computed via the `update` method. - - Adds metric state variables: - bucket_totals (float): The number of instances where the prediction masked the target per bucket. - bucket_correct (float): The number of total instances that were predicted per bucket. - - Args: - dist_sync_on_step (bool, optional): Synchronize metric state across processes at - each forward() before returning the value at the step. Default: ``False``. 
- n_buckets (int): Number of distinct buckets to split the confidence distribution into - """ - - def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.n_buckets = n_buckets - if n_buckets < 1: - raise Exception('`n_buckets`') - self.add_state('bucket_totals', default=torch.zeros(n_buckets), dist_reduce_fx='sum') - self.add_state('bucket_correct', default=torch.zeros(n_buckets), dist_reduce_fx='sum') - - def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): - pass - - def compute(self): - assert isinstance(self.bucket_correct, Tensor) - assert isinstance(self.bucket_totals, Tensor) - - result = torch.tensor(0.0, device=self.bucket_correct.device) - total_obs = torch.sum(self.bucket_totals) - for i in range(self.n_buckets): - if self.bucket_totals[i] == 0: - continue - - acc_bucket_i = self.bucket_correct[i] / self.bucket_totals[i] - upper_bound = (i + 1) / self.n_buckets - lower_bound = i / self.n_buckets - conf_bucket_i = torch.tensor((upper_bound + lower_bound) / 2, device=self.bucket_correct.device) - result += (self.bucket_totals[i] / total_obs) * torch.abs(acc_bucket_i - conf_bucket_i) - return result - - -class InContextLearningMCExpectedCalibrationError(InContextLearningExpectedCalibrationError): - r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL) multiple choice (MC) tasks. (source: https://arxiv.org/abs/2012.00955). - - For MC tasks, the model confidence is defined as the softmax of average per-token probability assigned to the top question choice. - - See `InContextLearningExpectedCalibrationError` for more info. - """ - - # Make torchmetrics call update only once - full_state_update = False - - def update(self, batch: Dict[str, Any], output_logits: torch.Tensor, labels: torch.Tensor): - output_logits = torch.softmax(output_logits, dim=2) - probabilites = [] - for batch_idx, cont_idx in enumerate(batch['continuation_indices']): - cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1) - cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - probability = cont_tok_logits.index_select(dim=1, index=cont_tok_targ).diagonal().mean() - probabilites.append(probability) - - for (start, end), gold_idx in zip(batch['choice_groupings'], batch['gold_indices']): - subset = probabilites[start:end] - idx_max = subset.index(max(subset)) - confidence = torch.tensor(subset).max() / torch.tensor(subset).sum() - - assert confidence >= 0.0 and confidence <= 1.0 - bucket_idx = int(confidence * self.n_buckets) - if bucket_idx == self.n_buckets: - bucket_idx -= 1 - - if idx_max == gold_idx: - self.bucket_correct[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] - - self.bucket_totals[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] - - -class InContextLearningLMExpectedCalibrationError(InContextLearningExpectedCalibrationError): - r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL) language modeling (LM) tasks. (cite: https://arxiv.org/pdf/1706.04599.pdf). - - For LM tasks, the model confidence is defined as the minimum probability assigned to all tokens in the continuation. - - See `InContextLearningExpectedCalibrationError` for more info. 
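        For intuition, a worked example of the value returned by the shared ``compute`` method
        (the bucket counts below are hypothetical):

            # n_buckets = 10; only two buckets received samples
            # bucket 5 (confidence midpoint 0.55): 40 samples, 30 correct -> gap |0.75 - 0.55| = 0.20
            # bucket 9 (confidence midpoint 0.95): 60 samples, 57 correct -> gap |0.95 - 0.95| = 0.00
            # ECE = (40/100) * 0.20 + (60/100) * 0.00 = 0.08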
- """ - - # Make torchmetrics call update only once - full_state_update = False - - def update(self, batch: Dict[str, Any], output_logits: torch.Tensor, labels: torch.Tensor): - output_logits = torch.softmax(output_logits, dim=2) - for batch_idx, cont_idx in enumerate(batch['continuation_indices']): - cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1) - cont_tok_pred = cont_tok_logits.argmax(dim=-1) - confidence = cont_tok_logits.max(dim=-1).values.min() - cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - assert confidence >= 0.0 and confidence <= 1.0 - bucket_idx = int(confidence * self.n_buckets) - if bucket_idx == self.n_buckets: - bucket_idx -= 1 - - if (cont_tok_pred == cont_tok_targ).all(): - self.bucket_correct[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] - - self.bucket_totals[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] - - -class InContextLearningCodeEvalAccuracy(InContextLearningMetric): - r"""Computes accuracy for In-context learning (ICL) code evaluation tasks. - - ICL code eval tasks consist of some number of example code eval tasks (referred to as the 'context'), followed by a test task where the model must - complete the code, where we term the code completion a 'continuation'. - - In each case, the model constructs a given number of continuations (termed pass@K for K continuations), and each continuation is run against a set of test cases. The model is considered - correct if at least one of the proposed continuations passes all the test cases. - - Runs on AWS Lambdas by default. - - Adds metric state variables: - correct (float): The number of instances where the predictions passed all the test cases. - total (float): The number of total instances that were predicted. - - Args: - dist_sync_on_step (bool, optional): Synchronize metric state across processes at - each forward() before returning the value at the step. Default: ``False``. - """ - - # Make torchmetrics call update only once - full_state_update = False - - def __init__(self, dist_sync_on_step: bool = False): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') - self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') - - self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None) - if self.eval_device is not None: - self.eval_device = self.eval_device.upper() - - def get_client(self) -> EvalClient: - """Returns a client for the appropriate remote platform.""" - client = None - if self.eval_device == 'LOCAL': - warnings.warn( - 'Running code eval locally may be insecure. Please set environment variable CODE_EVAL_DEVICE ' - 'to LAMBDA to run on remote. To use Lambdas, spin up your instance that checks code, set the URL as ' - 'CODE_EVAL_URL and the API key as CODE_EVAL_APIKEY.') - log.debug('Running code eval locally.') - client = LocalEvalClient() - elif self.eval_device == 'LAMBDA': - client = LambdaEvalClient() - elif self.eval_device == 'MOSAICML': - client = MosaicMLLambdaEvalClient() - elif self.eval_device is None: - raise ValueError( - 'Attempting to use InContextLearningCodeEvalAccuracy but environment ' - 'variable `CODE_EVAL_DEVICE` is not set. 
Please set it to `CODE_EVAL_DEVICE` ' - 'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ', - 'evaluation), or `MOSAICML` (for lambda eval through MAPI).') - else: - raise ValueError('Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, ' - f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.') - - return client - - def estimator(self, n: int, c: int, k: int) -> float: - """Computes the pass@k metric. - - Given the number of generated samples, n, the number of correct samples, c, and the k of interest, - this function calculates pass@k as 1 - comb(n - c, k) / comb(n, k) as per the definition of - pass@k in the HumanEval paper (https://arxiv.org/abs/2107.03374) and it's associated implementation: - https://github.com/openai/human-eval. - """ - if n - c < k: - return 1.0 - return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1))) - - def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]): - """Updates the pass@k accuracy of code generation. - - Given a batch of prompts, test cases, and code generations, evaluates the code generations - against the test cases and augments the pass@k accuracy of the batch to the values so far. - - Args: - batch (Dict[str, Any]): A batch of data produced by the InContextLearningCodeEvalDataset, with - the prompt, test cases, and entry points. This will be a dictionary that must have the following - arguments: - { - 'prompts': List[str], - 'test_inputs': List[List[str]], - 'test_outputs': List[List[str]], - 'entry_points': List[str], - 'languages': List[str], - 'generation_kwargs': Dict[str, Any] - } - outputs (List[str]): A list of code generations in the format of HF generate with beam search, - which is the a list of strings in groups of beam_size e.g. for beam size 2 and batch size 2, the list - will be of the format [prompt 1 gen 1, prompt 1 gen 2, prompt 2 gen 1, prompt 2 gen 2] - labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate - functionalities. This is not used. 
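# A small, self-contained sketch of the pass@k estimator documented above: the
# numerically stable product form used in `estimator()` agrees with the
# combinatorial definition 1 - comb(n - c, k) / comb(n, k) from the HumanEval
# paper. The function names here are illustrative, not part of composer's API;
# only numpy and the standard library are assumed.
from math import comb

import numpy as np


def pass_at_k_product(n: int, c: int, k: int) -> float:
    """Probability that at least one of k samples drawn (without replacement) from n generations is among the c correct ones."""
    if n - c < k:
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


def pass_at_k_comb(n: int, c: int, k: int) -> float:
    """Same quantity via the combinatorial definition."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# e.g. 10 generations, 3 of which pass all test cases, evaluated at pass@1: both forms give 0.3
assert abs(pass_at_k_product(10, 3, 1) - pass_at_k_comb(10, 3, 1)) < 1e-12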
- """ - del labels # never used - client = self.get_client() - - pass_at_k = batch['pass_at_k'] - num_generations = batch['generation_kwargs']['num_return_sequences'] - processed_outputs = [ - outputs[i * num_generations:(i + 1) * num_generations] for i in range(len(batch['prompts'])) - ] - payloads = [] - for sample_outputs, sample_prompt, test_inputs, test_outputs, entry_point, language in zip( - processed_outputs, batch['prompts'], batch['test_inputs'], batch['test_outputs'], batch['entry_points'], - batch['languages']): - self.total += torch.tensor(1.0) - prompt_payload = [] - for code_gen in sample_outputs: - code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0] # remove everything after function ends - final_code = sample_prompt + code_gen # combine prompt with the code generation - generation_payload = [] - for test_input, test_output in zip(test_inputs, test_outputs): - payload = { - 'code': final_code, - 'input': test_input, - 'output': test_output, - 'entry_point': entry_point, - 'language': language, - } - generation_payload.append(payload) - - prompt_payload.append(generation_payload) - payloads.append(prompt_payload) - - results = client.invoke(payloads) - for prompt in results: - num_correct = 0 - for generation in prompt: - correct = all(generation) - if correct: - num_correct += 1 - - pass_at_k_rate = self.estimator(num_generations, num_correct, pass_at_k) - self.correct += torch.tensor(pass_at_k_rate) - - client.close() # pyright: ignore [reportOptionalMemberAccess] - - def compute(self): - assert isinstance(self.correct, Tensor) - assert isinstance(self.total, Tensor) - return self.correct / self.total diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index e633db9cb7..b13fbd2082 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -19,7 +19,7 @@ import torch from torchmetrics import Metric -from composer.metrics import InContextLearningMetric, InContextLearningQAAccuracy +from composer.metrics import InContextLearningMetric from composer.models.base import ComposerModel from composer.utils import MissingConditionalImportError, dist, get_file, import_object, is_model_fsdp, safe_torch_load @@ -473,10 +473,7 @@ def get_metrics(self, is_train: bool = False) -> Dict[str, Metric]: return metrics if metrics else {} def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: - if isinstance(metric, InContextLearningQAAccuracy): - assert self.labels is not None - metric.update(batch=batch, outputs=outputs, labels=self.labels) # pyright: ignore [reportGeneralTypeIssues] - elif isinstance(metric, InContextLearningMetric): + if isinstance(metric, InContextLearningMetric): assert self.labels is not None metric.update(batch, outputs, self.labels) # pyright: ignore [reportGeneralTypeIssues] else: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 9a98e2b174..e8c17d9f38 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -16,24 +16,17 @@ # isort: off from composer.datasets.in_context_learning_evaluation import ( - InContextLearningCodeEvalDataset, InContextLearningDataset, - InContextLearningMultipleChoiceTaskDataset, - InContextLearningQATaskDataset, - InContextLearningSchemaTaskDataset, _get_continuation_span, _get_fewshot_sample_idxs, _make_padded_input, _tokenizer_needs_prefix_space, _trim_context, - get_icl_task_dataloader, strip_data, ) # isort: on from 
composer.datasets.utils import MultiTokenEOSCriteria from composer.loggers import InMemoryLogger -from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, - InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) from composer.models import HuggingFaceModel from composer.trainer import Trainer from composer.utils import dist, reproducibility @@ -434,1977 +427,3 @@ def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer, tmp_path) assert tokenized_example['context'][-1] == tokenizer.eos_token_id assert len(tokenized_example['context']) == seqlen assert type(tokenized_example['answer']) == str - - -def test_qa_set_cot_no_cot(tmp_path): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/triviaqa_small.jsonl' - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - assert not dl.has_cot - - -def test_qa_set_cot_has_cot(tmp_path): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/gsm8k_small.jsonl' - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - assert dl.has_cot - - -def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/triviaqa_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='', - continuation_delimiter='', - cot_delimiter='', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - # empirical number from the small test dataset - assert dl.max_answer_length == 7 - - -def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/triviaqa_small.jsonl' - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset( - 
dataset_uri=dataset_uri, - tokenizer=tiny_gpt2_tokenizer, - max_seq_len=1024, - pad_tok_id=tiny_gpt2_tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - cot_delimiter=' ### ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - answer = dl.get_answer_from_example({ - 'context': 'empty', - 'answer': 'this is the correct answer', - 'chain_of_thought': "Let's think step by step. " - }) - assert answer == 'this is the correct answer' - - -def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/triviaqa_small.jsonl' - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset( - dataset_uri=dataset_uri, - tokenizer=tiny_gpt2_tokenizer, - max_seq_len=1024, - pad_tok_id=tiny_gpt2_tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - cot_delimiter=' ### ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - dl.has_cot = True - answer = dl.get_answer_from_example({ - 'context': 'empty', - 'answer': 'this is the correct answer', - 'chain_of_thought': "Let's think step by step. " - }) - assert answer == "Let's think step by step. ### this is the correct answer" - - -def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/triviaqa_small.jsonl' - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset( - dataset_uri=dataset_uri, - tokenizer=tiny_gpt2_tokenizer, - max_seq_len=1024, - pad_tok_id=tiny_gpt2_tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - cot_delimiter=' ### ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - dl.has_cot = True - tokenized_example = dl.tokenize_example( - 'starting prompt', 'a context', { - 'context': 'empty', - 'answer': 'this is the correct answer', - 'aliases': ['this is the right answer', 'this is the best answer'], - 'chain_of_thought': "Let's think step by step. 
" - }) - assert 'aliases' in tokenized_example - assert tokenized_example['aliases'] == ['this is the right answer', 'this is the best answer'] - - -def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/human_eval_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - prompt_string = '' - gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000} - - dl = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Code start:', - continuation_delimiter='\nPlease code:', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - generation_kwargs=gen_kwargs, - generations_per_sample=10, - ) - - assert all(len(data['prompt']) == 148 for data in dl.dataset) # pyright: ignore [reportGeneralTypeIssues] - - -def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/human_eval_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - prompt_string = '' - gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000} - - dl = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Code start:', - continuation_delimiter='\nPlease code:', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - generation_kwargs=gen_kwargs, - generations_per_sample=10, - ) - assert dl.base_batch['generation_kwargs']['num_beams'] == 9000 - assert dl.base_batch['generation_kwargs']['top_p'] == .95 - assert dl.base_batch['generation_kwargs']['temperature'] == .9 - assert dl.base_batch['generation_kwargs']['do_sample'] == True - - -def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/mmlu_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - prompt_string = '' - seqlen = 2048 - dl = InContextLearningMultipleChoiceTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - continuation_delimiter=' ### ', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - ) - example = { - 'context': "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ", - 'choices': ['A', 'B', 'C', 'D'], - 'gold': 2 - } - tokenized_example = dl.tokenize_example(prompt_and_fewshot='Answer the following: ', - ctxt=example['context'], - example=example) - unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']] - untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries] - correct_output = [ - "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: A", - "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. 
Other\nAnswer: B", - "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: C", - "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: D" - ] - assert untokenized_inputs == correct_output - - -def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/winograd_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - seqlen = 2048 - dl = InContextLearningSchemaTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ### ', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - ) - example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'} - constructed_context = dl.construct_context(example) - assert constructed_context == 'cont one ### this is a continuation' - constructed_context = dl.construct_context(example, preceding_text='text') - assert constructed_context == '\ncont one ### this is a continuation' - - -def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/winograd_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - prompt_string = '' - seqlen = 2048 - dl = InContextLearningSchemaTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - continuation_delimiter=' ### ', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - ) - example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'} - constructed_contexts = dl._construct_multiple_contexts(example) - assert constructed_contexts == ['cont one', 'cont two'] - constructed_contexts = dl._construct_multiple_contexts(example, preceding_text='some text') - assert constructed_contexts == ['\ncont one ###', '\ncont two ###'] - - -def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/winograd_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - prompt_string = '' - seqlen = 2048 - dl = InContextLearningSchemaTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - continuation_delimiter=' ### ', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - ) - example = {'context_options': ['context one', 'context two'], 'gold': 0, 'continuation': 'this is a continuation'} - tokenized_example = dl.tokenize_example(prompt_and_fewshot='prompt ', - context_options=example['context_options'], - example=example) - assert all(tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer']) - unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']] - untokenized_inputs = 
[tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs] - assert untokenized_inputs == [ - 'prompt context one this is a continuation', 'prompt context two this is a continuation' - ] - - -@pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) -def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 8 - seqlen = 64 - dls = get_icl_task_dataloader('multiple_choice', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=2, - prompt_string='The following are multiple choice questions (with answers).\n', - example_delimiter='\n', - continuation_delimiter='Answer: ', - destination_path=str(tmp_path / 'icl.jsonl'), - has_categories=True) - assert isinstance(dls, dict) - - assert 'computer_security' in dls - dl = dls['computer_security'] - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - assert dl.dataloader.__len__() == 2 - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' A' - - -@pytest.mark.parametrize('dataset_uri', [ - 'pubmed_sm.jsonl', -]) -def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 64 - dl = get_icl_task_dataloader('language_modeling', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=10, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert ' ' not in tokenizer.decode(batch['input_ids'][0][0:max_idx + 1]) - assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' yes' - - -@pytest.mark.parametrize('dataset_uri', [ - 'lambada_small.jsonl', -]) -def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - - 
local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 64 - dl = get_icl_task_dataloader('language_modeling', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen' - - -@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl']) -def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 64 - dl = get_icl_task_dataloader('schema', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) - batch = next(dl.dataloader._get_iterator()) - - choices_per_question = 2 - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - assert 'gold_indices' in batch - assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question - assert 'choice_groupings' in batch - assert isinstance(batch['choice_groupings'], list) and len( - batch['choice_groupings']) == batch_size // choices_per_question - - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' feared violence.' 
- - -@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl']) -def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path): - pytest.importorskip('datasets') - transformers = pytest.importorskip('transformers') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - tokenizer = transformers.AutoTokenizer.from_pretrained( - 'huggyllama/llama-7b', # type: ignore reportUnboundVariable - use_fast=False) - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 64 - dl = get_icl_task_dataloader('schema', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) - batch = next(dl.dataloader._get_iterator()) - - choices_per_question = 2 - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - assert 'gold_indices' in batch - assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question - assert 'choice_groupings' in batch - assert isinstance(batch['choice_groupings'], list) and len( - batch['choice_groupings']) == batch_size // choices_per_question - - max_idx = max(batch['continuation_indices'][0]).item() - assert tokenizer.decode( - batch['input_ids'][0][0:max_idx + 1] - ) == "The trophy doesn't fit into the brown suitcase because the suitcase is too small. \nThe city councilmen refused the demonstrators a permit because the city councilmen feared violence." 
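# The dataloader tests above repeatedly recover the scored answer span from a
# batch by taking the min/max of `continuation_indices` and decoding that slice
# of `input_ids`. A tiny helper capturing that pattern; the helper name is
# hypothetical and shown only to illustrate the batch layout, with `batch` and
# `tokenizer` following the shapes used in the tests above.
def decode_continuation(batch: dict, tokenizer, row: int = 0) -> str:
    """Decode the continuation tokens of one row of an ICL task batch."""
    cont_idx = batch['continuation_indices'][row]
    min_idx = min(cont_idx).item()
    max_idx = max(cont_idx).item()
    return tokenizer.decode(batch['input_ids'][row][min_idx:max_idx + 1])


# e.g. for the batch built in `test_lm_task_dataloader` above, this returns ' glen'.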
- - -@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -def test_lm_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_opt_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 512 - dl = get_icl_task_dataloader('language_modeling', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen' - assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('') - assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('') == 1 - - -@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -def test_mc_task_dataloader_opt_tokenizer(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_opt_tokenizer - - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 64 - dl = get_icl_task_dataloader('multiple_choice', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - choices_per_question = 2 - assert dl.get_num_samples_in_batch(batch) == 2 - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - assert 'gold_indices' in batch - assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question - assert 'choice_groupings' in batch - assert isinstance(batch['choice_groupings'], list) and len( - batch['choice_groupings']) == batch_size // choices_per_question - - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert 
tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate' - assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('') - assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('') == 1 - - -@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -def test_mc_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_opt_tokenizer - - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 512 - dl = get_icl_task_dataloader('multiple_choice', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - choices_per_question = 2 - real_microbatch_size = batch_size // 2 - logical_microbatch_size = real_microbatch_size // choices_per_question - microbatches = dl.split_batch(batch, logical_microbatch_size) - assert len(microbatches) == 2 - for i, microbatch in enumerate(microbatches): - assert dl.get_num_samples_in_batch(microbatch) == 1 - assert 'input_ids' in microbatch - assert tuple(microbatch['input_ids'].shape) == (real_microbatch_size, seqlen) - assert 'attention_mask' in microbatch - assert tuple(microbatch['attention_mask'].shape) == (real_microbatch_size, seqlen) - assert 'continuation_indices' in microbatch - assert isinstance(microbatch['continuation_indices'], list) and len( - microbatch['continuation_indices']) == real_microbatch_size - assert 'mode' in microbatch - assert microbatch['mode'] == 'icl_task' - assert 'gold_indices' in microbatch - assert isinstance(microbatch['gold_indices'], list) and len( - microbatch['gold_indices']) == real_microbatch_size // choices_per_question - assert 'choice_groupings' in microbatch - assert isinstance(microbatch['choice_groupings'], list) and len( - microbatch['choice_groupings']) == real_microbatch_size // choices_per_question - - min_idx = min(microbatch['continuation_indices'][0]).item() - max_idx = max(microbatch['continuation_indices'][0]).item() - if i == 0: - assert tokenizer.decode(microbatch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate' - elif i == 1: - assert tokenizer.decode( - microbatch['input_ids'][0][min_idx:max_idx + - 1]) == ' Weld the metal together to get it to stay firmly in place' - assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).startswith('') - assert tokenizer.decode(microbatch['input_ids'][0][0:min_idx]).count('') == 1 - - -@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) -def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_opt_tokenizer - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) # for dist - dl = get_icl_task_dataloader( - icl_task_type='question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=8, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - 
num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - - assert isinstance(dl, DataSpec) # pyright - - batch = next(iter(dl.dataloader)) - split_batch = dl.split_batch(batch, 3) - - assert len(split_batch) == 2 - split1 = split_batch[0] - split2 = split_batch[1] - - assert split1['input_ids'].shape[0] == 3 - assert split2['input_ids'].shape[0] == 1 - - assert split1['attention_mask'].shape[0] == 3 - assert split2['attention_mask'].shape[0] == 1 - - assert isinstance(split1['mode'], str) - assert isinstance(split2['mode'], str) - - assert len(split1['labels']) == 3 - assert len(split2['labels']) == 1 - assert all(isinstance(v, list) for v in split1['labels'] + split2['labels']) - - assert isinstance(split1['generation_length'], int) - assert isinstance(split2['generation_length'], int) - - assert isinstance(split1['generation_kwargs'], dict) - assert isinstance(split2['generation_kwargs'], dict) - - -@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0]) -@pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) -def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 512 - tiny_gpt2_tokenizer.eos_token_id = None - with pytest.raises(ValueError): - _ = get_icl_task_dataloader('question_answering', - dataset_uri, - tokenizer, - batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter='\nA:', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) - - -@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -@pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) -def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 512 - # empirical number from the small test dataset - maximum_answer_length = 7 - dl = get_icl_task_dataloader('question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter='\nA:', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - - assert batch['generation_length'] == maximum_answer_length - assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) - - decoded_batch = 
tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) - assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) - - if len(prompt_string) > 0: - assert all(item.count('I am a prompt') == 1 for item in decoded_batch) - assert all( - set(found) == set(expected) - for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']])) - assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:') - assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:') - assert 'eos_token_id' in batch['generation_kwargs'] - - -@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 512 - # empirical number from the small test dataset - maximum_answer_length = 132 - dl = get_icl_task_dataloader('question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter="\nA: Let's think step by step. ", - cot_delimiter=' #### ', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == maximum_answer_length - assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) - assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) - - assert batch['labels'] == [['18'], ['3']] - if num_fewshot == 0: - assert decoded_batch[0].endswith( - "Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step." - ) - assert decoded_batch[1].endswith( - "Q: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?\nA: Let's think step by step." - ) - elif num_fewshot == 2: - assert decoded_batch[0].endswith( - "Q: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?\nA: Let's think step by step. 
The cost of the house and repairs came out to 80,000+50,000=$<<80000+50000=130000>>130,000\nHe increased the value of the house by 80,000*1.5=<<80000*1.5=120000>>120,000\nSo the new value of the house is 120,000+80,000=$<<120000+80000=200000>>200,000\nSo he made a profit of 200,000-130,000=$<<200000-130000=70000>>70,000 #### 70000\nQ: James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. How many total meters does he run a week?\nA: Let's think step by step. He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*60=<<9*60=540>>540 meters #### 540\nQ: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step." - ) - assert decoded_batch[1].endswith( - "Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step. Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market. #### 18\nQ: Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?\nA: Let's think step by step. The cost of the house and repairs came out to 80,000+50,000=$<<80000+50000=130000>>130,000\nHe increased the value of the house by 80,000*1.5=<<80000*1.5=120000>>120,000\nSo the new value of the house is 120,000+80,000=$<<120000+80000=200000>>200,000\nSo he made a profit of 200,000-130,000=$<<200000-130000=70000>>70,000 #### 70000\nQ: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?\nA: Let's think step by step." 
- ) - - -@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl']) -def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 64 - dl = get_icl_task_dataloader('multiple_choice', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - choices_per_question = 2 - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - assert 'gold_indices' in batch - assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question - assert 'choice_groupings' in batch - assert isinstance(batch['choice_groupings'], list) and len( - batch['choice_groupings']) == batch_size // choices_per_question - - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' Pour it onto a plate' - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -def test_code_eval_split_batch(dataset_uri, tmp_path): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained( - 'EleutherAI/gpt-neox-20b') # type: ignore reportUnboundVariable - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=8, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=2, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=4, - ) - - assert isinstance(dl, DataSpec) # pyright - - batch = next(iter(dl.dataloader)) - split_batch = dl.split_batch(batch, 3) - - assert len(split_batch) == 2 - split1 = split_batch[0] - split2 = split_batch[1] - - assert split1['input_ids'].shape[0] == 3 - assert split2['input_ids'].shape[0] == 1 - - assert split1['attention_mask'].shape[0] == 3 - assert split2['attention_mask'].shape[0] == 1 - - assert isinstance(split1['mode'], str) - assert isinstance(split2['mode'], str) - - list_split = { - 'labels': str, - 'prompts': str, - 'tests': str, - 'entry_points': str, - 'test_inputs': list, - 'test_outputs': list, - 'languages': str, - } - for k, v in list_split.items(): - assert len(split1[k]) == 3 - assert len(split2[k]) == 1 - assert all(isinstance(val, v) for val in split1[k] + split2[k]) - - assert 
isinstance(split1['pass_at_k'], int) - assert isinstance(split2['pass_at_k'], int) - - assert isinstance(split1['generation_length'], int) - assert isinstance(split2['generation_length'], int) - - assert isinstance(split1['generation_kwargs'], dict) - assert isinstance(split2['generation_kwargs'], dict) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -@pytest.mark.parametrize('prompt_string', ['Please code:\n', '']) -@pytest.mark.parametrize('generations_per_sample', [1, 3]) -def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 2048 - - dl = get_icl_task_dataloader('code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - generations_per_sample=generations_per_sample) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - max_prompt_length = 0 - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - max_prompt_length = dl.dataloader.dataset.max_prompt_length - assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == 129 - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left - - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) - - if len(prompt_string) > 0: - assert all(item.count('Please code:\n') == 1 for item in decoded_batch) - - assert batch['labels'] == [ - ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', - " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", - ' return number % 1.0\n', - ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', - ] - - assert decoded_batch[0].endswith( - "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" - ) - assert 
decoded_batch[1].endswith( - "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" - ) - assert decoded_batch[2].endswith( - "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n" - ) - assert decoded_batch[3].endswith( - "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" - ) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -def test_code_eval_test_cases(dataset_uri, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 512 - - dl = get_icl_task_dataloader('code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_.jsonl'), - generations_per_sample=1) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - max_prompt_length = 0 - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - max_prompt_length = dl.dataloader.dataset.max_prompt_length - assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == 129 - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left - - mod = types.ModuleType('test_module') - for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'], - batch['test_outputs'], batch['entry_points']): - exec(prompt + solution, mod.__dict__) - for test_input, test_output in zip(inputs, outputs): - result = mod.__dict__[entry_point](*eval(test_input)) - assert result == eval(test_output) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -def test_code_eval_pass_at_k_validity(dataset_uri, 
tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 64 - - with pytest.raises(ValueError, match=r'.* pass_at_k .*'): - get_icl_task_dataloader('code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_.jsonl'), - pass_at_k=10, - generations_per_sample=1) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -@pytest.mark.parametrize('prompt_string', ['Please code:\n', '']) -@pytest.mark.parametrize('generations_per_sample', [1, 3]) -def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_string, generations_per_sample): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 2048 - - dl = get_icl_task_dataloader('code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - generations_per_sample=generations_per_sample, - generation_kwargs={ - 'temperature': .9, - 'top_k': 40 - }) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - max_prompt_length = 0 - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - max_prompt_length = dl.dataloader.dataset.max_prompt_length - assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == 122 - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left - - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) - - if len(prompt_string) > 0: - assert all(item.count('Please code:\n') == 1 for item in decoded_batch) - - assert batch['labels'] == [ - ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', - " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n 
current_string.clear()\n\n return result\n", - ' return number % 1.0\n', - ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', - ] - - assert decoded_batch[0].endswith( - "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" - ) - assert decoded_batch[1].endswith( - "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" - ) - assert decoded_batch[2].endswith( - "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n" - ) - assert decoded_batch[3].endswith( - "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" - ) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 512 - - dl = get_icl_task_dataloader('code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - generations_per_sample=1, - generation_kwargs={ - 'temperature': .9, - 'top_k': 40 - }) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - microbatch_size = 1 - microbatches = dl.split_batch(batch, microbatch_size) - assert len(microbatches) == 4 - for microbatch in microbatches: - assert dl.get_num_samples_in_batch(microbatch) == 1 - assert 'input_ids' in microbatch - # TODO: what should this be? 
- # assert tuple(microbatch['input_ids'].shape) == (microbatch_size, seqlen) - assert 'attention_mask' in microbatch - # assert tuple(microbatch['attention_mask'].shape) == (microbatch_size, seqlen) - assert isinstance(microbatch['generation_kwargs'], dict) - assert microbatch['generation_kwargs']['temperature'] == .9 - assert microbatch['generation_kwargs']['top_k'] == 40 - assert microbatch['generation_kwargs']['pad_token_id'] == 0 - assert microbatch['generation_kwargs']['num_beams'] == 1 - assert microbatch['generation_kwargs']['num_return_sequences'] == 1 - assert microbatch['generation_kwargs']['do_sample'] == True - assert microbatch['generation_kwargs']['use_cache'] == True - assert microbatch['generation_kwargs']['eos_token_id'] == 0 - - -@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 5]) -@device('gpu') -def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - dl = get_icl_task_dataloader( - 'language_modeling', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=2048, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - destination_path=str(tmp_path / 'icl.jsonl'), - ) - - evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy']) - - transformers = pytest.importorskip('transformers') - config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M') - model = transformers.AutoModelForCausalLM.from_config(config) - model = HuggingFaceModel( - model=model, - tokenizer=None, - eval_metrics=[InContextLearningLMAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - assert 'metrics/lambada/InContextLearningLMAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('num_fewshot', [0, 5]) -@pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl']) -@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 8 - dl = get_icl_task_dataloader( - 'schema', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(tmp_path / 'icl.jsonl'), - ) - - evaluator = Evaluator(label='winograd', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy']) - - model = HuggingFaceModel( - model=tiny_gpt2_model, - tokenizer=tokenizer, - 
eval_metrics=[InContextLearningMultipleChoiceAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - trainer.eval(eval_dataloader=evaluator) - assert 'metrics/winograd/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/winograd/InContextLearningMultipleChoiceAccuracy'][0][1].item() > 0 - num_samples = 0 - with open(dataset_uri) as f: - for _ in f: - num_samples += 1 - assert trainer.state.eval_metrics['winograd']['InContextLearningMultipleChoiceAccuracy'].total == num_samples - - -@pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 5]) -@device('gpu') -@world_size(1, 2) -@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model, - tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 8 - max_seq_len = 64 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - reproducibility.seed_all(1234) - dls = get_icl_task_dataloader('multiple_choice', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - has_categories=True) - - assert isinstance(dls, dict) - evaluators = [ - Evaluator(label='mmlu/' + k, dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy']) - for k, dl in dls.items() - ] - - model = HuggingFaceModel( - model=tiny_gpt2_model, - tokenizer=tiny_gpt2_tokenizer, - eval_metrics=[InContextLearningMultipleChoiceAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, loggers=in_memory_logger) - trainer.eval(eval_dataloader=evaluators) - assert 'metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/mmlu/computer_security/InContextLearningMultipleChoiceAccuracy'][0][1].item( - ) > 0 - total = trainer.state.eval_metrics['mmlu/computer_security']['InContextLearningMultipleChoiceAccuracy'].total - dist.all_reduce(total) # type: ignore - assert total.item() == 4 # type: ignore - - -@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 5]) -@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -@device('gpu') -@world_size(1, 2) -def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, - tiny_gpt2_model): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 8 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - - # seed because the fewshot selection is 
currently unseeded - reproducibility.seed_all(1234) - dl = get_icl_task_dataloader( - 'multiple_choice', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=64, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - - evaluator = Evaluator(label='mc', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy']) - - model = HuggingFaceModel( - model=tiny_gpt2_model, - tokenizer=tiny_gpt2_tokenizer, - eval_metrics=[InContextLearningMultipleChoiceAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - trainer.eval(eval_dataloader=evaluator) - assert 'metrics/mc/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/mc/InContextLearningMultipleChoiceAccuracy'][0][1].item() >= 0 - num_samples = 0 - with open(dataset_uri) as f: - for _ in f: - num_samples += 1 - total = trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total - dist.all_reduce(total) # type: ignore - assert total.item() == num_samples # type: ignore - - -@pytest.mark.parametrize('num_fewshot', [0, 5]) -@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) -@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -@device('gpu') -@world_size(1, 2) -def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot, - dataset_uri, tmp_path): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_opt_tokenizer - - batch_size = 4 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - - evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) - model = HuggingFaceModel( - model=tiny_opt_model, - tokenizer=tokenizer, - eval_metrics=[InContextLearningQAAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('num_fewshot', [5]) -@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) -@device('gpu') -@world_size(1, 2) -@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, 
num_fewshot, - dataset_uri, tmp_path): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_opt_tokenizer - - batch_size = 4 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter="A: Let's think step by step. ", - cot_delimiter=' #### ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - - evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) - model = HuggingFaceModel( - model=tiny_opt_model, - tokenizer=tokenizer, - eval_metrics=[InContextLearningQAAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 5]) -@device('gpu') -@world_size(1, 2) -@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, - tmp_path): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - - evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) - - model = HuggingFaceModel( - model=tiny_gpt2_model, - tokenizer=tiny_gpt2_tokenizer, - eval_metrics=[InContextLearningQAAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - assert 'metrics/triviaqa/InContextLearningQAAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/triviaqa/InContextLearningQAAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [5]) -@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -@device('gpu') -@world_size(1, 2) -def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, 
tiny_gpt2_model, - tmp_path): - pytest.importorskip('datasets') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter="A: Let's think step by step", - cot_delimiter=' #### ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - ) - - evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) - - model = HuggingFaceModel( - model=tiny_gpt2_model, - tokenizer=tiny_gpt2_tokenizer, - eval_metrics=[InContextLearningQAAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - assert 'metrics/gsm8k/InContextLearningQAAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/gsm8k/InContextLearningQAAccuracy'][0][1].item() == 0 - - -def test_code_eval_requires_envvar(monkeypatch): - monkeypatch.delenv('CODE_EVAL_DEVICE', raising=False) - with pytest.raises(ValueError, match='Attempting to use InContextLearningCodeEvalAccuracy but.*'): - InContextLearningCodeEvalAccuracy().get_client() - - -def test_code_eval_requires_valid_envvar(monkeypatch): - monkeypatch.setenv('CODE_EVAL_DEVICE', 'bigchungus') - with pytest.raises(ValueError, match='Environment variable `CODE_EVAL_DEVICE` must be on.*'): - InContextLearningCodeEvalAccuracy().get_client() - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0]) -@pytest.mark.parametrize('generations_per_sample', range(1, 3)) -@device('gpu') -@world_size(1, 2) -@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot, - dataset_uri, tmp_path, generations_per_sample): - pytest.importorskip('datasets') - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_opt_tokenizer - batch_size = 4 - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=150, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=generations_per_sample, - ) - - evaluator = Evaluator(label='humaneval', - dataloader=dl, - metric_names=['InContextLearningCodeEvalAccuracy'], - device_eval_microbatch_size=1) - model = HuggingFaceModel( - model=tiny_opt_model, - tokenizer=tokenizer, - 
eval_metrics=[InContextLearningCodeEvalAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - torch.use_deterministic_algorithms(True) - assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0]) -@pytest.mark.parametrize('generations_per_sample', range(1, 3)) -@device('gpu') -@world_size(1, 2) -@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer, - tiny_t5_model, tmp_path, generations_per_sample): - pytest.importorskip('datasets') - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_t5_tokenizer - batch_size = 2 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=175, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=generations_per_sample, - ) - - evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy']) - model = HuggingFaceModel( - model=tiny_t5_model, - tokenizer=tiny_t5_tokenizer, - eval_metrics=[InContextLearningCodeEvalAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - torch.use_deterministic_algorithms(True) - assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -@pytest.mark.parametrize('generations_per_sample', [1]) -@pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') -@device('gpu') -@world_size(1, 2) -@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, - tiny_gpt2_model, tmp_path, generations_per_sample): - pytest.importorskip('datasets') - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = 
dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=64 * num_fewshot, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=generations_per_sample, - ) - - evaluator = Evaluator(label='humaneval', dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy']) - model = HuggingFaceModel( - model=tiny_gpt2_model, - tokenizer=tiny_gpt2_tokenizer, - eval_metrics=[InContextLearningCodeEvalAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) - torch.use_deterministic_algorithms(True) - assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys() - assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) -def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): - pytest.importorskip('datasets') - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 512 - dl = get_icl_task_dataloader('language_modeling', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' UNIQUE ', - destination_path=str(tmp_path / 'icl.jsonl')) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - first_batch = next(dl.dataloader._get_iterator()) - second_batch = next(dl.dataloader._get_iterator()) - - first_batch_text = tokenizer.decode(first_batch['input_ids'][0], skip_special_tokens=True) - second_batch_text = tokenizer.decode(second_batch['input_ids'][0], skip_special_tokens=True) - - first_batch_without_last_word = ' '.join(first_batch_text.split(' ')[:-1]) - second_batch_without_last_word = ' '.join(second_batch_text.split(' ')[:-1]) - - assert first_batch_without_last_word.endswith(' UNIQUE') - assert second_batch_without_last_word.endswith(' UNIQUE') - - assert first_batch_without_last_word.count(' UNIQUE ') == 1 - assert second_batch_without_last_word.count(' UNIQUE ') == 1 - - -@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -@pytest.mark.parametrize('prompt_string', ['Complete the voiceline: ', '']) -@pytest.mark.parametrize('hf_loading_vars', [{ - 'split': 'test', - 'name': 'juggernaut', -}]) -@pytest.mark.parametrize('hf_parsing_map', [None, {'context': ['context'], 'continuation': ['continuation']}]) -@pytest.mark.filterwarnings( - r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') -def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, - hf_loading_vars, hf_parsing_map): - pytest.importorskip('datasets') - - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - seqlen = 2048 - dl = get_icl_task_dataloader('language_modeling', - dataset_uri=dataset_uri, - tokenizer=tokenizer, 
- batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert 'input_ids' in batch - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) - assert 'attention_mask' in batch - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) - assert 'continuation_indices' in batch - assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size - assert 'mode' in batch - assert batch['mode'] == 'icl_task' - min_idx = min(batch['continuation_indices'][0]).item() - max_idx = max(batch['continuation_indices'][0]).item() - assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.' - - decoded_batch = [tokenizer.decode(row[row != tokenizer.eos_token_id]) for row in batch['input_ids']] - assert decoded_batch[0] == "Looks like it's just you and me." - assert decoded_batch[1] == "There's a fine line between bravery and stupidity." - - -@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -@pytest.mark.parametrize('prompt_string', ['What spell does this invoke? ', '']) -@pytest.mark.parametrize('hf_loading_vars', [{ - 'split': 'test', - 'name': 'invoker', -}]) -@pytest.mark.parametrize('hf_parsing_map', [{'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}]) -@pytest.mark.filterwarnings( - r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') -def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, - hf_loading_vars, hf_parsing_map): - pytest.importorskip('datasets') - - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - seqlen = 2048 - - # empirical number from the small test dataset - maximum_answer_length = 4 - - dl = get_icl_task_dataloader('question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == maximum_answer_length - assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) - - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch) - assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch) - - if len(prompt_string) > 0: - assert all(item.count('What spell does this invoke? 
') == 1 for item in decoded_batch) - assert all( - set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']])) - assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:') - assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index 9a3fa6760d..79edc3338e 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -8,11 +8,7 @@ import torch from torch.nn.functional import cross_entropy -from composer.metrics.nlp import (BinaryF1Score, InContextLearningCodeEvalAccuracy, - InContextLearningExpectedCalibrationError, InContextLearningLMAccuracy, - InContextLearningLMExpectedCalibrationError, - InContextLearningMCExpectedCalibrationError, InContextLearningMultipleChoiceAccuracy, - InContextLearningQAAccuracy, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy) +from composer.metrics.nlp import (BinaryF1Score, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy) @pytest.mark.parametrize('ignore_index', [-100]) @@ -170,242 +166,3 @@ def test_language_perplexity(): perplexity = perplexity_metric.compute() assert torch.equal(torch.exp(ce), perplexity) - - -def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer): - contexts = ['The dog is', 'I love to eat', 'I hate', 'The weather is'] - continuations = [' furry', ' pie', ' long lines', ' snowy'] - pad = tiny_gpt2_tokenizer.pad_token_id - inputs = [ - tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids'] - for context, continuation in zip(contexts, continuations) - ] - inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs]) - - cont_idxs = [] - for context, continuation in zip(contexts, continuations): - start = len(tiny_gpt2_tokenizer(context)['input_ids']) - end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids']) - cont_idxs.append(torch.tensor(list(range(start, end)))) - - batch = {'continuation_indices': cont_idxs, 'labels': inputs.roll(-1), 'input_ids': inputs} - logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100 - start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] - logits[1][start:end] = logits[0][start:end].clone() # make one of the answer's continuations incorrect - metric = InContextLearningLMAccuracy() - metric.update(batch, logits, batch['labels']) - - assert metric.compute() == 0.75 - - -def test_in_context_learning_lm_ece(tiny_gpt2_tokenizer): - contexts = ['The dog is', 'I love to eat', 'I hate', 'The weather is'] - continuations = [' furry', ' pie', ' long lines', ' snowy'] - pad = tiny_gpt2_tokenizer.pad_token_id - inputs = [ - tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids'] - for context, continuation in zip(contexts, continuations) - ] - inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs]) - - cont_idxs = [] - for context, continuation in zip(contexts, continuations): - start = len(tiny_gpt2_tokenizer(context)['input_ids']) - end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids']) - cont_idxs.append(torch.tensor(list(range(start, end)))) - - batch = {'continuation_indices': cont_idxs, 'labels': inputs.roll(-1), 'input_ids': inputs} - # logits are expected to be unnormalized and will undergo softmax, so we must multiply by 100 - logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100 - start, end = 
cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] - logits[1][start:end] = logits[0][start:end].clone() # make one of the answer's continuations incorrect - metric = InContextLearningLMExpectedCalibrationError() - metric.update(batch, logits, batch['labels']) - # all observations fall in the top confidence bucket (95%) but accuracy is only 75%, - # hence ECE should be 0.2 - assert abs(metric.compute() - 0.2) < 0.0001 - - -def test_in_context_learning_qa_accuracy(): - outputs = ['Correct but then some more text', 'Incorrect', ' the CORREct with weird casing and spacing'] - labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']] - batch = {'cot_delimiter': '', 'labels': labels} - metric = InContextLearningQAAccuracy() - metric.update(outputs, labels, batch) - - assert metric.compute() == (2 / 3) - - -def test_in_context_learning_qa_cot_accuracy(): - outputs = [ - 'chain of thought ### Correct but then some more text\n\nanother chain of thought ### Incorrect answer this time', - 'Incorrect', 'chain of thought ### the CORREct with weird casing and spacing', - 'incorrect chain of thought delimiter ## Correct but wrong delimiter' - ] - labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct'], ['correct']] - batch = {'cot_delimiter': ' ### ', 'labels': labels, 'do_normalization': True, 'stopping_criteria': '\n\n'} - metric = InContextLearningQAAccuracy() - metric.update(outputs, labels, batch) - - assert metric.compute() == (2 / 4) - - -def test_in_context_learning_code_eval_accuracy(monkeypatch): - outputs = [ - ' return 1 if n <= 1 else fib(n - 1) + fib(n - 1)', # incorrect - ' if n <= 1:\n return 1\n return fib(n-1) + fib(n-2)', # incorrect spacing - ' return n * 2', # correct - ' return 2*n', # correct - ' return n + 2', # incorrect - ' return n + 1' - ] # correct - labels = [] - prompts = ['def fib(n):\n', 'def multiply_by_two(n):\n', 'def add_one(n):\n'] - entry_points = ['fib', 'multiply_by_two', 'add_one'] - test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)']] - test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']] - languages = ['python', 'python', 'python'] - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - batch = { - # This tests deterministic beam search rather than sampling - 'generation_kwargs': { - 'num_beams': 1, - 'num_return_sequences': 2 - }, - 'prompts': prompts, - 'pass_at_k': 1, - 'entry_points': entry_points, - 'test_inputs': test_inputs, - 'test_outputs': test_outputs, - 'languages': languages, - } - metric = InContextLearningCodeEvalAccuracy() - metric.update(batch, outputs, labels) - - # pass@1 values - # program 1: 0 - # program 2: 1 - # program 3: .5 - # mean: 0.5 - assert metric.compute() == 0.5 - - -def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer): - contexts = [ - 'Q: How do you cook a cake?', 'Q: How do you cook a cake?', 'Q: How old is the earth?', - 'Q: How old is the earth?' 
- ] - continuations = [' A: turn on the oven', ' A: do a backflip', ' A: 2 minutes', ' A: 4.5 billion years'] - gold_indices = [0, 1] - choice_groupings = [(0, 2), (2, 4)] - pad = tiny_gpt2_tokenizer.pad_token_id - inputs = [ - tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids'] - for context, continuation in zip(contexts, continuations) - ] - inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs]) - - cont_idxs = [] - for context, continuation in zip(contexts, continuations): - start = len(tiny_gpt2_tokenizer(context)['input_ids']) - end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids']) - cont_idxs.append(torch.tensor(list(range(start, end)))) - - batch = { - 'continuation_indices': cont_idxs, - 'labels': inputs.roll(-1), - 'input_ids': inputs, - 'gold_indices': gold_indices, - 'choice_groupings': choice_groupings - } - logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() - - # for the first two, the correct answer is continuation 0 - # make the answer correct by making continuation 0 more likely for both answers - start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] - logits[1][start:end] = logits[0][start:end].clone() - - # for the last two, the correct answer is continuation 3 - # make the answer incorrect by making continuation 2 more likely for both answers - start, end = cont_idxs[3].tolist()[0], cont_idxs[3].tolist()[-1] - logits[3][start:end] = logits[2][start:end].clone() - - metric = InContextLearningMultipleChoiceAccuracy() - - metric.update(batch, logits, batch['labels']) - assert metric.compute() == 0.5 - - -def test_in_context_learning_mc_ece(tiny_gpt2_tokenizer): - contexts = [ - 'Q: How do you cook a cake?', 'Q: How do you cook a cake?', 'Q: How old is the earth?', - 'Q: How old is the earth?' 
- ] - continuations = [' turn on the oven', ' do a backflip', ' 2 minutes', ' 4.5 billion years'] - gold_indices = [0, 1] - choice_groupings = [(0, 2), (2, 4)] - pad = tiny_gpt2_tokenizer.pad_token_id - inputs = [ - tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids'] - for context, continuation in zip(contexts, continuations) - ] - inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs]) - - cont_idxs = [] - for context, continuation in zip(contexts, continuations): - start = len(tiny_gpt2_tokenizer(context)['input_ids']) - end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids']) - cont_idxs.append(torch.tensor(list(range(start, end)))) - - batch = { - 'continuation_indices': cont_idxs, - 'labels': inputs.roll(-1), - 'input_ids': inputs, - 'gold_indices': gold_indices, - 'choice_groupings': choice_groupings - } - logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100 - # for the first two, the correct answer is continuation 0 - # make the answer correct by making continuation 0 more likely for both answers - start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] - logits[1][start:end] = logits[0][start:end].clone() - - # for the last two, the correct answer is continuation 3 - # make the answer incorrect by making continuation 2 more likely for both answers - start, end = cont_idxs[3].tolist()[0] - 1, cont_idxs[3].tolist()[-1] - logits[3][start:end] = logits[2][start:end].clone() - - metric = InContextLearningMCExpectedCalibrationError() - - metric.update(batch, logits, batch['labels']) - - # accuracy is 50% but confidence is 95%, so ECE is 45% - assert abs(metric.compute().item() - 0.45) < 0.0001 - - -def test_in_context_learning_ece(): - metric = InContextLearningExpectedCalibrationError(n_buckets=1) - metric.update(None, None, None) # pyright: ignore [reportGeneralTypeIssues] - metric.bucket_totals[0] = 2 # pyright: ignore [reportGeneralTypeIssues] - metric.bucket_correct[0] = 1 # pyright: ignore [reportGeneralTypeIssues] - # confidence of bucket = 50%, accuracy = 50% => ECE = 0.0 - assert metric.compute() == 0.0 - - metric = InContextLearningExpectedCalibrationError(n_buckets=10) - # this example corresponds to perfect calibration across all 10 buckets - metric.update(None, None, None) # pyright: ignore [reportGeneralTypeIssues] - for i in range(len(metric.bucket_totals)): # pyright: ignore [reportGeneralTypeIssues] - upper_bound = (i + 1) / metric.n_buckets - lower_bound = i / metric.n_buckets - conf_bucket_i = (upper_bound + lower_bound) / 2 - metric.bucket_totals[i] = metric.n_buckets * 2 # pyright: ignore [reportGeneralTypeIssues] - metric.bucket_correct[i] = conf_bucket_i * metric.n_buckets * 2 # pyright: ignore [reportGeneralTypeIssues] - assert metric.compute() == 0.0 - - metric = InContextLearningExpectedCalibrationError(n_buckets=10) - # this example corresponds to perfect calibration - metric.update(None, None, None) # pyright: ignore [reportGeneralTypeIssues] - metric.bucket_totals[-1] = 2 # pyright: ignore [reportGeneralTypeIssues] - metric.bucket_correct[-1] = 0 # pyright: ignore [reportGeneralTypeIssues] - # confidence = 95%, accuracy = 0% => ece = 95% - assert metric.compute() == 0.95 From aad8b20c373959b87b7bebce84740819ecbf44d3 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Thu, 1 Feb 2024 15:59:39 -0500 Subject: [PATCH 2/2] fix --- composer/datasets/__init__.py | 11 +---- .../in_context_learning_evaluation.py | 1 - composer/metrics/__init__.py | 
20 +------- composer/metrics/nlp.py | 1 - .../test_in_context_learning_datasets.py | 48 ------------------- tests/metrics/test_nlp_metrics.py | 2 +- tests/models/test_hf_model.py | 6 +-- 7 files changed, 7 insertions(+), 82 deletions(-) diff --git a/composer/datasets/__init__.py b/composer/datasets/__init__.py index 6496c2b499..532ed67116 100644 --- a/composer/datasets/__init__.py +++ b/composer/datasets/__init__.py @@ -3,17 +3,8 @@ """Natively supported datasets.""" -from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, - InContextLearningDataset, InContextLearningLMTaskDataset, - InContextLearningMultipleChoiceTaskDataset, - InContextLearningQATaskDataset, - InContextLearningSchemaTaskDataset) +from composer.datasets.in_context_learning_evaluation import InContextLearningDataset __all__ = [ 'InContextLearningDataset', - 'InContextLearningQATaskDataset', - 'InContextLearningLMTaskDataset', - 'InContextLearningCodeEvalDataset', - 'InContextLearningMultipleChoiceTaskDataset', - 'InContextLearningSchemaTaskDataset', ] diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 2bcf2385e3..558cb01823 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -18,7 +18,6 @@ import transformers from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] - __all__ = [ 'InContextLearningDataset', ] diff --git a/composer/metrics/__init__.py b/composer/metrics/__init__.py index 26af16f5b5..fd00c205ac 100644 --- a/composer/metrics/__init__.py +++ b/composer/metrics/__init__.py @@ -5,11 +5,8 @@ from composer.metrics.map import MAP from composer.metrics.metrics import CrossEntropy, Dice, LossMetric, MIoU -from composer.metrics.nlp import (BinaryF1Score, InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, - InContextLearningLMExpectedCalibrationError, - InContextLearningMCExpectedCalibrationError, InContextLearningMetric, - InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy, - LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy) +from composer.metrics.nlp import (BinaryF1Score, InContextLearningMetric, LanguageCrossEntropy, LanguagePerplexity, + MaskedAccuracy) __all__ = [ 'MAP', @@ -21,18 +18,5 @@ 'LanguageCrossEntropy', 'MaskedAccuracy', 'LanguagePerplexity', - 'InContextLearningLMAccuracy', - 'InContextLearningMultipleChoiceAccuracy', - 'InContextLearningQAAccuracy', - 'InContextLearningMCExpectedCalibrationError', - 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMetric', - 'InContextLearningCodeEvalAccuracy', ] - -METRIC_DEFAULT_CTORS = { - 'InContextLearningLMAccuracy': InContextLearningLMAccuracy, - 'InContextLearningMultipleChoiceAccuracy': InContextLearningMultipleChoiceAccuracy, - 'InContextLearningQAAccuracy': InContextLearningQAAccuracy, - 'InContextLearningCodeEvalAccuracy': InContextLearningCodeEvalAccuracy, -} diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 90ce30d948..c12c30c65b 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -10,7 +10,6 @@ from torch import Tensor from torchmetrics import Metric - log = logging.getLogger(__name__) __all__ = [ diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 041b6c6964..f9d5479045 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ 
b/tests/datasets/test_in_context_learning_datasets.py @@ -269,54 +269,6 @@ def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): assert not 'generation_kwargs' in dl.base_batch -def test_update_generation_kwargs_no_kwargs_qa_dataset(tmp_path): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/triviaqa_small.jsonl' - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generation_kwargs=None) - assert len(dl.base_batch['generation_kwargs']) == 3 - - -def test_update_generation_kwargs_with_kwargs_qa_dataset(tmp_path): - pytest.importorskip('datasets') - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/triviaqa_small.jsonl' - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = InContextLearningQATaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - fewshot_random_seed=1234, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generation_kwargs={'temperature': 0.9}) - assert 'generation_kwargs' in dl.base_batch - assert dl.base_batch['generation_kwargs']['temperature'] == 0.9 - assert len(dl.base_batch['generation_kwargs']) == 4 - - @pytest.mark.filterwarnings( r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_construct_context(tiny_gpt2_tokenizer, tmp_path): diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index 79edc3338e..6b6dfbb009 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -8,7 +8,7 @@ import torch from torch.nn.functional import cross_entropy -from composer.metrics.nlp import (BinaryF1Score, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy) +from composer.metrics.nlp import BinaryF1Score, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy @pytest.mark.parametrize('ignore_index', [-100]) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index e677941e9e..2ae2c92528 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -19,7 +19,7 @@ from torchmetrics.regression import PearsonCorrCoef from composer.loggers import InMemoryLogger -from composer.metrics import InContextLearningLMAccuracy, LanguageCrossEntropy, MaskedAccuracy +from composer.metrics import LanguageCrossEntropy, MaskedAccuracy from composer.models import HuggingFaceModel from composer.trainer import Trainer from composer.utils import dist, is_model_fsdp @@ -961,13 +961,13 @@ def 
test_separate_eval_metrics(tiny_bert_model, tiny_bert_tokenizer): tiny_bert_model, tokenizer=tiny_bert_tokenizer, metrics=[LanguageCrossEntropy()], - eval_metrics=[MaskedAccuracy(), InContextLearningLMAccuracy()], + eval_metrics=[MaskedAccuracy()], ) assert hf_model.train_metrics is not None assert hf_model.val_metrics is not None assert hf_model.train_metrics.keys() == {'LanguageCrossEntropy'} - assert hf_model.val_metrics.keys() == {'InContextLearningLMAccuracy', 'MaskedAccuracy'} + assert hf_model.val_metrics.keys() == {'MaskedAccuracy'} @pytest.mark.parametrize('checkpoint_upload_folder', [None, 's3://checkpoints-bucket/'])