From a7770148e2b7092cd4707b9d2529371b689d0eac Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 7 May 2024 21:32:32 -0400 Subject: [PATCH] Bump version v0.9.0.dev0 (#1181) * update init.py * deprecate * delete in context learning code eval dataset * remove deprecated export * removed more code_eval stuff * oopsie * guh * fix --- llmfoundry/__init__.py | 2 +- llmfoundry/eval/__init__.py | 4 - llmfoundry/eval/datasets/__init__.py | 2 - .../in_context_learning_evaluation.py | 278 +------ llmfoundry/eval/metrics/__init__.py | 2 - llmfoundry/eval/metrics/nlp.py | 228 ------ llmfoundry/metrics/__init__.py | 4 - llmfoundry/utils/builders.py | 13 +- scripts/eval/README.md | 22 - scripts/eval/yamls/coding_tasks.yaml | 65 -- scripts/eval/yamls/eval_gauntlet_v0.1.yaml | 28 - .../eval/test_in_context_learning_datasets.py | 680 ------------------ tests/eval/test_nlp_metrics.py | 70 -- 13 files changed, 3 insertions(+), 1395 deletions(-) delete mode 100644 scripts/eval/yamls/coding_tasks.yaml diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index c32b9736df..c9666566bf 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -71,4 +71,4 @@ 'utils', ] -__version__ = '0.8.0.dev0' +__version__ = '0.9.0.dev0' diff --git a/llmfoundry/eval/__init__.py b/llmfoundry/eval/__init__.py index 54f8217920..90496ce0c7 100644 --- a/llmfoundry/eval/__init__.py +++ b/llmfoundry/eval/__init__.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.eval.datasets.in_context_learning_evaluation import ( - InContextLearningCodeEvalDataset, InContextLearningDataset, InContextLearningGenerationTaskWithAnswersDataset, InContextLearningLMTaskDataset, @@ -11,7 +10,6 @@ get_icl_task_dataloader, ) from llmfoundry.eval.metrics.nlp import ( - InContextLearningCodeEvalAccuracy, InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError, @@ -25,14 +23,12 @@ 'InContextLearningLMTaskDataset', 'InContextLearningMultipleChoiceTaskDataset', 'InContextLearningSchemaTaskDataset', - 'InContextLearningCodeEvalDataset', 'InContextLearningGenerationTaskWithAnswersDataset', 'get_icl_task_dataloader', 'InContextLearningMetric', 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningGenerationExactMatchAccuracy', - 'InContextLearningCodeEvalAccuracy', 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMCExpectedCalibrationError', ] diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py index 517dc3e1f3..02a2b88b21 100644 --- a/llmfoundry/eval/datasets/__init__.py +++ b/llmfoundry/eval/datasets/__init__.py @@ -4,7 +4,6 @@ """Natively supported in-context learning evaluation datasets.""" from llmfoundry.eval.datasets.in_context_learning_evaluation import ( - InContextLearningCodeEvalDataset, InContextLearningDataset, InContextLearningGenerationTaskWithAnswersDataset, InContextLearningLMTaskDataset, @@ -28,7 +27,6 @@ 'InContextLearningDataset', 'InContextLearningGenerationTaskWithAnswersDataset', 'InContextLearningLMTaskDataset', - 'InContextLearningCodeEvalDataset', 'InContextLearningMultipleChoiceTaskDataset', 'InContextLearningSchemaTaskDataset', 'get_icl_task_dataloader', diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index 7ca1cfe7f6..debb0dbc6f 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ 
b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -8,7 +8,6 @@ import logging import os import random -import warnings from typing import Any, Dict, Iterable, List, Optional, Sequence, Union import torch @@ -30,7 +29,6 @@ tokenizer_needs_prefix_space, trim_context, ) -from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -42,7 +40,6 @@ 'InContextLearningLMTaskDataset', 'InContextLearningMultipleChoiceTaskDataset', 'InContextLearningSchemaTaskDataset', - 'InContextLearningCodeEvalDataset', 'InContextLearningGenerationTaskWithAnswersDataset', 'get_icl_task_dataloader', ] @@ -1292,245 +1289,6 @@ def tokenize_example( return tokenized_example -class InContextLearningCodeEvalDataset(InContextLearningDataset): - """A dataset that constructs batches for in-context learning code. - - evaluation. - - The input format is expected to be a jsonl file with the following fields: - - - task_id: Label of given task - - prompt: The code snippet that must be completed - - entry_point: The entry to the function/code snippet to generate - - canonical_solution: Working solution - - test: The checker code that will run to completion if the code generation is valid and otherwise throw assertion - - test_inputs: List of test inputs - - test_outputs: List of test outputs - - language: The language of the code snippet - - Each batch then consists of the following the structure - - - input_ids: Input tensor batch x seqlen x num tokens - - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - - mode: Always set to 'generate' - - labels: Exact solution for the coding problem - - prompts: Prompt for the task - - entry_points: List of entry points - - test_inputs: List of test inputs - - test_outputs: List of test outputs - - languages: List of languages - - pass_at_k: Passed value for pass_at_k - - generation_kwargs: Dictionary of kwargs needed for generation. Includes the following, which will be individually overwritten - by keys in generation_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig - for more details): - - - pad_token_id: ID for padding token, derived automatically - - num_beams: How many beams to search for generations, default set to 1 - - do_sample: Determines whether model is sampling or greedily decoding. Always set to True - - use_cache: Whether or not to use past key values to speed up sampling. 
Always set to True - - Additional Args: - generations_per_sample (int) (defaults to 1): The number of independently computed returned sequences for each element in the batch - pass_at_k (int) (defaults to 1): k for how many chances the model gets to write passing code - """ - - def __init__( - self, - generations_per_sample: int, - pass_at_k: Union[int, list[int]] = 1, - *args: Any, - **kwargs: Any, - ): - if isinstance(pass_at_k, int): - pass_at_k = [pass_at_k] - if generations_per_sample < max(pass_at_k): - raise ValueError( - f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.', - ) - batch_mapping = { - 'input_ids': 'prompt', - 'prompts': 'prompt_text', - 'tests': 'test', - 'labels': 'canonical_solution', - 'entry_points': 'entry_point', - 'test_inputs': 'test_inputs', - 'test_outputs': 'test_outputs', - 'languages': 'language', - 'sample_id': 'sample_id', - } - # Linting complains if these are not set in init - self.max_prompt_length = 0 - self.max_answer_length = 0 - static_keys = [ - 'mode', - 'pass_at_k', - 'generation_kwargs', - 'generations_per_sample', - 'dataset_size', - ] - list_keys = [ - 'prompts', - 'tests', - 'entry_points', - 'test_inputs', - 'test_outputs', - 'languages', - 'labels', - 'sample_id', - ] - tensor_keys = ['input_ids', 'attention_mask'] - super().__init__( - context_key='prompt', - answer_key='canonical_solution', - strip_dataset=False, - static_keys=static_keys, - list_keys=list_keys, - tensor_keys=tensor_keys, - tokenize_labels=False, - padding_side='left', - batch_mapping=batch_mapping, - *args, - **kwargs, - ) - self._set_max_prompt_and_answer_lengths() - if self.max_seq_len < self.max_prompt_length: - log.warning(f'`max_seq_len` {self.max_seq_len} was less than `max_prompt_len`: {self.max_prompt_length}' \ - + ' setting `max_seq_len`=`max_prompt_len`') - self.max_seq_len = self.max_prompt_length - dataset_size = len(self.dataset) - self.dataset = self.dataset.map(self._trim_padding) - self.dataset = self.repeat_dataset(self.dataset, generations_per_sample) - - if self.max_answer_length < self.max_seq_len - self.max_prompt_length: - max_new_tokens = self.max_answer_length - else: - max_new_tokens = self.max_seq_len - self.max_prompt_length - - self.base_batch = { - 'input_ids': [], - 'mode': 'generate', - 'labels': [], - 'prompts': [], - 'tests': [], - 'entry_points': [], - 'test_inputs': [], - 'test_outputs': [], - 'languages': [], - 'pass_at_k': pass_at_k, - 'generation_kwargs': { - 'pad_token_id': self.pad_tok_id, - 'num_beams': 1, # single beam - 'do_sample': True, - 'temperature': 0.2, # good default for code - 'use_cache': True, - 'eos_token_id': self.tokenizer.eos_token_id, - 'max_new_tokens': max(max_new_tokens, 1), - }, - 'sample_id': [], - 'pass_at_k': list(pass_at_k), - 'generations_per_sample': generations_per_sample, - 'dataset_size': dataset_size, - } - if 'generation_kwargs' in kwargs: - self.update_generation_kwargs(kwargs['generation_kwargs']) - - def repeat_dataset(self, dataset: HFDataset, repetitions: int) -> HFDataset: - - def _repeat_dataset(): - for i, sample in enumerate(dataset): - for _ in range(repetitions): - assert isinstance(sample, dict) - yield {'sample_id': i, **sample} - - from datasets import \ - Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] - - repeated_dataset = HFDataset.from_generator(_repeat_dataset) - assert isinstance(repeated_dataset, HFDataset) - return repeated_dataset - - def 
_set_max_prompt_and_answer_lengths(self): - """Iterates through the dataset and finds the maximum prompt length and. - - sequence lengths. - - Returns: - None - """ - max_prompt_length = 0 - max_answer_length = 0 - for example in self.dataset: - assert isinstance(example, Dict) - unpadded_example = [ - token for token in example[self.context_key] - if token != self.pad_tok_id - ] - max_prompt_length = max(max_prompt_length, len(unpadded_example)) - - tokenized_answer = self.tokenizer( - example['canonical_solution'], - add_special_tokens=False, - )['input_ids'] - assert isinstance(tokenized_answer, list) - len_tokenized_answer = len(tokenized_answer) - max_answer_length = max(max_answer_length, len_tokenized_answer) - - self.max_prompt_length = max_prompt_length - self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH - - def _trim_padding(self, example: Dict): - """Adjusts padding to the maximum prompt length rather than max_seq_len. - - Needs to be done after the dataset has been processed because we don't - know the maximum prompt length until after we've tokenized it. - - Returns: - dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key] - """ - # Remove padding tokens applied during tokenization - unpadded_prompt = [ - token for token in example[self.context_key] - if token != self.pad_tok_id - ] - # Reapply padding only to max_prompt_length - full_prompt = trim_context(unpadded_prompt, [], self.max_prompt_length) - padded_context = make_padded_input( - full_prompt, - [], - self.max_prompt_length, - self.pad_tok_id, - self.padding_side, - ) - - example[self.context_key] = padded_context - return example - - def tokenize_example( - self, - prompt_and_fewshot: str, - ctxt: str, - example: Dict, - ) -> Dict[str, Any]: - """Adds extra code task details to the example dictionary. - - See InContextLearningDataset for more details - """ - tokenized_example = super().tokenize_example( - prompt_and_fewshot, - ctxt, - example, - ) - tokenized_example['prompt_text'] = example['prompt'] - tokenized_example['task_id'] = example['task_id'] - tokenized_example['canonical_solution'] = example['canonical_solution'] - tokenized_example['test'] = example['test'] - tokenized_example['entry_point'] = example['entry_point'] - tokenized_example['test_inputs'] = example['test_inputs'] - tokenized_example['test_outputs'] = example['test_outputs'] - tokenized_example['language'] = example['language'] - return tokenized_example - - def build_icl_dataloader( icl_task_type: str, dataset_uri: str, @@ -1621,14 +1379,7 @@ def build_icl_dataloader( generation_kwargs=generation_kwargs, ) effective_batchsize = batch_size - elif icl_task_type == 'generation_task_with_answers' or icl_task_type == 'question_answering': - if icl_task_type == 'question_answering': - warnings.warn( - VersionedDeprecationWarning( - "ICL task type 'question_answering' is now deprecated. Use identifier 'generation_task_with_answers'", - 'v0.9.0', - ), - ) + elif icl_task_type == 'generation_task_with_answers': dataset = InContextLearningGenerationTaskWithAnswersDataset( dataset_uri=dataset_uri, tokenizer=tokenizer, @@ -1649,32 +1400,6 @@ def build_icl_dataloader( generation_kwargs=generation_kwargs, ) effective_batchsize = batch_size - elif icl_task_type == 'code_evaluation': - warnings.warn( - VersionedDeprecationWarning( - "ICL task type 'code_evaluation' is deprecated and will no longer be supported. 
", - 'v0.9.0', - ), - ) - dataset = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1686,7 +1411,6 @@ def build_icl_dataloader( ( InContextLearningMultipleChoiceTaskDataset, InContextLearningGenerationTaskWithAnswersDataset, - InContextLearningCodeEvalDataset, ), ): split_batch = dataset.split_batch diff --git a/llmfoundry/eval/metrics/__init__.py b/llmfoundry/eval/metrics/__init__.py index 6a50fcb484..03fd4407c1 100644 --- a/llmfoundry/eval/metrics/__init__.py +++ b/llmfoundry/eval/metrics/__init__.py @@ -4,7 +4,6 @@ """A collection of common torchmetrics.""" from llmfoundry.eval.metrics.nlp import ( - InContextLearningCodeEvalAccuracy, InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError, @@ -18,7 +17,6 @@ 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningGenerationExactMatchAccuracy', - 'InContextLearningCodeEvalAccuracy', 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMCExpectedCalibrationError', ] diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py index a7764a0d0a..3ee30ebf5e 100644 --- a/llmfoundry/eval/metrics/nlp.py +++ b/llmfoundry/eval/metrics/nlp.py @@ -6,21 +6,11 @@ import copy import functools import logging -import os import re import string -import warnings from typing import Any, Callable, Dict, List -import numpy as np import torch -from composer.utils import dist -from composer.utils.eval_client import ( - EvalClient, - LambdaEvalClient, - LocalEvalClient, - MosaicMLLambdaEvalClient, -) from torch import Tensor from torch.nn import functional as F from torchmetrics import Metric @@ -32,7 +22,6 @@ 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningGenerationExactMatchAccuracy', - 'InContextLearningCodeEvalAccuracy', 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMCExpectedCalibrationError', ] @@ -408,223 +397,6 @@ def compute(self): return self.correct.float() / self.total -class InContextLearningCodeEvalAccuracy(InContextLearningMetric): - r"""Computes accuracy for In-context learning (ICL) code evaluation tasks. - - ICL code eval tasks consist of some number of example code eval tasks (referred to as the 'context'), followed by a test task where the model must - complete the code, where we term the code completion a 'continuation'. - - In each case, the model constructs a given number of continuations (termed pass@K for K continuations), and each continuation is run against a set of test cases. The model is considered - correct if at least one of the proposed continuations passes all the test cases. - - Runs on AWS Lambdas by default. - - Adds metric state variables: - correct (float): The number of instances where the predictions passed all the test cases. - total (float): The number of total instances that were predicted. 
- - Args: - dist_sync_on_step (bool, optional): Synchronize metric state across processes at - each forward() before returning the value at the step. Default: ``False``. - """ - - # Make torchmetrics call update only once - full_state_update = False - - def __init__(self, dist_sync_on_step: bool = False): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - - self._initialized = False - self.dataset_size = 0 - self.pass_at_k = [] - self.num_generations = 0 - self.eval_device = os.environ.get('CODE_EVAL_DEVICE', None) - if self.eval_device is not None: - self.eval_device = self.eval_device.upper() - self.metric_result_dict = { - 'context': [], - 'output': [], - 'result': [], - 'sample_id': [], - } - - def get_client(self) -> EvalClient: - """Returns a client for the appropriate remote platform.""" - client = None - if self.eval_device == 'LOCAL': - warnings.warn( - 'Running code eval locally may be insecure. Please set environment variable CODE_EVAL_DEVICE ' - + - 'to LAMBDA to run on remote. To use Lambdas, spin up your instance that checks code, set the URL as ' - + 'CODE_EVAL_URL and the API key as CODE_EVAL_APIKEY.', - ) - log.debug('Running code eval locally.') - client = LocalEvalClient() - elif self.eval_device == 'LAMBDA': - client = LambdaEvalClient() - elif self.eval_device == 'MOSAICML': - client = MosaicMLLambdaEvalClient() - elif self.eval_device is None: - raise ValueError( - 'Attempting to use InContextLearningCodeEvalAccuracy but environment ' - + - 'variable `CODE_EVAL_DEVICE` is not set. Please set it to `CODE_EVAL_DEVICE` ' - + - 'to one of `LOCAL` (for unsafe local eval), `LAMBDA` (for AWS lambda ' - + 'evaluation), or `MOSAICML` (for lambda eval through MAPI).', - ) - else: - raise ValueError( - 'Environment variable `CODE_EVAL_DEVICE` must be one of `LOCAL`, ' - + f'`LAMBDA`, or `MOSAICML` but got {self.eval_device}.', - ) - - return client - - def estimator(self, n: int, c: int, k: int) -> float: - """Computes the pass@k metric. - - Given the number of generated samples, n, the number of correct samples, c, and the k of interest, - this function calculates pass@k as 1 - comb(n - c, k) / comb(n, k) as per the definition of - pass@k in the HumanEval paper (https://arxiv.org/abs/2107.03374) and it's associated implementation: - https://github.com/openai/human-eval. - """ - if n - c < k: - return 1.0 - return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1))) - - def _initialize_state(self, batch: dict[str, Any]): - device = batch['input_ids'].device - self.dataset_size = batch['dataset_size'] - self.pass_at_k = batch['pass_at_k'] - self.num_generations = batch['generations_per_sample'] - - # We need to defer the accumulator initialization because it depends on dataset size - self.add_state( - 'correct', - default=torch.zeros(self.dataset_size, device=device), - dist_reduce_fx='sum', - ) - self.add_state( - 'total', - default=torch.zeros(self.dataset_size, device=device), - dist_reduce_fx='sum', - ) - dist.barrier() - self._initialized = True - - def update( - self, - batch: Dict[str, Any], - outputs: List[str], - labels: List[str], - ): - """Updates the pass@k accuracy of code generation. - - Given a batch of prompts, test cases, and code generations, evaluates the code generations - against the test cases and augments the pass@k accuracy of the batch to the values so far. - - Args: - batch (Dict[str, Any]): A batch of data produced by the InContextLearningCodeEvalDataset, with - the prompt, test cases, and entry points. 
This will be a dictionary that must have the following - arguments: - { - 'prompts': List[str], - 'test_inputs': List[List[str]], - 'test_outputs': List[List[str]], - 'entry_points': List[str], - 'languages': List[str], - 'generation_kwargs': Dict[str, Any] - } - outputs (List[str]): A list of code generations in the format of HF generate with beam search, - which is the a list of strings in groups of beam_size e.g. for beam size 2 and batch size 2, the list - will be of the format [prompt 1 gen 1, prompt 1 gen 2, prompt 2 gen 1, prompt 2 gen 2] - labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate - functionalities. This is not used. - """ - if not self._initialized: - self._initialize_state(batch) - - del labels # never used - client = self.get_client() - - metric_result_dict = copy.deepcopy(self.metric_result_dict) - for sample_id, code_gen, sample_prompt, test_inputs, test_outputs, entry_point, language in zip( - batch['sample_id'], - outputs, - batch['prompts'], - batch['test_inputs'], - batch['test_outputs'], - batch['entry_points'], - batch['languages'], - ): - - idx = sample_id - self.total[idx] += 1.0 - metric_result_dict['sample_id'].append(sample_id) - - code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[ - 0] # remove everything after function ends - final_code = sample_prompt + code_gen # combine prompt with the code generation - metric_result_dict['context'].append(sample_prompt) - metric_result_dict['output'].append(code_gen) - - test_results = [] - for test_input, test_output in zip(test_inputs, test_outputs): - payload = { - 'code': final_code, - 'input': test_input, - 'output': test_output, - 'entry_point': entry_point, - 'language': language, - } - - result = client.invoke([[[payload]]])[0][0][0] - test_results.append(result) - - if all(test_results): - self.correct[idx] += 1.0 - metric_result_dict['result'].append(1) - else: - metric_result_dict['result'].append(0) - - client.close() # pyright: ignore [reportOptionalMemberAccess] - return metric_result_dict - - def compute(self): - assert isinstance(self.correct, Tensor) - assert isinstance(self.total, Tensor) - complete = self.total == self.num_generations # so that eval subset batches can be used - - if complete.sum() < (self.total != 0).sum(): - warnings.warn( - 'Some samples in the dataset have less than the expected number of generations. ' - + - 'This is expected if you are using a subset of the dataset for evaluation.', - ) - - if (self.correct > self.total).any().item(): - raise ValueError( - 'Internal error some samples have more correct than total generations. This should not happen.', - ) - - results = {} - n = self.num_generations - - for k in self.pass_at_k: - pass_at_k = sum([ - self.estimator(n, int(c.item()), k) - for c in self.correct[complete] - ]) / complete.sum().item() - results[f'pass@{k}'] = torch.tensor(pass_at_k) - - if len(results) == 1: # backwards compatibility - return list(results.values())[0] - - return results - - class InContextLearningExpectedCalibrationError(InContextLearningMetric): """Generic class for Expected Calibration Error (ECE). 
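The `pass_at_k` values reported by the metric above follow the unbiased HumanEval estimator described in its docstring, `1 - comb(n - c, k) / comb(n, k)` for `n` generations per problem of which `c` pass all tests. A minimal standalone sketch of that same calculation (the function name here is chosen purely for illustration):

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: 1 - comb(n - c, k) / comb(n, k), computed as a stable product."""
    if n - c < k:
        # Fewer failing generations than k: every size-k sample contains at least one pass.
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# Example: 5 generations per problem, 2 of which pass all test cases.
assert abs(pass_at_k(5, 2, 1) - 0.4) < 1e-9  # 1 - C(3,1)/C(5,1)
assert abs(pass_at_k(5, 2, 3) - 0.9) < 1e-9  # 1 - C(3,3)/C(5,3)
```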
diff --git a/llmfoundry/metrics/__init__.py b/llmfoundry/metrics/__init__.py index 18067c3283..81a8b91e28 100644 --- a/llmfoundry/metrics/__init__.py +++ b/llmfoundry/metrics/__init__.py @@ -8,7 +8,6 @@ ) from llmfoundry.eval.metrics import ( - InContextLearningCodeEvalAccuracy, InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError, @@ -33,7 +32,6 @@ 'qa_accuracy', func=InContextLearningGenerationExactMatchAccuracy, ) -metrics.register('code_eval_accuracy', func=InContextLearningCodeEvalAccuracy) metrics.register('language_cross_entropy', func=LanguageCrossEntropy) metrics.register('language_perplexity', func=LanguagePerplexity) metrics.register('masked_accuracy', func=MaskedAccuracy) @@ -53,7 +51,6 @@ 'mc_expected_calibration_error', 'mc_accuracy', 'qa_accuracy', - 'code_eval_accuracy', ] DEFAULT_ENC_DEC_METRICS = [ @@ -68,7 +65,6 @@ 'InContextLearningMCExpectedCalibrationError', 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningGenerationExactMatchAccuracy', - 'InContextLearningCodeEvalAccuracy', 'DEFAULT_CAUSAL_LM_TRAIN_METRICS', 'DEFAULT_CAUSAL_LM_EVAL_METRICS', 'DEFAULT_ENC_DEC_METRICS', diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 1c9dbc54a3..39025b8066 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -6,7 +6,6 @@ import logging import os import re -import warnings from collections import OrderedDict from typing import ( Any, @@ -38,7 +37,6 @@ get_icl_task_dataloader from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper from llmfoundry.utils.registry_utils import construct_from_registry -from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -531,19 +529,10 @@ def _validate_cfg(icl_cfg: DictConfig): icl_cfg.metric_names = [ 'InContextLearningMultipleChoiceAccuracy', ] - elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering': - if icl_cfg.icl_task_type == 'question_answering': - warnings.warn( - VersionedDeprecationWarning( - "ICL task type 'question_answering' is now deprecated. Use identifier 'generation_task_with_answers'", - 'v0.9.0', - ), - ) + elif icl_cfg.icl_task_type == 'generation_task_with_answers': icl_cfg.metric_names = [ 'InContextLearningGenerationExactMatchAccuracy', ] - elif icl_cfg.icl_task_type == 'code_evaluation': - icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy'] else: raise ValueError( f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.', diff --git a/scripts/eval/README.md b/scripts/eval/README.md index 3a748066ec..b1fd6d148e 100644 --- a/scripts/eval/README.md +++ b/scripts/eval/README.md @@ -160,7 +160,6 @@ llm-foundry currently supports five ICL formats: 2. InContextLearningLMTaskDataset 3. InContextLearningMultipleChoiceTaskDataset 4. InContextLearningSchemaTaskDataset -5. 
InContextLearningCodeEvalDataset ---- @@ -348,27 +347,6 @@ Below is a YAML section that works with the Winograd dataset in [`scripts/eval/l > ----- - -### InContextLearningCodeEvalDataset - -The ICL CodeEvalDataset takes a prompt, and, working with the NLP metric [InContextLearningCodeEvalAccuracy](https://docs.mosaicml.com/projects/composer/en/latest/api_reference/generated/composer.metrics.InContextLearningCodeEvalAccuracy.html), generates code which gets run against the supplied tests, as in HumanEval ([Evaluating Large Language Models Trained on Code](https://arxiv.org/abs/2107.03374)) and MBPP ([Program Synthesis with Large Language Models](https://arxiv.org/abs/2108.07732)). This generation involves many decoding steps, so can take longer per sample than other ICL tasks. An example datum: - -```json -{"task_id": "JavaScript/2", "prompt": "/* Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncateNumber(3.5)\n 0.5\n */\nconst truncateNumber = (number) => {\n", "canonical_solution": " return number % 1.0;\n}\n\n", "test": "const testTruncateNumber = () => {\n console.assert(truncateNumber(3.5) === 0.5)\n\n console.assert(Math.abs(truncateNumber(1.33) - 0.33) < 1e-6)\n\n console.assert(Math.abs(truncateNumber(123.456 - 0.456) < 1e-6))\n}\n\ntestTruncateNumber()\n", "entry_point": "truncateNumber", "test_inputs": ["3.5", "1.33", "123.456"], "test_outputs": ["0.5", "0.33", "0.456"], "language": "javascript"} -``` - -Required keys for each datum: - -* `prompt: str` -* `test: str` -* `entry_point: str` -* `test_inputs: List[str]` -* `test_outputs: List[str]` -* `language: str` - -Code evaluation can happen locally (insecure) or inside an AWS Lambda function sandbox. This is controlled by setting the environment variable `CODE_EVAL_DEVICE` to `LOCAL` or `LAMBDA`. If set to `LAMBDA`, you must also provide `CODE_EVAL_URL` and `CODE_EVAL_APIKEY` to query the API gateway in the AWS Sandbox. 
- ---- ### Build your own dataset (BYOD) diff --git a/scripts/eval/yamls/coding_tasks.yaml b/scripts/eval/yamls/coding_tasks.yaml deleted file mode 100644 index 78f2a213bc..0000000000 --- a/scripts/eval/yamls/coding_tasks.yaml +++ /dev/null @@ -1,65 +0,0 @@ -icl_tasks: -- - label: human_eval - dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_cpp - dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_js - dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_simple - dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_complex - dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_25 - dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_50 - dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_75 - dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - generations_per_sample: 5 - batch_size: 1 - icl_task_type: code_evaluation diff --git a/scripts/eval/yamls/eval_gauntlet_v0.1.yaml b/scripts/eval/yamls/eval_gauntlet_v0.1.yaml index eb860db8be..2dcd8a6a18 100644 --- a/scripts/eval/yamls/eval_gauntlet_v0.1.yaml +++ b/scripts/eval/yamls/eval_gauntlet_v0.1.yaml @@ -170,31 +170,3 @@ eval_gauntlet: - name: bbq num_fewshot: 3 random_baseline: 0.5 -# THIS CATEGORY IS PARTICULARLY SLOW, USE SPARINGLY. 
-# TASKS ARE DEFINED IN `coding_tasks.yaml` -# - name: programming -# benchmarks: -# - name: human_eval -# num_fewshot: 0 -# random_baseline: 0.0 -# - name: human_eval_cpp -# num_fewshot: 0 -# random_baseline: 0.0 -# - name: human_eval_js -# num_fewshot: 0 -# random_baseline: 0.0 -# - name: human_eval_return_simple -# num_fewshot: 0 -# random_baseline: 0.0 -# - name: human_eval_return_complex -# num_fewshot: 0 -# random_baseline: 0.0 -# - name: human_eval_25 -# num_fewshot: 0 -# random_baseline: 0.0 -# - name: human_eval_50 -# num_fewshot: 0 -# random_baseline: 0.0 -# - name: human_eval_75 -# num_fewshot: 0 -# random_baseline: 0.0 diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py index ea87ed17d0..9660354c83 100644 --- a/tests/eval/test_in_context_learning_datasets.py +++ b/tests/eval/test_in_context_learning_datasets.py @@ -4,7 +4,6 @@ import contextlib import os import random -import types from pathlib import Path from typing import Dict, List, Optional @@ -21,7 +20,6 @@ from torch.utils.data import DataLoader from llmfoundry.eval.datasets import ( - InContextLearningCodeEvalDataset, InContextLearningDataset, InContextLearningGenerationTaskWithAnswersDataset, InContextLearningMultipleChoiceTaskDataset, @@ -35,7 +33,6 @@ trim_context, ) from llmfoundry.eval.metrics import ( - InContextLearningCodeEvalAccuracy, InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, @@ -890,72 +887,6 @@ def test_qa_tokenize_example( ] -def test_code_adjust_padding( - tiny_gpt2_tokenizer: transformers.AutoTokenizer, - tmp_path: Path, -): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/human_eval_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - prompt_string = '' - gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000} - - dl = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Code start:', - continuation_delimiter='\nPlease code:', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - generation_kwargs=gen_kwargs, - generations_per_sample=10, - ) - - assert all( - len(data['prompt']) == 148 for data in dl.dataset - ) # pyright: ignore [reportGeneralTypeIssues] - - -def test_code_update_gen_kwargs( - tiny_gpt2_tokenizer: transformers.AutoTokenizer, - tmp_path: Path, -): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/human_eval_small.jsonl' - tokenizer = tiny_gpt2_tokenizer - seqlen = 2048 - num_fewshot = 0 - prompt_string = '' - gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000} - - dl = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Code start:', - continuation_delimiter='\nPlease code:', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - generation_kwargs=gen_kwargs, - generations_per_sample=10, - ) - assert dl.base_batch['generation_kwargs']['num_beams'] == 9000 - assert dl.base_batch['generation_kwargs']['top_p'] == .95 - assert 
dl.base_batch['generation_kwargs']['temperature'] == .9 - assert dl.base_batch['generation_kwargs']['do_sample'] == True - - def test_mc_tokenize_example( tiny_gpt2_tokenizer: transformers.AutoTokenizer, tmp_path: Path, @@ -1897,404 +1828,6 @@ def test_mc_task_dataloader( ) == ' Pour it onto a plate' -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -def test_code_eval_split_batch(dataset_uri: str, tmp_path: Path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - - tokenizer = transformers.AutoTokenizer.from_pretrained( - 'EleutherAI/gpt-neox-20b', - ) # type: ignore reportUnboundVariable - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=5, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=2, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=3, - ) - - assert isinstance(dl, DataSpec) # pyright - batches = list(dl.dataloader) - - for k in ('input_ids', 'attention_mask'): - assert [b[k].shape[0] for b in batches] == [5, 5, 2] - - list_keys = { - 'labels': str, - 'prompts': str, - 'tests': str, - 'entry_points': str, - 'test_inputs': list, - 'test_outputs': list, - 'languages': str, - } - - for batch, size in zip(batches, [5, 5, 2]): - for field, type_ in list_keys.items(): - assert len(batch[field]) == size - assert all(isinstance(val, type_) for val in batch[field]) - - static_keys = {'pass_at_k': (int, list), 'generation_kwargs': dict} - for batch in batches: - assert 'generation_kwargs' in batch - assert 'max_new_tokens' in batch['generation_kwargs'] - assert isinstance(batch['generation_kwargs']['max_new_tokens'], int) - for field, type_ in static_keys.items(): - assert isinstance(batch[field], type_) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -@pytest.mark.parametrize('prompt_string', ['Please code:\n', '']) -@pytest.mark.parametrize('generations_per_sample', [1, 3]) -def test_code_eval_sentpiece_dataloader( - dataset_uri: str, - tmp_path: Path, - num_fewshot: int, - prompt_string: str, - generations_per_sample: int, - tiny_llama_tokenizer: transformers.AutoTokenizer, -): - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_llama_tokenizer - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 5 - seqlen = 2048 - - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - generations_per_sample=generations_per_sample, - ) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batches = list(dl.dataloader) - dataset_size = len(open(dataset_uri, 'r').read().strip().split('\n')) - dataset_size *= generations_per_sample - - max_prompt_length = 0 - - has_left_padding = [] - for i, batch in enumerate(batches): - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - 
max_prompt_length = dl.dataloader.dataset.max_prompt_length - N = len(batches) - bs = batch_size if i < N - 1 else dataset_size - (N - 1) * batch_size - assert tuple(batch['input_ids'].shape) == (bs, max_prompt_length) - assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_kwargs']['max_new_tokens'] == 129 - has_left_padding.extend([ - item[0] == tokenizer.eos_token_id for item in batch['input_ids'] - ]) - assert not all(has_left_padding) # longest should be pushed left - - decoded_batches = [ - tokenizer.batch_decode(batch['input_ids']) for batch in batches - ] - for decoded_batch in decoded_batches: - assert all( - item.count('Code start: \n') == num_fewshot + 1 - for item in decoded_batch - ) - - if len(prompt_string) > 0: - assert all( - item.count('Please code:\n') == 1 for item in decoded_batch - ) - - labels = [ - ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', - " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", - ' return number % 1.0\n', - ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', - ] - - # assert decoded_batch[0].endswith( - samples = [ - "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", - "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", - "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", - "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", - ] - for i in range(4): - for j in range(generations_per_sample): - k = i * generations_per_sample + j - b, n = divmod(k, batch_size) - assert batches[b]['labels'][n] == labels[i] - assert decoded_batches[b][n].endswith(samples[i]) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -def test_code_eval_test_cases(dataset_uri: str, tmp_path: Path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = transformers.AutoTokenizer.from_pretrained( - 'huggyllama/llama-7b', - ) # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 512 - - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_.jsonl'), - generations_per_sample=1, - ) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - max_prompt_length = 0 - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - max_prompt_length = dl.dataloader.dataset.max_prompt_length - assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) - assert tuple( - batch['attention_mask'].shape, - ) == (batch_size, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_kwargs']['max_new_tokens'] == 129 - assert any( - item[0] != tokenizer.eos_token_id for item in batch['input_ids'] - ) # longest should be pushed left - - mod = types.ModuleType('test_module') - for prompt, solution, inputs, outputs, entry_point in zip( - batch['prompts'], - batch['labels'], - batch['test_inputs'], - batch['test_outputs'], - batch['entry_points'], - ): - exec(prompt + solution, mod.__dict__) - for test_input, test_output in zip(inputs, outputs): - result = mod.__dict__[entry_point](*eval(test_input)) - assert result == eval(test_output) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -def test_code_eval_pass_at_k_validity(dataset_uri: str, tmp_path: Path): - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = transformers.AutoTokenizer.from_pretrained( - 'huggyllama/llama-7b', - ) # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 64 - - with pytest.raises(ValueError, match=r'.* pass_at_k .*'): - get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_.jsonl'), - pass_at_k=10, - generations_per_sample=1, - ) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -@pytest.mark.parametrize('prompt_string', ['Please code:\n', '']) -@pytest.mark.parametrize('generations_per_sample', [1, 3]) -def test_code_eval_task_dataloader( - dataset_uri: str, - tmp_path: Path, - num_fewshot: int, 
- prompt_string: str, - generations_per_sample: int, -): - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = transformers.AutoTokenizer.from_pretrained( - 'mosaicml/mpt-7b', - ) # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 2048 - - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - generations_per_sample=generations_per_sample, - generation_kwargs={ - 'temperature': .9, - 'top_k': 40, - }, - ) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batches = list(dl.dataloader) - dataset_size = len(open(dataset_uri, 'r').read().strip().split('\n')) - dataset_size *= generations_per_sample - - has_left_padding = [] - for i, batch in enumerate(batches): - max_prompt_length = 0 - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): - max_prompt_length = dl.dataloader.dataset.max_prompt_length - N = len(batches) - bs = batch_size if i < N - 1 else dataset_size - (N - 1) * batch_size - assert tuple(batch['input_ids'].shape) == (bs, max_prompt_length) - assert tuple(batch['attention_mask'].shape) == (bs, max_prompt_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_kwargs']['max_new_tokens'] == 122 - has_left_padding.extend([ - item[0] == tokenizer.eos_token_id for item in batch['input_ids'] - ]) - assert not all(has_left_padding) # longest should be pushed left - - decoded_batches = [ - tokenizer.batch_decode(batch['input_ids']) for batch in batches - ] - for decoded_batch in decoded_batches: - assert all( - item.count('Code start: \n') == num_fewshot + 1 - for item in decoded_batch - ) - - if len(prompt_string) > 0: - assert all( - item.count('Please code:\n') == 1 for item in decoded_batch - ) - - labels = [ - ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', - " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", - ' return number % 1.0\n', - ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', - ] - - # assert decoded_batch[0].endswith( - samples = [ - "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", - "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", - "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", - "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", - ] - for i in range(4): - for j in range(generations_per_sample): - k = i * generations_per_sample + j - b, n = divmod(k, batch_size) - assert batches[b]['labels'][n] == labels[i] - assert decoded_batches[b][n].endswith(samples[i]) - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -def test_eval_split_batch( - mpt_tokenizer: transformers.AutoTokenizer, - dataset_uri: str, - num_fewshot: int, - tmp_path: Path, -): - - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - tokenizer = mpt_tokenizer # type: ignore reportUnboundVariable - dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 4 - seqlen = 512 - - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', - question_prelimiter='Code start: \n', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - generations_per_sample=1, - generation_kwargs={ - 'temperature': .9, - 'top_k': 40, - }, - ) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - microbatch_size = 1 - microbatches = dl.split_batch(batch, microbatch_size) - assert len(microbatches) == 4 - for microbatch in microbatches: - assert dl.get_num_samples_in_batch(microbatch) == 1 - assert 'input_ids' in microbatch - # TODO: what should this be? 
- # assert tuple(microbatch['input_ids'].shape) == (microbatch_size, seqlen) - assert 'attention_mask' in microbatch - # assert tuple(microbatch['attention_mask'].shape) == (microbatch_size, seqlen) - assert isinstance(microbatch['generation_kwargs'], dict) - assert microbatch['generation_kwargs']['temperature'] == .9 - assert microbatch['generation_kwargs']['top_k'] == 40 - assert microbatch['generation_kwargs']['pad_token_id'] == 0 - assert microbatch['generation_kwargs']['num_beams'] == 1 - assert microbatch['generation_kwargs']['do_sample'] == True - assert microbatch['generation_kwargs']['use_cache'] == True - assert microbatch['generation_kwargs']['eos_token_id'] == 0 - - @pytest.mark.parametrize('num_fewshot', [0, 5]) @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) # @pytest.mark.gpu @@ -2788,219 +2321,6 @@ def test_qa_task_with_cot_evaluation( 1].item() == 0 -def test_code_eval_requires_envvar(monkeypatch: pytest.MonkeyPatch): - monkeypatch.delenv('CODE_EVAL_DEVICE', raising=False) - with pytest.raises( - ValueError, - match='Attempting to use InContextLearningCodeEvalAccuracy but.*', - ): - InContextLearningCodeEvalAccuracy().get_client() - - -def test_code_eval_requires_valid_envvar(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv('CODE_EVAL_DEVICE', 'bigchungus') - with pytest.raises( - ValueError, - match='Environment variable `CODE_EVAL_DEVICE` must be on.*', - ): - InContextLearningCodeEvalAccuracy().get_client() - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0]) -@pytest.mark.parametrize('generations_per_sample', range(1, 3)) -@pytest.mark.gpu -@pytest.mark.world_size(2) -@pytest.mark.filterwarnings( - r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning', -) -def test_code_eval_microbatching( - monkeypatch: pytest.MonkeyPatch, - tiny_opt_tokenizer: transformers.AutoTokenizer, - tiny_opt_model: transformers.AutoModelForCausalLM, - num_fewshot: int, - dataset_uri: str, - tmp_path: Path, - generations_per_sample: int, -): - - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - in_memory_logger = InMemoryLogger( - ) # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_opt_tokenizer - batch_size = 4 - - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=150, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=generations_per_sample, - ) - - evaluator = Evaluator( - label='humaneval', - dataloader=dl, - metric_names=['InContextLearningCodeEvalAccuracy'], - device_eval_microbatch_size=1, - ) - model = HuggingFaceModel( - model=tiny_opt_model, - tokenizer=tokenizer, - eval_metrics=[InContextLearningCodeEvalAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator) - torch.use_deterministic_algorithms(True) - assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys( - ) - assert 
in_memory_logger.data[ - 'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0]) -@pytest.mark.parametrize('generations_per_sample', range(1, 3)) -@pytest.mark.gpu -@pytest.mark.world_size(2) -@pytest.mark.filterwarnings( - r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning', -) -def test_code_eval_sentpiece_evaluation( - monkeypatch: pytest.MonkeyPatch, - num_fewshot: int, - dataset_uri: str, - tiny_opt_tokenizer: transformers.AutoTokenizer, - tiny_opt_model: transformers.AutoModelForCausalLM, - tmp_path: Path, - generations_per_sample: int, -): - - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - in_memory_logger = InMemoryLogger( - ) # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_opt_tokenizer - batch_size = 2 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=175, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=generations_per_sample, - ) - - evaluator = Evaluator( - label='humaneval', - dataloader=dl, - metric_names=['InContextLearningCodeEvalAccuracy'], - ) - model = HuggingFaceModel( - model=tiny_opt_model, - tokenizer=tiny_opt_tokenizer, - eval_metrics=[InContextLearningCodeEvalAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator) - torch.use_deterministic_algorithms(True) - assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys( - ) - assert in_memory_logger.data[ - 'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 - - -@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@pytest.mark.parametrize('num_fewshot', [0, 2]) -@pytest.mark.parametrize('generations_per_sample', [1]) -@pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') -@pytest.mark.gpu -@pytest.mark.world_size(2) -@pytest.mark.filterwarnings( - r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning', -) -def test_code_eval_task_evaluation( - monkeypatch: pytest.MonkeyPatch, - num_fewshot: int, - dataset_uri: str, - tiny_gpt2_tokenizer: transformers.AutoTokenizer, - tiny_gpt2_model: transformers.AutoModelForCausalLM, - tmp_path: Path, - generations_per_sample: int, -): - - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - in_memory_logger = InMemoryLogger( - ) # track the logged metrics in the in_memory_logger - local_data = os.path.join(os.path.dirname(__file__), 'local_data') - dataset_uri = f'{local_data}/{dataset_uri}' - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - dl = get_icl_task_dataloader( - 'code_evaluation', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=64 * num_fewshot, - 
pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', - destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), - generations_per_sample=generations_per_sample, - ) - - evaluator = Evaluator( - label='humaneval', - dataloader=dl, - metric_names=['InContextLearningCodeEvalAccuracy'], - ) - model = HuggingFaceModel( - model=tiny_gpt2_model, - tokenizer=tiny_gpt2_tokenizer, - eval_metrics=[InContextLearningCodeEvalAccuracy()], - use_logits=True, - ) - - trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - torch.use_deterministic_algorithms(False) - trainer.eval(eval_dataloader=evaluator) - torch.use_deterministic_algorithms(True) - assert 'metrics/humaneval/InContextLearningCodeEvalAccuracy' in in_memory_logger.data.keys( - ) - assert in_memory_logger.data[ - 'metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 - - @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) def test_lm_spacing_dataloader( dataset_uri: str, diff --git a/tests/eval/test_nlp_metrics.py b/tests/eval/test_nlp_metrics.py index e07be4d863..17f0f3146f 100644 --- a/tests/eval/test_nlp_metrics.py +++ b/tests/eval/test_nlp_metrics.py @@ -1,14 +1,10 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, List - -import pytest import torch import transformers from llmfoundry.eval.metrics import ( - InContextLearningCodeEvalAccuracy, InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, @@ -86,72 +82,6 @@ def test_in_context_learning_qa_cot_accuracy(): assert metric.compute() == (2 / 4) -def test_in_context_learning_code_eval_accuracy( - monkeypatch: pytest.MonkeyPatch, -): - outputs = [ - ' return 1 if n <= 1 else fib(n - 1) + fib(n - 1)', # incorrect - ' if n <= 1:\n return 1\n return fib(n-1) + fib(n-2)', # incorrect spacing - ' return n * 2', # correct - ' return 2*n', # correct - ' return n + 2', # incorrect - ' return n + 1', - ] # correct - labels = [] - prompts = [ - 'def fib(n):\n', - 'def multiply_by_two(n):\n', - 'def add_one(n):\n', - ] - entry_points = ['fib', 'multiply_by_two', 'add_one'] - test_inputs = [['(1,)', '(2,)', '(4,)'], ['(1,)', '(2,)', '(4,)'], - ['(1,)', '(2,)', '(4,)']] - test_outputs = [['1', '2', '5'], ['2', '4', '8'], ['2', '3', '5']] - sample_ids = [0, 1, 2] - languages = ['python', 'python', 'python'] - monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') - generations_per_sample = 2 - - def repeat(values: List[Any]): - return [val for val in values for _ in range(generations_per_sample)] - - transformers = pytest.importorskip('transformers') - tokenizer = transformers.AutoTokenizer.from_pretrained( - 'mosaicml/mpt-7b', - ) # type: ignore reportUnboundVariable - tokenizer.pad_token = tokenizer.eos_token - input_ids = tokenizer.batch_encode_plus( - repeat(prompts), - return_tensors='pt', - padding=True, - )['input_ids'] - batch = { - # This tests deterministic beam search rather than sampling - 'input_ids': input_ids, - 'generation_kwargs': { - 'num_beams': 1, - }, - 'prompts': repeat(prompts), - 'pass_at_k': [1], - 'entry_points': repeat(entry_points), - 'test_inputs': repeat(test_inputs), - 'test_outputs': repeat(test_outputs), - 'languages': repeat(languages), - 'dataset_size': len(prompts), - 'generations_per_sample': generations_per_sample, - 'sample_id': repeat(sample_ids), - } - metric = InContextLearningCodeEvalAccuracy() - 
metric.update(batch, outputs, labels) - - # pass@1 values - # program 1: 0 - # program 2: 1 - # program 3: .5 - # mean: 0.5 - assert metric.compute() == 0.5 - - def test_in_context_learning_mc_accuracy( tiny_gpt2_tokenizer: transformers.AutoTokenizer, ):