From 6c9ace81d2768bc847bf6e111f6593f91da0ee5b Mon Sep 17 00:00:00 2001 From: Jeremy D <115047575+bmosaicml@users.noreply.github.com> Date: Wed, 19 Apr 2023 17:44:39 -0400 Subject: [PATCH 01/10] Ece icl (#2135) * Seed the fewshot sampling in the ICL datasets (#2100) * merge * add ece for lm and mc * fetch upstream * fetch upstream * Apply suggestions from code review Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> * incorporate comments * incorporate comments * de;ete multi gpu --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../in_context_learning_evaluation.py | 174 +++++++++++++++++ composer/metrics/__init__.py | 25 +-- composer/metrics/nlp.py | 176 +++++++++++++++++- tests/datasets/local_data/human_eval.jsonl | 8 + .../test_in_context_learning_datasets.py | 94 +++++++++- tests/metrics/test_nlp_metrics.py | 127 ++++++++++++- 6 files changed, 574 insertions(+), 30 deletions(-) create mode 100644 tests/datasets/local_data/human_eval.jsonl diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index ef6b52f793..cbacd0dd2a 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -4,6 +4,7 @@ from __future__ import annotations +import json import random from typing import TYPE_CHECKING, Any, Union @@ -246,6 +247,167 @@ def get_num_samples_in_batch(self, batch) -> int: return batch['input_ids'].shape[0] +class InContextLearningCodeTracingTaskDataset(Dataset): + """A dataset that construct batches for in-context learning code tracing evaluation + + Args: + dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend + supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "context", + and "continuation". See tests/datasets/local_data/lambada_small.jsonl. + tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to transform data into batches + batch_size (int): Size of a batch used for eval + max_seq_len (int): The sequence length expected by the model + pad_tok_id (int): The special token reserved for padding the ends of batches + num_fewshot (int): The number of complete fewshot examples to prepend before each test example + prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') + example_delimiter (str): Separator that goes between individual (context, continuation) pairs (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. 
'->') + destination_path (str): Temporary path to store downloaded datasets + fewshot_random_seed (int): Random seed used to select fewshot examples + """ + + def __init__( + self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + fewshot_random_seed: int, + ): + try: + from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] + except ImportError as e: + raise MissingConditionalImportError(extra_deps_group='nlp', + conda_package='datasets', + conda_channel='conda-forge') from e + with dist.local_rank_zero_download_and_wait(destination_path): + if dist.get_local_rank() == 0: + get_file(dataset_uri, destination_path, overwrite=True) + dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) + self.samples = list( + dataset.map( + lambda examples: { + 'task_id': examples['task_id'], + 'prompt': examples['prompt'], + 'canonical_solution': examples['canonical_solution'], + 'test_inputs': examples['test_inputs'], + 'test_outputs': examples['test_outputs'], + 'test': examples['test'] + })) + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self.pad_tok_id = pad_tok_id + fewshot_rng = random.Random(fewshot_random_seed) + self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, + fewshot_rng) + + @staticmethod + def stringify_input(input_tuple): + tup = eval(input_tuple) + res = '\t'.join([f'arg_{i}={json.dumps(x)}' for i, x in enumerate(tup)]) + return res + + def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, + fewshot_rng: random.Random): + """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. + + Each task consists of a context and a continuation as well as an optional prompt and optional list of + example context/continuation pairs which precede the test context/continuation pair. 
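(Reviewer note, illustrative only and not part of the diff: a minimal sketch of how one encoded test example is assembled by this class, using the same f-string layout that appears further down in prep_examples. The prompt/soln/entry_point values are abridged from the truncate_number row of tests/datasets/local_data/human_eval.jsonl, and example_delimiter matches the value used in the unit tests.)

import json

def stringify_input(input_tuple: str) -> str:
    # Mirrors the stringify_input staticmethod above: "(3.5,)" -> "arg_0=3.5"
    tup = eval(input_tuple)
    return '\t'.join(f'arg_{i}={json.dumps(x)}' for i, x in enumerate(tup))

example_delimiter = '\n####\nPython code\n####\n'
prompt = 'def truncate_number(number: float) -> float:\n    """Return the decimal part of the number."""\n'
soln = '    return number % 1.0\n'
entry_point = 'truncate_number'
context = (f'{example_delimiter}\n{prompt}\n{soln}\n####\nEntry point: {entry_point}\n'
           f'Inputs: {stringify_input("(3.5,)")}\nOutputs:')
continuation = ' 0.5'
# The tokenizer encodes preamble + context, and the pass-rate metrics then check whether
# the model's greedy continuation tokens reproduce the tokenized ' 0.5'.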
+ + Args: + num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair + prompt_string (str): The prompt to prepend to all inputs + example_delimiter (str): The delimiter used to separate each individual context/continuation pair + continuation_delimiter (str): The delimiter used to separate each context from its continuation + fewshot_rng (random.Random): Random number generator used to select fewshot examples + + Returns: + dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) + """ + examples = [] + for sample_idx in tqdm(range(len(self.samples))): + + preamble = prompt_string + + if num_fewshot > 0: + fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) + for fewshot_idx in fewshot_idxs: + prompt, soln, entry_point, test_in, test_out = ( + self.samples[fewshot_idx]['prompt'], + self.samples[fewshot_idx]['canonical_solution'], + self.samples[fewshot_idx]['entry_point'], + self.samples[fewshot_idx]['test_inputs'], + self.samples[fewshot_idx]['test_outputs'], + ) + test_idx = random.choice(range(0, len(test_in))) + example = f"""{example_delimiter}\n{prompt}\n{soln}\n####\nEntry point: {entry_point}\nInputs: {self.stringify_input(test_in[test_idx])}\nOutputs: {test_out[test_idx]}\n####\n""" + + preamble += example + + prompt, soln, entry_point, test_in, test_out = ( + self.samples[sample_idx]['prompt'], + self.samples[sample_idx]['canonical_solution'], + self.samples[sample_idx]['entry_point'], + self.samples[sample_idx]['test_inputs'], + self.samples[sample_idx]['test_outputs'], + ) + + for inp, out in zip(test_in, test_out): + encoded_example = {} + context = f"""{example_delimiter}\n{prompt}\n{soln}\n####\nEntry point: {entry_point}\nInputs: {self.stringify_input(inp)}\nOutputs:""" + out = f' {out}' + encoded_example['preamble'] = self.tokenizer( + preamble + ) # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. 
OPT tokenizer) + encoded_example['context'] = self.tokenizer(context, add_special_tokens=False) + encoded_example['continuation'] = self.tokenizer(out, add_special_tokens=False) + encoded_example['task_id'] = self.samples[sample_idx]['task_id'] + + examples.append(encoded_example) + + return examples + + def __getitem__(self, index): + return self.encoded_dataset[index] + + def __len__(self): + return len(self.encoded_dataset) + + def collate_fn(self, data): + inputs = [] + continuation_indices = [] + task_ids = [] + for data_pair in data: + preamble, context, continuation = (data_pair['preamble'], data_pair['context'], data_pair['continuation']) + task_ids.append(data_pair['task_id']) + context_enc = preamble['input_ids'] + context['input_ids'] + continuation_enc = continuation['input_ids'] + + inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, + self.pad_tok_id) + + inputs.append(inp) + continuation_indices.append(continuation_span) + + batch = { + 'input_ids': torch.stack(inputs), + 'continuation_indices': continuation_indices, + 'mode': 'icl_task', + 'labels': torch.stack(inputs), + 'task_ids': task_ids + } + + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + return batch + + def get_num_samples_in_batch(self, batch) -> int: + return batch['input_ids'].shape[0] + + class InContextLearningLMTaskDataset(Dataset): """A dataset that construct batches for in-context learning language modeling evaluation @@ -645,6 +807,18 @@ def get_icl_task_dataloader( destination_path=destination_path, fewshot_random_seed=fewshot_random_seed) effective_batchsize = batch_size + elif icl_task_type == 'code_tracing': + dataset = InContextLearningCodeTracingTaskDataset(dataset_uri, + tokenizer, + max_seq_len, + pad_tok_id, + num_fewshot, + prompt_string, + example_delimiter, + continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed) + effective_batchsize = batch_size elif icl_task_type == 'question_answering': dataset = InContextLearningQATaskDataset(dataset_uri, tokenizer, diff --git a/composer/metrics/__init__.py b/composer/metrics/__init__.py index 27b7ef9c07..65c992b236 100644 --- a/composer/metrics/__init__.py +++ b/composer/metrics/__init__.py @@ -5,26 +5,19 @@ from composer.metrics.map import MAP from composer.metrics.metrics import CrossEntropy, Dice, LossMetric, MIoU -from composer.metrics.nlp import (BinaryF1Score, HFCrossEntropy, InContextLearningLMAccuracy, InContextLearningMetric, +from composer.metrics.nlp import (BinaryF1Score, HFCrossEntropy, InContextLearningCodeTracingAveragePassRate, + InContextLearningCodeTracingFullPassRate, InContextLearningLMAccuracy, + InContextLearningLMExpectedCalibrationError, + InContextLearningMCExpectedCalibrationError, InContextLearningMetric, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy, Perplexity) __all__ = [ - 'MAP', - 'MIoU', - 'Dice', - 'CrossEntropy', - 'LossMetric', - 'Perplexity', - 'BinaryF1Score', - 'HFCrossEntropy', - 'LanguageCrossEntropy', - 'MaskedAccuracy', - 'LanguagePerplexity', - 'InContextLearningLMAccuracy', - 'InContextLearningMultipleChoiceAccuracy', - 'InContextLearningQAAccuracy', - 'InContextLearningMetric', + 'MAP', 'MIoU', 'Dice', 'CrossEntropy', 'LossMetric', 'Perplexity', 'BinaryF1Score', 'HFCrossEntropy', + 'LanguageCrossEntropy', 'MaskedAccuracy', 'LanguagePerplexity', 'InContextLearningLMAccuracy', + 'InContextLearningMultipleChoiceAccuracy', 
'InContextLearningQAAccuracy', + 'InContextLearningMCExpectedCalibrationError', 'InContextLearningLMExpectedCalibrationError', + 'InContextLearningMetric', 'InContextLearningCodeTracingFullPassRate', 'InContextLearningCodeTracingAveragePassRate' ] METRIC_DEFAULT_CTORS = { diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index a12c8c3fd8..e615b6afe0 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -5,7 +5,7 @@ import re import string import warnings -from typing import List, Mapping, Optional, Union +from typing import Any, Dict, List, Mapping, Optional, Union import torch from torch import Tensor @@ -15,8 +15,10 @@ from composer.loss import soft_cross_entropy __all__ = [ - 'Perplexity', 'InContextLearningLMAccuracy', 'BinaryF1Score', 'HFCrossEntropy', 'LanguageCrossEntropy', - 'MaskedAccuracy', 'LanguagePerplexity' + 'Perplexity', 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', + 'InContextLearningQAAccuracy', 'BinaryF1Score', 'HFCrossEntropy', 'LanguageCrossEntropy', 'MaskedAccuracy', + 'LanguagePerplexity', 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMCExpectedCalibrationError', + 'InContextLearningCodeTracingFullPassRate', 'InContextLearningCodeTracingAveragePassRate' ] @@ -402,6 +404,58 @@ def compute(self): return self.correct / self.total +class InContextLearningCodeTracingFullPassRate(InContextLearningMetric): + + # Make torchmetrics call update only once + full_state_update = False + + def __init__(self, dist_sync_on_step: bool = False): + # state from multiple processes + super().__init__(dist_sync_on_step=dist_sync_on_step) + self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') + self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') + + def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): + breakpoint() + for batch_idx, cont_idx in enumerate(batch['continuation_indices']): + cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) + cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) + + self.correct += (cont_tok_pred == cont_tok_targ).all().int() + self.total += torch.tensor(1.0) + + def compute(self): + assert isinstance(self.correct, Tensor) + assert isinstance(self.total, Tensor) + return self.correct / self.total + + +class InContextLearningCodeTracingAveragePassRate(InContextLearningMetric): + + # Make torchmetrics call update only once + full_state_update = False + + def __init__(self, dist_sync_on_step: bool = False): + # state from multiple processes + super().__init__(dist_sync_on_step=dist_sync_on_step) + self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') + self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') + + def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): + breakpoint() + for batch_idx, cont_idx in enumerate(batch['continuation_indices']): + cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) + cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) + + self.correct += (cont_tok_pred == cont_tok_targ).all().int() + self.total += torch.tensor(1.0) + + def compute(self): + assert isinstance(self.correct, Tensor) + assert isinstance(self.total, Tensor) + return self.correct / self.total + + class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric): r"""Computes accuracy for In-context learning (ICL) multiple 
choice (MC) tasks. @@ -433,7 +487,9 @@ def __init__(self, dist_sync_on_step: bool = False): def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): perplexities = [] for batch_idx, cont_idx in enumerate(batch['continuation_indices']): + # continuation indices refer to indices in the original input's token space cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1) + # labels have been shifted left by one index, so the cont_idx needs to be shifted as well. cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ) perplexity = torch.exp(cross_entropy) @@ -451,3 +507,117 @@ def compute(self): assert isinstance(self.correct, Tensor) assert isinstance(self.total, Tensor) return self.correct.float() / self.total + + +class InContextLearningExpectedCalibrationError(InContextLearningMetric): + """Generic class for Expected Calibration Error (ECE) (cite: https://arxiv.org/pdf/1706.04599.pdf). + + Expected calibration error is calculated by dividing predictions into buckets based on the model's confidence (a probability value between 0 and 1). + We then calculate the accuracy within each bucket and calculate the average gap between confidence and accuracy + across buckets, weighted by the number of samples in each bucket. + + Each task must implement its own definition of "confidence" to be computed via the `update` method. + + Adds metric state variables: + bucket_totals (float): The number of instances where the prediction masked the target per bucket. + bucket_correct (float): The number of total instances that were predicted per bucket. + + Args: + dist_sync_on_step (bool, optional): Synchronize metric state across processes at + each forward() before returning the value at the step. Default: ``False``. + n_buckets (int): Number of distinct buckets to split the confidence distribution into + """ + + def __init__(self, dist_sync_on_step: bool = False, n_buckets: int = 10): + # state from multiple processes + super().__init__(dist_sync_on_step=dist_sync_on_step) + self.n_buckets = n_buckets + if n_buckets < 1: + raise Exception('`n_buckets`') + self.add_state('bucket_totals', default=torch.zeros(n_buckets), dist_reduce_fx='sum') + self.add_state('bucket_correct', default=torch.zeros(n_buckets), dist_reduce_fx='sum') + + def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): + pass + + def compute(self): + assert isinstance(self.bucket_correct, Tensor) + assert isinstance(self.bucket_totals, Tensor) + + result = torch.tensor(0.0, device=self.bucket_correct.device) + total_obs = torch.sum(self.bucket_totals) + for i in range(self.n_buckets): + if self.bucket_totals[i] == 0: + continue + + acc_bucket_i = self.bucket_correct[i] / self.bucket_totals[i] + upper_bound = (i + 1) / self.n_buckets + lower_bound = i / self.n_buckets + conf_bucket_i = torch.tensor((upper_bound + lower_bound) / 2, device=self.bucket_correct.device) + result += (self.bucket_totals[i] / total_obs) * torch.abs(acc_bucket_i - conf_bucket_i) + return result + + +class InContextLearningMCExpectedCalibrationError(InContextLearningExpectedCalibrationError): + r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL) multiple choice (MC) tasks. (source: https://arxiv.org/abs/2012.00955). + + For MC tasks, the model confidence is defined as the softmax of average per-token probability assigned to the top question choice. 
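As a concrete reference for the bucketing scheme described above, here is a small self-contained sketch of the ECE computation (illustrative only; expected_calibration_error is a stand-in helper name, not part of this patch):

import torch

def expected_calibration_error(confidences, correct, n_buckets=10):
    # Bucket observations by confidence, then take the sample-weighted average gap
    # between each bucket's accuracy and its midpoint confidence.
    bucket_totals = torch.zeros(n_buckets)
    bucket_correct = torch.zeros(n_buckets)
    for conf, is_correct in zip(confidences, correct):
        idx = min(int(conf * n_buckets), n_buckets - 1)
        bucket_totals[idx] += 1
        bucket_correct[idx] += float(is_correct)
    ece = torch.tensor(0.0)
    total = bucket_totals.sum()
    for i in range(n_buckets):
        if bucket_totals[i] == 0:
            continue
        acc = bucket_correct[i] / bucket_totals[i]
        midpoint = (2 * i + 1) / (2 * n_buckets)
        ece += (bucket_totals[i] / total) * torch.abs(acc - midpoint)
    return ece

# Four observations whose confidences all land in the top bucket (midpoint 0.95), three of
# four correct: ECE = |0.75 - 0.95| = 0.2, the value asserted in the LM ECE unit test below.
print(expected_calibration_error([0.96, 0.97, 0.95, 0.99], [1, 1, 1, 0]))  # tensor(0.2000)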
+ + See `InContextLearningExpectedCalibrationError` for more info. + """ + + # Make torchmetrics call update only once + full_state_update = False + + def update(self, batch: Dict[str, Any], output_logits: torch.Tensor, labels: torch.Tensor): + output_logits = torch.softmax(output_logits, dim=2) + probabilites = [] + for batch_idx, cont_idx in enumerate(batch['continuation_indices']): + cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1) + cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) + probability = cont_tok_logits.index_select(dim=1, index=cont_tok_targ).diagonal().mean() + probabilites.append(probability) + + for (start, end), gold_idx in zip(batch['choice_groupings'], batch['gold_indices']): + subset = probabilites[start:end] + idx_max = subset.index(max(subset)) + confidence = torch.tensor(subset).max() / torch.tensor(subset).sum() + + assert confidence >= 0.0 and confidence <= 1.0 + bucket_idx = int(confidence * self.n_buckets) + if bucket_idx == self.n_buckets: + bucket_idx -= 1 + + if idx_max == gold_idx: + self.bucket_correct[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] + + self.bucket_totals[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] + + +class InContextLearningLMExpectedCalibrationError(InContextLearningExpectedCalibrationError): + r"""Computes Expected Calibration Error (ECE) for In-context learning (ICL) language modeling (LM) tasks. (cite: https://arxiv.org/pdf/1706.04599.pdf). + + For LM tasks, the model confidence is defined as the minimum probability assigned to all tokens in the continuation. + + See `InContextLearningExpectedCalibrationError` for more info. + """ + + # Make torchmetrics call update only once + full_state_update = False + + def update(self, batch: Dict[str, Any], output_logits: torch.Tensor, labels: torch.Tensor): + output_logits = torch.softmax(output_logits, dim=2) + for batch_idx, cont_idx in enumerate(batch['continuation_indices']): + cont_tok_logits = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1) + cont_tok_pred = cont_tok_logits.argmax(dim=-1) + confidence = cont_tok_logits.max(dim=-1).values.min() + cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) + assert confidence >= 0.0 and confidence <= 1.0 + bucket_idx = int(confidence * self.n_buckets) + if bucket_idx == self.n_buckets: + bucket_idx -= 1 + + if (cont_tok_pred == cont_tok_targ).all(): + self.bucket_correct[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] + + self.bucket_totals[bucket_idx] += 1 # pyright: ignore [reportGeneralTypeIssues] diff --git a/tests/datasets/local_data/human_eval.jsonl b/tests/datasets/local_data/human_eval.jsonl new file mode 100644 index 0000000000..2bd1fa7750 --- /dev/null +++ b/tests/datasets/local_data/human_eval.jsonl @@ -0,0 +1,8 @@ +{"task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "entry_point": "separate_paren_groups", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)", "('( ) (( )) (( )( ))',)"], "test_outputs": ["['(()())', '((()))', '()', '((())()())']", "['()', '(())', '((()))', '(((())))']", "['(()(())((())))']", "['()', '(())', '(()())']"]} +{"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n", "test_inputs": ["([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3)", "([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.95)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.8)", "([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1)", "([1.1, 2.2, 3.1, 4.1, 5.1], 1.0)", "([1.1, 2.2, 3.1, 4.1, 5.1], 0.5)"], "test_outputs": ["True", "False", "True", "False", "True", "True", "False"]} +{"task_id": "HumanEval/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "entry_point": "truncate_number", "canonical_solution": " return number % 1.0\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate(3.5) == 0.5\n assert 
abs(candidate(1.33) - 0.33) < 1e-6\n assert abs(candidate(123.456) - 0.456) < 1e-6\n", "test_inputs": ["(3.5,)", "(1.33,)", "(123.456,)"], "test_outputs": ["0.5", "0.33000000000000007", "0.45600000000000307"]} +{"task_id": "HumanEval/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", "entry_point": "below_zero", "canonical_solution": " balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([]) == False\n assert candidate([1, 2, -3, 1, 2, -3]) == False\n assert candidate([1, 2, -4, 5, 6]) == True\n assert candidate([1, -1, 2, -2, 5, -5, 4, -4]) == False\n assert candidate([1, -1, 2, -2, 5, -5, 4, -5]) == True\n assert candidate([1, -2, 2, -2, 5, -5, 4, -4]) == True\n", "test_inputs": ["([],)", "([1, 2, -3, 1, 2, -3],)", "([1, 2, -4, 5, 6],)", "([1, -1, 2, -2, 5, -5, 4, -4],)", "([1, -1, 2, -2, 5, -5, 4, -5],)", "([1, -2, 2, -2, 5, -5, 4, -4],)"], "test_outputs": ["False", "False", "True", "False", "True", "True"]} +{"task_id": "HumanEval/4", "prompt": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", "entry_point": "mean_absolute_deviation", "canonical_solution": " mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert abs(candidate([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n", "test_inputs": ["([1.0, 2.0, 3.0],)", "([1.0, 2.0, 3.0, 4.0],)", "([1.0, 2.0, 3.0, 4.0, 5.0],)"], "test_outputs": ["0.6666666666666666", "1.0", "1.2"]} +{"task_id": "HumanEval/5", "prompt": "from typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n \"\"\"\n", "entry_point": "intersperse", "canonical_solution": " if not numbers:\n return []\n\n result = []\n\n for n in numbers[:-1]:\n result.append(n)\n result.append(delimeter)\n\n result.append(numbers[-1])\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([], 7) == []\n assert candidate([5, 6, 3, 2], 8) == [5, 8, 6, 8, 3, 8, 2]\n assert candidate([2, 2, 2], 2) == [2, 2, 2, 2, 2]\n", "test_inputs": ["([], 7)", "([5, 6, 3, 2], 8)", "([2, 2, 2], 2)"], "test_outputs": ["[]", "[5, 8, 6, 8, 3, 8, 2]", "[2, 2, 2, 2, 2]"]} +{"task_id": "HumanEval/6", "prompt": "from typing 
import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n \"\"\" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. (()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n \"\"\"\n", "entry_point": "parse_nested_parens", "canonical_solution": " def parse_paren_group(s):\n depth = 0\n max_depth = 0\n for c in s:\n if c == '(':\n depth += 1\n max_depth = max(depth, max_depth)\n else:\n depth -= 1\n\n return max_depth\n\n return [parse_paren_group(x) for x in paren_string.split(' ') if x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [2, 3, 1, 3]\n assert candidate('() (()) ((())) (((())))') == [1, 2, 3, 4]\n assert candidate('(()(())((())))') == [4]\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)"], "test_outputs": ["[2, 3, 1, 3]", "[1, 2, 3, 4]", "[4]"]} +{"task_id": "HumanEval/7", "prompt": "from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n", "entry_point": "filter_by_substring", "canonical_solution": " return [x for x in strings if substring in x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([], 'john') == []\n assert candidate(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx']\n assert candidate(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx']\n assert candidate(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']\n", "test_inputs": ["([], 'john')", "(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx')", "(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx')", "(['grunt', 'trumpet', 'prune', 'gruesome'], 'run')"], "test_outputs": ["[]", "['xxx', 'xxxAAA', 'xxx']", "['xxx', 'aaaxxy', 'xxxAAA', 'xxx']", "['grunt', 'prune']"]} diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 56bfbd3850..47cc96ba9c 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -15,7 +15,8 @@ from composer.datasets.in_context_learning_evaluation import (_get_fewshot_sample_idxs, _make_padded_input, get_icl_task_dataloader) from composer.loggers import InMemoryLogger -from composer.metrics import (InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, +from composer.metrics import (InContextLearningCodeTracingAveragePassRate, InContextLearningCodeTracingFullPassRate, + InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) from composer.models import HuggingFaceModel from composer.trainer import Trainer @@ -88,6 +89,48 @@ def test_make_padding(tiny_gpt2_tokenizer, padding_side): assert input_ids[:-48].tolist() == context +@pytest.mark.parametrize('dataset_uri', ['human_eval.jsonl']) +def test_code_tracing_task_dataloader(dataset_uri, 
tiny_gpt2_tokenizer, tmp_path): + pytest.importorskip('datasets') + + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + + tokenizer = tiny_gpt2_tokenizer + dataset_uri = f'{local_data}/{dataset_uri}' + batch_size = 2 + seqlen = 2048 + dl = get_icl_task_dataloader( + 'code_tracing', + dataset_uri, + tokenizer, + batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=2, + prompt_string= + 'For the Python code snippets below, determine what the output would be if the entry point function were evaluated on the input below.\n', + example_delimiter='\n####\nPython code\n####\n', + continuation_delimiter='', + destination_path=str(tmp_path / 'icl.jsonl')) + + assert isinstance(dl.dataloader, DataLoader) # pyright + batch = next(dl.dataloader._get_iterator()) + + assert 'input_ids' in batch + assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) + assert 'attention_mask' in batch + assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) + assert 'continuation_indices' in batch + assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size + assert 'mode' in batch + assert 'task_ids' in batch + assert batch['task_ids'] == ['HumanEval/1', 'HumanEval/1'] + assert batch['mode'] == 'icl_task' + min_idx = min(batch['continuation_indices'][0]).item() + max_idx = max(batch['continuation_indices'][0]).item() + assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == " ['(()())', '((()))', '()', '((())()())']" + + @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') @@ -351,6 +394,53 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize assert in_memory_logger.data['metrics/lambada/InContextLearningLMAccuracy'][0][1].item() == 0 +@pytest.mark.parametrize('dataset_uri', ['human_eval.jsonl']) +@device('cpu') +def test_code_tracing_task_evaluation(device, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): + pytest.importorskip('datasets') + in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/{dataset_uri}' + tokenizer = tiny_gpt2_tokenizer + dl = get_icl_task_dataloader( + 'code_tracing', + dataset_uri, + tokenizer, + 2, + max_seq_len=1024, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=2, + prompt_string= + 'For the Python code snippets below, determine what the output would be if the entry point function were evaluated on the input below.\n', + example_delimiter='\n####\nPython code\n####\n', + continuation_delimiter='', + destination_path=str(tmp_path / 'icl.jsonl')) + + evaluator = Evaluator( + label='human_eval_code_tracing', + dataloader=dl, + metric_names=['InContextLearningCodeTracingAveragePassRate', 'InContextLearningCodeTracingFullPassRate']) + + config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M') + model = transformers.AutoModelForCausalLM.from_config(config) + model = HuggingFaceModel( + model=tiny_gpt2_model, + tokenizer=tokenizer, + eval_metrics=[InContextLearningCodeTracingAveragePassRate(), + InContextLearningCodeTracingFullPassRate()], + use_logits=True, + ) + + trainer = Trainer(model=model, max_duration='1ep', loggers=in_memory_logger) + trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) + assert 
'metrics/human_eval_code_tracing/InContextLearningCodeTracingAveragePassRate' in in_memory_logger.data.keys() + assert 'metrics/human_eval_code_tracing/InContextLearningCodeTracingFullPassRate' in in_memory_logger.data.keys() + assert in_memory_logger.data['metrics/human_eval_code_tracing/InContextLearningCodeTracingAveragePassRate'][0][ + 1].item() == 0 + assert in_memory_logger.data['metrics/human_eval_code_tracing/InContextLearningCodeTracingFullPassRate'][0][1].item( + ) == 0 + + @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl']) @device('gpu') @pytest.mark.parametrize('num_fewshot', [0, 5]) @@ -387,7 +477,7 @@ def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenize ) trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) - trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) + trainer.eval(eval_dataloader=evaluator) assert 'metrics/lambada/InContextLearningMultipleChoiceAccuracy' in in_memory_logger.data.keys() assert in_memory_logger.data['metrics/lambada/InContextLearningMultipleChoiceAccuracy'][0][1].item() > 0 diff --git a/tests/metrics/test_nlp_metrics.py b/tests/metrics/test_nlp_metrics.py index 9b47a49359..1fa88f1b94 100644 --- a/tests/metrics/test_nlp_metrics.py +++ b/tests/metrics/test_nlp_metrics.py @@ -1,15 +1,20 @@ # Copyright 2022 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 +'do a backflip' # Copyright 2022 MosaicML Composer authors +# SPDX-License-Identifier: Apache-2.0 + import math import pytest import torch from torch.nn.functional import cross_entropy -from composer.metrics.nlp import (BinaryF1Score, HFCrossEntropy, InContextLearningLMAccuracy, - InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy, - LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy, Perplexity) +from composer.metrics.nlp import (BinaryF1Score, HFCrossEntropy, InContextLearningExpectedCalibrationError, + InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError, + InContextLearningMCExpectedCalibrationError, InContextLearningMultipleChoiceAccuracy, + InContextLearningQAAccuracy, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy, + Perplexity) @pytest.mark.parametrize('ignore_index', [-100]) @@ -262,15 +267,44 @@ def test_in_context_learning_lm_accuracy(tiny_gpt2_tokenizer): end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids']) cont_idxs.append(torch.tensor(list(range(start, end)))) - batch = {'continuation_indices': cont_idxs, 'labels': inputs, 'input_ids': inputs} - logits = torch.nn.functional.one_hot(inputs, num_classes=pad + 1) - logits[2] = logits[1].clone() # make one of the answers incorrect + batch = {'continuation_indices': cont_idxs, 'labels': inputs.roll(-1), 'input_ids': inputs} + logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100 + start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] + logits[1][start:end] = logits[0][start:end].clone() # make one of the answer's continuations incorrect metric = InContextLearningLMAccuracy() metric.update(batch, logits, batch['labels']) assert metric.compute() == 0.75 +def test_in_context_learning_lm_ece(tiny_gpt2_tokenizer): + contexts = ['The dog is', 'I love to eat', 'I hate', 'The weather is'] + continuations = [' furry', ' pie', ' long lines', ' snowy'] + pad = tiny_gpt2_tokenizer.pad_token_id + inputs = [ + tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids'] + for context, continuation in 
zip(contexts, continuations) + ] + inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs]) + + cont_idxs = [] + for context, continuation in zip(contexts, continuations): + start = len(tiny_gpt2_tokenizer(context)['input_ids']) + end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids']) + cont_idxs.append(torch.tensor(list(range(start, end)))) + + batch = {'continuation_indices': cont_idxs, 'labels': inputs.roll(-1), 'input_ids': inputs} + # logits are expected to be unnormalized and will undergo softmax, so we must multiply by 100 + logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100 + start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] + logits[1][start:end] = logits[0][start:end].clone() # make one of the answer's continuations incorrect + metric = InContextLearningLMExpectedCalibrationError() + metric.update(batch, logits, batch['labels']) + # all observations fall in the top confidence bucket (95%) but accuracy is only 75%, + # hence ECE should be 0.2 + assert abs(metric.compute() - 0.2) < 0.0001 + + def test_in_context_learning_qa_accuracy(): outputs = ['Correct but then some more text', 'Incorrect', ' the CORREct with weird casing and spacing'] labels = [['Correct'], ['blah', 'blah2'], ['blah', 'correct']] @@ -304,16 +338,16 @@ def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer): batch = { 'continuation_indices': cont_idxs, - 'labels': inputs, + 'labels': inputs.roll(-1), 'input_ids': inputs, 'gold_indices': gold_indices, 'choice_groupings': choice_groupings } - logits = torch.nn.functional.one_hot(inputs, num_classes=pad + 1).float() + logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() # for the first two, the correct answer is continuation 0 # make the answer correct by making continuation 0 more likely for both answers - start, end = cont_idxs[1].tolist()[0], cont_idxs[1].tolist()[-1] + start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] logits[1][start:end] = logits[0][start:end].clone() # for the last two, the correct answer is continuation 3 @@ -325,3 +359,78 @@ def test_in_context_learning_mc_accuracy(tiny_gpt2_tokenizer): metric.update(batch, logits, batch['labels']) assert metric.compute() == 0.5 + + +def test_in_context_learning_mc_ece(tiny_gpt2_tokenizer): + contexts = [ + 'Q: How do you cook a cake?', 'Q: How do you cook a cake?', 'Q: How old is the earth?', + 'Q: How old is the earth?' 
+ ] + continuations = [' turn on the oven', ' do a backflip', ' 2 minutes', ' 4.5 billion years'] + gold_indices = [0, 1] + choice_groupings = [(0, 2), (2, 4)] + pad = tiny_gpt2_tokenizer.pad_token_id + inputs = [ + tiny_gpt2_tokenizer(context)['input_ids'] + tiny_gpt2_tokenizer(continuation)['input_ids'] + for context, continuation in zip(contexts, continuations) + ] + inputs = torch.tensor([input + [pad] * (2048 - len(input)) for input in inputs]) + + cont_idxs = [] + for context, continuation in zip(contexts, continuations): + start = len(tiny_gpt2_tokenizer(context)['input_ids']) + end = start + len(tiny_gpt2_tokenizer(continuation)['input_ids']) + cont_idxs.append(torch.tensor(list(range(start, end)))) + + batch = { + 'continuation_indices': cont_idxs, + 'labels': inputs.roll(-1), + 'input_ids': inputs, + 'gold_indices': gold_indices, + 'choice_groupings': choice_groupings + } + logits = torch.nn.functional.one_hot(inputs.roll(-1), num_classes=pad + 1).float() * 100 + # for the first two, the correct answer is continuation 0 + # make the answer correct by making continuation 0 more likely for both answers + start, end = cont_idxs[1].tolist()[0] - 1, cont_idxs[1].tolist()[-1] + logits[1][start:end] = logits[0][start:end].clone() + + # for the last two, the correct answer is continuation 3 + # make the answer incorrect by making continuation 2 more likely for both answers + start, end = cont_idxs[3].tolist()[0] - 1, cont_idxs[3].tolist()[-1] + logits[3][start:end] = logits[2][start:end].clone() + + metric = InContextLearningMCExpectedCalibrationError() + + metric.update(batch, logits, batch['labels']) + + # accuracy is 50% but confidence is 95%, so ECE is 45% + assert abs(metric.compute().item() - 0.45) < 0.0001 + + +def test_in_context_learning_ece(): + metric = InContextLearningExpectedCalibrationError(n_buckets=1) + metric.update(None, None, None) # pyright: ignore [reportGeneralTypeIssues] + metric.bucket_totals[0] = 2 # pyright: ignore [reportGeneralTypeIssues] + metric.bucket_correct[0] = 1 # pyright: ignore [reportGeneralTypeIssues] + # confidence of bucket = 50%, accuracy = 50% => ECE = 0.0 + assert metric.compute() == 0.0 + + metric = InContextLearningExpectedCalibrationError(n_buckets=10) + # this example corresponds to perfect calibration across all 10 buckets + metric.update(None, None, None) # pyright: ignore [reportGeneralTypeIssues] + for i in range(len(metric.bucket_totals)): # pyright: ignore [reportGeneralTypeIssues] + upper_bound = (i + 1) / metric.n_buckets + lower_bound = i / metric.n_buckets + conf_bucket_i = (upper_bound + lower_bound) / 2 + metric.bucket_totals[i] = metric.n_buckets * 2 # pyright: ignore [reportGeneralTypeIssues] + metric.bucket_correct[i] = conf_bucket_i * metric.n_buckets * 2 # pyright: ignore [reportGeneralTypeIssues] + assert metric.compute() == 0.0 + + metric = InContextLearningExpectedCalibrationError(n_buckets=10) + # this example corresponds to perfect calibration + metric.update(None, None, None) # pyright: ignore [reportGeneralTypeIssues] + metric.bucket_totals[-1] = 2 # pyright: ignore [reportGeneralTypeIssues] + metric.bucket_correct[-1] = 0 # pyright: ignore [reportGeneralTypeIssues] + # confidence = 95%, accuracy = 0% => ece = 95% + assert metric.compute() == 0.95 From a0a81b14e08c03111b208ef9a6e1269668068677 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Tue, 3 Oct 2023 12:04:46 -0400 Subject: [PATCH 02/10] wip --- .../in_context_learning_evaluation.py | 147 ++++++++---------- composer/metrics/__init__.py | 4 - 
composer/metrics/nlp.py | 65 +------- tests/datasets/local_data/human_eval.jsonl | 8 - .../local_data/human_eval_small.jsonl | 2 +- .../test_in_context_learning_datasets.py | 94 ++++++++++- 6 files changed, 164 insertions(+), 156 deletions(-) delete mode 100644 tests/datasets/local_data/human_eval.jsonl diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index f71684cf49..e5e6d7dc93 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -290,7 +290,7 @@ def split_batch(self, batch: Any, microbatch_size: int): return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] -class InContextLearningCodeTracingTaskDataset(Dataset): +class InContextLearningExecutionPredictionTaskDataset(Dataset): """A dataset that construct batches for in-context learning code tracing evaluation Args: @@ -308,19 +308,19 @@ class InContextLearningCodeTracingTaskDataset(Dataset): fewshot_random_seed (int): Random seed used to select fewshot examples """ - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - fewshot_random_seed: int, - ): + def __init__(self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + destination_path: str, + fewshot_random_seed: int, + fn_delimiter: str = '', + output_delimiter: str = '', + input_delimiter: str = ''): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -345,17 +345,17 @@ def __init__( self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id fewshot_rng = random.Random(fewshot_random_seed) - self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, - fewshot_rng) + self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, fn_delimiter, + output_delimiter, input_delimiter, fewshot_rng) @staticmethod def stringify_input(input_tuple): tup = eval(input_tuple) - res = '\t'.join([f'arg_{i}={json.dumps(x)}' for i, x in enumerate(tup)]) + res = '{' + ', '.join([f'arg_{i}: {json.dumps(x)}' for i, x in enumerate(tup)]) + '}' return res - def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, - fewshot_rng: random.Random): + def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, fn_delimiter: str, + output_delimiter: str, input_delimiter: str, fewshot_rng: random.Random): """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. 
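(Reviewer note, illustrative only: with this revision the test inputs are rendered as a brace-delimited argument listing rather than tab-separated arg_i=value pairs, and the example text is assembled from the configurable fn_delimiter/input_delimiter/output_delimiter arguments instead of hard-coded '####' markers, as shown in the f-strings in the hunk below. A minimal sketch of the revised stringification, with a made-up input tuple:)

import json

def stringify_input(input_tuple: str) -> str:
    # Mirrors the revised staticmethod above.
    tup = eval(input_tuple)
    return '{' + ', '.join(f'arg_{i}: {json.dumps(x)}' for i, x in enumerate(tup)) + '}'

print(stringify_input("([1.0, 2.0, 3.9], 0.3)"))  # {arg_0: [1.0, 2.0, 3.9], arg_1: 0.3}

# With the new signature, a single test example's context is built roughly as:
# f"{example_delimiter}{prompt}{soln}{fn_delimiter}{entry_point}{input_delimiter}{stringify_input(inp)}{output_delimiter}"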
Each task consists of a context and a continuation as well as an optional prompt and optional list of @@ -387,7 +387,7 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: self.samples[fewshot_idx]['test_outputs'], ) test_idx = random.choice(range(0, len(test_in))) - example = f"""{example_delimiter}\n{prompt}\n{soln}\n####\nEntry point: {entry_point}\nInputs: {self.stringify_input(test_in[test_idx])}\nOutputs: {test_out[test_idx]}\n####\n""" + example = f"""{example_delimiter}{prompt}{soln}{fn_delimiter}{entry_point}{input_delimiter}{self.stringify_input(test_in[test_idx])}{output_delimiter}{test_out[test_idx]}""" preamble += example @@ -398,10 +398,10 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: self.samples[sample_idx]['test_inputs'], self.samples[sample_idx]['test_outputs'], ) - + for inp, out in zip(test_in, test_out): encoded_example = {} - context = f"""{example_delimiter}\n{prompt}\n{soln}\n####\nEntry point: {entry_point}\nInputs: {self.stringify_input(inp)}\nOutputs:""" + context = f"""{example_delimiter}{prompt}{soln}{fn_delimiter}{entry_point}{input_delimiter}{self.stringify_input(inp)}{output_delimiter}""" out = f' {out}' encoded_example['preamble'] = self.tokenizer( preamble @@ -1259,22 +1259,24 @@ def split_batch(self, batch: Any, microbatch_size: int): def build_icl_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. '' - destination_path: str, - question_prelimiter: str = '', # e.g. 'Question: ' - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, -) -> DataSpec: + icl_task_type: str, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + continuation_delimiter: str, # e.g. '' + destination_path: str, + question_prelimiter: str = '', # e.g. 
'Question: ' + fewshot_random_seed: int = 1234, + pass_at_k: int = 1, + generations_per_sample: int = 1, + extra_delimiters: Optional[dict] = None) -> DataSpec: + if extra_delimiters is None: + extra_delimiters = {} if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, tokenizer, @@ -1340,17 +1342,17 @@ def build_icl_dataloader( pass_at_k=pass_at_k, generations_per_sample=generations_per_sample) effective_batchsize = batch_size - elif icl_task_type == 'code_tracing': - dataset = InContextLearningCodeTracingTaskDataset(dataset_uri, - tokenizer, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed) + elif icl_task_type == 'code_execution_prediction': + dataset = InContextLearningExecutionPredictionTaskDataset(dataset_uri, + tokenizer, + max_seq_len, + pad_tok_id, + num_fewshot, + prompt_string, + example_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + **extra_delimiters) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1435,7 +1437,8 @@ def get_icl_task_dataloader( fewshot_random_seed: int = 1234, pass_at_k: int = 1, generations_per_sample: int = 1, - has_categories: bool = False) -> Union[DataSpec, Dict[str, DataSpec]]: + has_categories: bool = False, + extra_delimiters: Optional[dict] = None) -> Union[DataSpec, Dict[str, DataSpec]]: """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: >>> dl = get_icl_task_dataloader( @@ -1488,39 +1491,15 @@ def get_icl_task_dataloader( categories = sorted(output_files.keys()) for category in categories: partition_uri = output_files[category] - result_dls[category] = build_icl_dataloader( - icl_task_type, - partition_uri, - tokenizer, - batch_size, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - partition_uri + '_tmp', - question_prelimiter, - fewshot_random_seed, - pass_at_k, - generations_per_sample, - ) + result_dls[category] = build_icl_dataloader(icl_task_type, partition_uri, tokenizer, batch_size, + max_seq_len, pad_tok_id, num_fewshot, prompt_string, + example_delimiter, continuation_delimiter, + partition_uri + '_tmp', question_prelimiter, + fewshot_random_seed, pass_at_k, generations_per_sample, + extra_delimiters) return result_dls else: - return build_icl_dataloader( - icl_task_type, - dataset_uri, - tokenizer, - batch_size, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - destination_path, - question_prelimiter, - fewshot_random_seed, - pass_at_k, - generations_per_sample, - ) + return build_icl_dataloader(icl_task_type, dataset_uri, tokenizer, batch_size, max_seq_len, pad_tok_id, + num_fewshot, prompt_string, example_delimiter, continuation_delimiter, + destination_path, question_prelimiter, fewshot_random_seed, pass_at_k, + generations_per_sample, extra_delimiters) diff --git a/composer/metrics/__init__.py b/composer/metrics/__init__.py index 8c599487e7..26af16f5b5 100644 --- a/composer/metrics/__init__.py +++ b/composer/metrics/__init__.py @@ -5,8 +5,6 @@ from composer.metrics.map import MAP from composer.metrics.metrics import CrossEntropy, Dice, LossMetric, MIoU -from composer.metrics.nlp 
import (BinaryF1Score, HFCrossEntropy, InContextLearningCodeTracingAveragePassRate, - InContextLearningCodeTracingFullPassRate, InContextLearningLMAccuracy,) from composer.metrics.nlp import (BinaryF1Score, InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError, InContextLearningMCExpectedCalibrationError, InContextLearningMetric, @@ -30,8 +28,6 @@ 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMetric', 'InContextLearningCodeEvalAccuracy', - 'InContextLearningCodeTracingFullPassRate', - 'InContextLearningCodeTracingAveragePassRate' ] METRIC_DEFAULT_CTORS = { diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 8d9f19dbc4..bf1292f778 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -8,7 +8,7 @@ import re import string import warnings -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Any, Dict, List, Mapping, Union import numpy as np import torch @@ -21,10 +21,13 @@ log = logging.getLogger(__name__) __all__ = [ - 'Perplexity', 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', - 'InContextLearningQAAccuracy', 'BinaryF1Score', 'HFCrossEntropy', 'LanguageCrossEntropy', 'MaskedAccuracy', - 'LanguagePerplexity', 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMCExpectedCalibrationError', - 'InContextLearningCodeTracingFullPassRate', 'InContextLearningCodeTracingAveragePassRate' + 'InContextLearningLMAccuracy', + 'InContextLearningMultipleChoiceAccuracy', + 'InContextLearningQAAccuracy', + 'LanguageCrossEntropy', + 'MaskedAccuracy', + 'InContextLearningLMExpectedCalibrationError', + 'InContextLearningMCExpectedCalibrationError', 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningQAAccuracy', @@ -328,58 +331,6 @@ def compute(self): return self.correct / self.total -class InContextLearningCodeTracingFullPassRate(InContextLearningMetric): - - # Make torchmetrics call update only once - full_state_update = False - - def __init__(self, dist_sync_on_step: bool = False): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') - self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') - - def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): - breakpoint() - for batch_idx, cont_idx in enumerate(batch['continuation_indices']): - cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) - cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - - self.correct += (cont_tok_pred == cont_tok_targ).all().int() - self.total += torch.tensor(1.0) - - def compute(self): - assert isinstance(self.correct, Tensor) - assert isinstance(self.total, Tensor) - return self.correct / self.total - - -class InContextLearningCodeTracingAveragePassRate(InContextLearningMetric): - - # Make torchmetrics call update only once - full_state_update = False - - def __init__(self, dist_sync_on_step: bool = False): - # state from multiple processes - super().__init__(dist_sync_on_step=dist_sync_on_step) - self.add_state('correct', default=torch.tensor(0.), dist_reduce_fx='sum') - self.add_state('total', default=torch.tensor(0.), dist_reduce_fx='sum') - - def update(self, batch: dict, output_logits: torch.Tensor, labels: torch.Tensor): - breakpoint() - for batch_idx, cont_idx in 
enumerate(batch['continuation_indices']): - cont_tok_pred = output_logits[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) - cont_tok_targ = labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - - self.correct += (cont_tok_pred == cont_tok_targ).all().int() - self.total += torch.tensor(1.0) - - def compute(self): - assert isinstance(self.correct, Tensor) - assert isinstance(self.total, Tensor) - return self.correct / self.total - - class InContextLearningMultipleChoiceAccuracy(InContextLearningMetric): r"""Computes accuracy for In-context learning (ICL) multiple choice (MC) tasks. diff --git a/tests/datasets/local_data/human_eval.jsonl b/tests/datasets/local_data/human_eval.jsonl deleted file mode 100644 index 2bd1fa7750..0000000000 --- a/tests/datasets/local_data/human_eval.jsonl +++ /dev/null @@ -1,8 +0,0 @@ -{"task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "entry_point": "separate_paren_groups", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)", "('( ) (( )) (( )( ))',)"], "test_outputs": ["['(()())', '((()))', '()', '((())()())']", "['()', '(())', '((()))', '(((())))']", "['(()(())((())))']", "['()', '(())', '(()())']"]} -{"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert 
candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n", "test_inputs": ["([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3)", "([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.95)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.8)", "([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1)", "([1.1, 2.2, 3.1, 4.1, 5.1], 1.0)", "([1.1, 2.2, 3.1, 4.1, 5.1], 0.5)"], "test_outputs": ["True", "False", "True", "False", "True", "True", "False"]} -{"task_id": "HumanEval/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "entry_point": "truncate_number", "canonical_solution": " return number % 1.0\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate(3.5) == 0.5\n assert abs(candidate(1.33) - 0.33) < 1e-6\n assert abs(candidate(123.456) - 0.456) < 1e-6\n", "test_inputs": ["(3.5,)", "(1.33,)", "(123.456,)"], "test_outputs": ["0.5", "0.33000000000000007", "0.45600000000000307"]} -{"task_id": "HumanEval/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", "entry_point": "below_zero", "canonical_solution": " balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([]) == False\n assert candidate([1, 2, -3, 1, 2, -3]) == False\n assert candidate([1, 2, -4, 5, 6]) == True\n assert candidate([1, -1, 2, -2, 5, -5, 4, -4]) == False\n assert candidate([1, -1, 2, -2, 5, -5, 4, -5]) == True\n assert candidate([1, -2, 2, -2, 5, -5, 4, -4]) == True\n", "test_inputs": ["([],)", "([1, 2, -3, 1, 2, -3],)", "([1, 2, -4, 5, 6],)", "([1, -1, 2, -2, 5, -5, 4, -4],)", "([1, -1, 2, -2, 5, -5, 4, -5],)", "([1, -2, 2, -2, 5, -5, 4, -4],)"], "test_outputs": ["False", "False", "True", "False", "True", "True"]} -{"task_id": "HumanEval/4", "prompt": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", "entry_point": "mean_absolute_deviation", "canonical_solution": " mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert abs(candidate([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n", "test_inputs": ["([1.0, 2.0, 3.0],)", "([1.0, 2.0, 3.0, 4.0],)", "([1.0, 
2.0, 3.0, 4.0, 5.0],)"], "test_outputs": ["0.6666666666666666", "1.0", "1.2"]} -{"task_id": "HumanEval/5", "prompt": "from typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n \"\"\"\n", "entry_point": "intersperse", "canonical_solution": " if not numbers:\n return []\n\n result = []\n\n for n in numbers[:-1]:\n result.append(n)\n result.append(delimeter)\n\n result.append(numbers[-1])\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([], 7) == []\n assert candidate([5, 6, 3, 2], 8) == [5, 8, 6, 8, 3, 8, 2]\n assert candidate([2, 2, 2], 2) == [2, 2, 2, 2, 2]\n", "test_inputs": ["([], 7)", "([5, 6, 3, 2], 8)", "([2, 2, 2], 2)"], "test_outputs": ["[]", "[5, 8, 6, 8, 3, 8, 2]", "[2, 2, 2, 2, 2]"]} -{"task_id": "HumanEval/6", "prompt": "from typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n \"\"\" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. (()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n \"\"\"\n", "entry_point": "parse_nested_parens", "canonical_solution": " def parse_paren_group(s):\n depth = 0\n max_depth = 0\n for c in s:\n if c == '(':\n depth += 1\n max_depth = max(depth, max_depth)\n else:\n depth -= 1\n\n return max_depth\n\n return [parse_paren_group(x) for x in paren_string.split(' ') if x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [2, 3, 1, 3]\n assert candidate('() (()) ((())) (((())))') == [1, 2, 3, 4]\n assert candidate('(()(())((())))') == [4]\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)"], "test_outputs": ["[2, 3, 1, 3]", "[1, 2, 3, 4]", "[4]"]} -{"task_id": "HumanEval/7", "prompt": "from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n", "entry_point": "filter_by_substring", "canonical_solution": " return [x for x in strings if substring in x]\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([], 'john') == []\n assert candidate(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx']\n assert candidate(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx']\n assert candidate(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']\n", "test_inputs": ["([], 'john')", "(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx')", "(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx')", "(['grunt', 'trumpet', 'prune', 'gruesome'], 'run')"], "test_outputs": ["[]", "['xxx', 'xxxAAA', 'xxx']", "['xxx', 'aaaxxy', 'xxxAAA', 'xxx']", "['grunt', 'prune']"]} diff --git a/tests/datasets/local_data/human_eval_small.jsonl 
b/tests/datasets/local_data/human_eval_small.jsonl index 93fb289093..a2b33bebe1 100644 --- a/tests/datasets/local_data/human_eval_small.jsonl +++ b/tests/datasets/local_data/human_eval_small.jsonl @@ -1,5 +1,5 @@ -{"task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "entry_point": "separate_paren_groups", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)", "('( ) (( )) (( )( ))',)"], "test_outputs": ["['(()())', '((()))', '()', '((())()())']", "['()', '(())', '((()))', '(((())))']", "['(()(())((())))']", "['()', '(())', '(()())']"], "language": "python"} {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n", "test_inputs": ["([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3)", "([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.95)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.8)", "([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1)", "([1.1, 2.2, 3.1, 4.1, 5.1], 1.0)", "([1.1, 2.2, 3.1, 4.1, 5.1], 0.5)"], "test_outputs": ["True", "False", "True", "False", "True", "True", "False"], "language": "python"} +{"task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> 
List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "entry_point": "separate_paren_groups", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)", "('( ) (( )) (( )( ))',)"], "test_outputs": ["['(()())', '((()))', '()', '((())()())']", "['()', '(())', '((()))', '(((())))']", "['(()(())((())))']", "['()', '(())', '(()())']"], "language": "python"} {"task_id": "HumanEval/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "entry_point": "truncate_number", "canonical_solution": " return number % 1.0\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate(3.5) == 0.5\n assert abs(candidate(1.33) - 0.33) < 1e-6\n assert abs(candidate(123.456) - 0.456) < 1e-6\n", "test_inputs": ["(3.5,)", "(1.33,)", "(123.456,)"], "test_outputs": ["0.5", "0.33000000000000007", "0.45600000000000307"], "language": "python"} {"task_id": "HumanEval/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", "entry_point": "below_zero", "canonical_solution": " balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([]) == False\n assert candidate([1, 2, -3, 1, 2, -3]) == False\n assert candidate([1, 2, -4, 5, 6]) == True\n assert candidate([1, -1, 2, -2, 5, -5, 4, -4]) == False\n assert candidate([1, -1, 2, -2, 5, -5, 4, -5]) == True\n assert candidate([1, -2, 2, -2, 5, -5, 4, -4]) == True\n", "test_inputs": ["([],)", "([1, 2, -3, 1, 2, -3],)", "([1, 2, -4, 5, 6],)", "([1, -1, 2, -2, 5, -5, 4, -4],)", "([1, -1, 2, -2, 5, -5, 4, -5],)", "([1, -2, 2, -2, 5, -5, 4, -4],)"], "test_outputs": ["False", "False", "True", "False", "True", "True"], "language": "python"} {"task_id": "HumanEval/4", "prompt": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", "entry_point": "mean_absolute_deviation", "canonical_solution": " mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert abs(candidate([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n", "test_inputs": ["([1.0, 2.0, 3.0],)", "([1.0, 2.0, 3.0, 4.0],)", "([1.0, 2.0, 3.0, 4.0, 5.0],)"], "test_outputs": ["0.6666666666666666", "1.0", "1.2"], "language": "python"} diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 2407271cb6..c0a31c3e19 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -19,8 +19,7 @@ _get_fewshot_sample_idxs, _make_padded_input, get_icl_task_dataloader) from composer.loggers import InMemoryLogger -from composer.metrics import (InContextLearningCodeTracingAveragePassRate, InContextLearningCodeTracingFullPassRate, - InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, +from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) from composer.models import HuggingFaceModel from composer.trainer import Trainer @@ -837,6 +836,97 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): generations_per_sample=1) +@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) +@pytest.mark.parametrize('num_fewshot', [2]) +def test_code_execution_prediction_task_dataloader(dataset_uri, tmp_path, num_fewshot): + pytest.importorskip('datasets') + + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + + tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') + dataset_uri = f'{local_data}/{dataset_uri}' + batch_size = 9 + seqlen = 2048 + + prompt_string = """Below is a list of code snippets, followed by a python function indicated by `python_fn`, a 
dictionary of arguments indicated by `inputs`, and the model outputs indicated by `outputs`. Your task is to predict the outputs that would be obtained from executing the final `python_fn` on the `inputs`.\n""" + + dl = get_icl_task_dataloader('code_execution_prediction', + dataset_uri, + tokenizer, + batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter='\n\n####\n\n', + destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + extra_delimiters={ + 'fn_delimiter': '\n####\n\npython_fn=', + 'input_delimiter': '\ninputs=', + 'output_delimiter': '\noutputs=' + }) + assert isinstance(dl, DataSpec) + + assert isinstance(dl.dataloader, DataLoader) # pyright + batch = next(dl.dataloader._get_iterator()) + + max_prompt_length = 0 + if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): + max_prompt_length = dl.dataloader.dataset.max_prompt_length + assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) + assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length) + assert batch['mode'] == 'generate' + # the maximum generation length from the small test data + assert batch['generation_length'] == seqlen - max_prompt_length + assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left + + decoded_batch = tokenizer.batch_decode(batch['input_ids']) + assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) + + if len(prompt_string) > 0: + assert all(item.count('Please code:\n') == 1 for item in decoded_batch) + + assert batch['labels'] == [ + " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", + ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', + ' return number % 1.0\n', + ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', + ' mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n', + ' if not numbers:\n return []\n\n result = []\n\n for n in numbers[:-1]:\n result.append(n)\n result.append(delimeter)\n\n result.append(numbers[-1])\n\n return result\n', + " def parse_paren_group(s):\n depth = 0\n max_depth = 0\n for c in s:\n if c == '(':\n depth += 1\n max_depth = max(depth, max_depth)\n else:\n depth -= 1\n\n return max_depth\n\n return [parse_paren_group(x) for x in paren_string.split(' ') if x]\n", + ' return [x for x in strings if substring in x]\n', + ' sum_value = 0\n prod_value = 1\n\n for n in numbers:\n sum_value += n\n prod_value *= n\n return sum_value, prod_value\n' + ] + + assert decoded_batch[0].endswith( + "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" + ) + assert decoded_batch[1].endswith( + "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" + ) + assert decoded_batch[2].endswith( + "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n" + ) + assert decoded_batch[3].endswith( + "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" + ) + assert decoded_batch[4].endswith( + "Code start: \nfrom typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" + ) + assert decoded_batch[5].endswith( + "Code start: \nfrom typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n \"\"\"\n" + ) + assert decoded_batch[6].endswith( + "Code start: \nfrom typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n \"\"\" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. 
(()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n \"\"\"\n" + ) + assert decoded_batch[7].endswith( + "Code start: \nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n" + ) + assert decoded_batch[8].endswith( + "from typing import List, Tuple\n\n\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n \"\"\" For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)\n \"\"\"\n" + ) + + @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 1, 2, 3]) @pytest.mark.parametrize('prompt_string', ['Please code:\n', '']) From 136e7defef9b7ceec18134e2ce5a834129f307d7 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Mon, 9 Oct 2023 17:34:24 -0400 Subject: [PATCH 03/10] add execution pred --- .../in_context_learning_evaluation.py | 230 +++++++++++++----- composer/metrics/__init__.py | 25 +- composer/metrics/nlp.py | 96 ++++++++ .../test_in_context_learning_datasets.py | 134 +++++----- 4 files changed, 346 insertions(+), 139 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index e5e6d7dc93..cb28a5f5c5 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -26,6 +26,7 @@ 'InContextLearningMultipleChoiceTaskDataset', 'InContextLearningCodeEvalDataset', 'InContextLearningQATaskDataset', + 'InContextLearningExecutionPredictionTaskDataset', 'get_icl_task_dataloader', ] @@ -308,19 +309,22 @@ class InContextLearningExecutionPredictionTaskDataset(Dataset): fewshot_random_seed (int): Random seed used to select fewshot examples """ - def __init__(self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - destination_path: str, - fewshot_random_seed: int, - fn_delimiter: str = '', - output_delimiter: str = '', - input_delimiter: str = ''): + def __init__( + self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + destination_path: str, + fewshot_random_seed: int, + generations_per_sample: int, + pass_at_k: int = 1, + top_p: Optional[float] = 0.95, + top_k: Optional[int] = 40, + ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -339,23 +343,47 @@ def __init__(self, 'canonical_solution': examples['canonical_solution'], 'test_inputs': examples['test_inputs'], 'test_outputs': examples['test_outputs'], - 'test': examples['test'] + 'test': examples['test'], + 'language': examples['language'], })) + + if generations_per_sample < pass_at_k: + raise ValueError( + f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k 
({pass_at_k}) for code evaluation.' + ) + + self.pass_at_k = pass_at_k + self.generations_per_sample = generations_per_sample + self.tokenizer = tokenizer self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id + self.padding_side = 'left' + self.top_p = top_p + self.top_k = top_k + self.tokenizer = tokenizer fewshot_rng = random.Random(fewshot_random_seed) - self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, fn_delimiter, - output_delimiter, input_delimiter, fewshot_rng) + self.max_answer_length = 0 + self.max_prompt_length = 0 + self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, fewshot_rng) @staticmethod def stringify_input(input_tuple): tup = eval(input_tuple) - res = '{' + ', '.join([f'arg_{i}: {json.dumps(x)}' for i, x in enumerate(tup)]) + '}' + res = ', '.join([f'{json.dumps(x)}' for i, x in enumerate(tup)]) return res - def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, fn_delimiter: str, - output_delimiter: str, input_delimiter: str, fewshot_rng: random.Random): + @staticmethod + def _write_assert_statement(language, fn_name, input_val, output_val): + if language == 'python': + if output_val is not None: + return f'\n\nassert {fn_name}({input_val}) == {output_val}' + else: + return f'\n\nassert {fn_name}({input_val}) ==' + else: + raise ValueError(f'Unsupported language: {language}') + + def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, fewshot_rng: random.Random): """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. Each task consists of a context and a continuation as well as an optional prompt and optional list of @@ -372,9 +400,11 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) """ examples = [] + max_answer_length = 0 # this is used to determine the expected generation length + max_prompt_length = 0 # this is used to determine padding for sample_idx in tqdm(range(len(self.samples))): - preamble = prompt_string + preamble = f'"""\n{prompt_string}\n"""' if num_fewshot > 0: fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) @@ -387,31 +417,52 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: self.samples[fewshot_idx]['test_outputs'], ) test_idx = random.choice(range(0, len(test_in))) - example = f"""{example_delimiter}{prompt}{soln}{fn_delimiter}{entry_point}{input_delimiter}{self.stringify_input(test_in[test_idx])}{output_delimiter}{test_out[test_idx]}""" + assert_stmt = self._write_assert_statement(self.samples[sample_idx]['language'], entry_point, + self.stringify_input(test_in[test_idx]), + test_out[test_idx]) + example = f"""{example_delimiter}{prompt}{soln}{assert_stmt}""" preamble += example - - prompt, soln, entry_point, test_in, test_out = ( + prompt, soln, entry_point, test_in, test_out, language = ( self.samples[sample_idx]['prompt'], self.samples[sample_idx]['canonical_solution'], self.samples[sample_idx]['entry_point'], self.samples[sample_idx]['test_inputs'], self.samples[sample_idx]['test_outputs'], + self.samples[sample_idx]['language'], ) - for inp, out in zip(test_in, test_out): encoded_example = {} - context = f"""{example_delimiter}{prompt}{soln}{fn_delimiter}{entry_point}{input_delimiter}{self.stringify_input(inp)}{output_delimiter}""" - out = f' {out}' - 
encoded_example['preamble'] = self.tokenizer( - preamble - ) # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) - encoded_example['context'] = self.tokenizer(context, add_special_tokens=False) - encoded_example['continuation'] = self.tokenizer(out, add_special_tokens=False) - encoded_example['task_id'] = self.samples[sample_idx]['task_id'] - + assert_stmt = self._write_assert_statement( + language, + entry_point, + self.stringify_input(inp), + None # final assert statement is incomplete + ) + context = f"""{example_delimiter}{prompt}{soln}{assert_stmt}""" + + # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) + encoded_example['preamble'] = self.tokenizer(preamble) + # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt + if self.tokenizer.eos_token_id is not None and len( + encoded_example['preamble'] + ['input_ids']) > 1 and encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id: + encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] + + encoded_example['prompt'] = self.tokenizer(context, add_special_tokens=False) + encoded_example['prompt_text'] = self.samples[sample_idx]['prompt'] + encoded_example['language'] = self.samples[sample_idx]['language'] + encoded_example['expected_output'] = out examples.append(encoded_example) + max_answer_length = max(max_answer_length, + len(self.tokenizer(out, add_special_tokens=False)['input_ids'])) + max_prompt_length = max( + max_prompt_length, + len(encoded_example['preamble']['input_ids'] + encoded_example['prompt']['input_ids'])) + examples.append(encoded_example) + self.max_answer_length = max_answer_length + self.max_prompt_length = max_prompt_length return examples def __getitem__(self, index): @@ -421,35 +472,77 @@ def __len__(self): return len(self.encoded_dataset) def collate_fn(self, data): - inputs = [] - continuation_indices = [] - task_ids = [] - for data_pair in data: - preamble, context, continuation = (data_pair['preamble'], data_pair['context'], data_pair['continuation']) - task_ids.append(data_pair['task_id']) - context_enc = preamble['input_ids'] + context['input_ids'] - continuation_enc = continuation['input_ids'] - - inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, - self.pad_tok_id) + inputs, prompts, outputs, languages = [], [], [], [] + for sample in data: + preamble, prompt, language = ( + sample['preamble'], + sample['prompt'], + sample['language'], + ) + context_enc = preamble['input_ids'] + prompt['input_ids'] + inp, _ = _make_padded_input(context_enc, [], + self.max_prompt_length, + self.pad_tok_id, + padding_side=self.padding_side) inputs.append(inp) - continuation_indices.append(continuation_span) + outputs.append(sample['expected_output']) + prompts.append(self.tokenizer.decode(context_enc)) + languages.append(language) batch = { 'input_ids': torch.stack(inputs), - 'continuation_indices': continuation_indices, - 'mode': 'icl_task', - 'labels': torch.stack(inputs), - 'task_ids': task_ids + 'mode': 'generate', + 'prompts': prompts, # list of prompts + 'languages': languages, # list of languages + 'pass_at_k': self.pass_at_k, + 'generation_length': self.max_answer_length, + 'labels': outputs, + 'generation_kwargs': { + 'pad_token_id': self.pad_tok_id, + 'num_beams': 1, # single beam + 
'num_return_sequences': self.generations_per_sample, # how many gens per prompt + 'do_sample': True, + 'top_p': self.top_p, + 'top_k': self.top_k, + 'use_cache': True, + } } - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch def get_num_samples_in_batch(self, batch) -> int: + # Count number of inputs in the batch return batch['input_ids'].shape[0] + def split_batch(self, batch: Any, microbatch_size: int): + # Don't split kwargs that don't change + # Normally split torch tensors + # List split lists of strings + no_split = ['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'] + normal_split = ['input_ids', 'attention_mask'] + list_split = [ + 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', + 'languages' + ] + chunked = {} + for k, v in batch.items(): + if k in no_split: + # Defer broadcasting until we know num_chunks + pass + elif k in list_split: + chunked[k] = _split_list(v, microbatch_size) + elif k in normal_split: + chunked[k] = _default_split_batch(v, microbatch_size) + else: + raise ValueError(f'Unexpected key {k}') + num_chunks = len(chunked['input_ids']) + for k, v in batch.items(): + if isinstance(v, (int, float, str, bool, dict)): + chunked[k] = [v] * num_chunks + + return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] + class InContextLearningLMTaskDataset(Dataset): """A dataset that construct batches for in-context learning language modeling evaluation @@ -1273,10 +1366,8 @@ def build_icl_dataloader( question_prelimiter: str = '', # e.g. 'Question: ' fewshot_random_seed: int = 1234, pass_at_k: int = 1, - generations_per_sample: int = 1, - extra_delimiters: Optional[dict] = None) -> DataSpec: - if extra_delimiters is None: - extra_delimiters = {} + generations_per_sample: int = 1) -> DataSpec: + if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, tokenizer, @@ -1352,7 +1443,8 @@ def build_icl_dataloader( example_delimiter, destination_path=destination_path, fewshot_random_seed=fewshot_random_seed, - **extra_delimiters) + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1437,8 +1529,7 @@ def get_icl_task_dataloader( fewshot_random_seed: int = 1234, pass_at_k: int = 1, generations_per_sample: int = 1, - has_categories: bool = False, - extra_delimiters: Optional[dict] = None) -> Union[DataSpec, Dict[str, DataSpec]]: + has_categories: bool = False) -> Union[DataSpec, Dict[str, DataSpec]]: """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. 
An example usage is below: >>> dl = get_icl_task_dataloader( @@ -1491,15 +1582,26 @@ def get_icl_task_dataloader( categories = sorted(output_files.keys()) for category in categories: partition_uri = output_files[category] - result_dls[category] = build_icl_dataloader(icl_task_type, partition_uri, tokenizer, batch_size, - max_seq_len, pad_tok_id, num_fewshot, prompt_string, - example_delimiter, continuation_delimiter, - partition_uri + '_tmp', question_prelimiter, - fewshot_random_seed, pass_at_k, generations_per_sample, - extra_delimiters) + result_dls[category] = build_icl_dataloader( + icl_task_type, + partition_uri, + tokenizer, + batch_size, + max_seq_len, + pad_tok_id, + num_fewshot, + prompt_string, + example_delimiter, + continuation_delimiter, + partition_uri + '_tmp', + question_prelimiter, + fewshot_random_seed, + pass_at_k, + generations_per_sample, + ) return result_dls else: return build_icl_dataloader(icl_task_type, dataset_uri, tokenizer, batch_size, max_seq_len, pad_tok_id, num_fewshot, prompt_string, example_delimiter, continuation_delimiter, destination_path, question_prelimiter, fewshot_random_seed, pass_at_k, - generations_per_sample, extra_delimiters) + generations_per_sample) diff --git a/composer/metrics/__init__.py b/composer/metrics/__init__.py index 26af16f5b5..5cc35994dc 100644 --- a/composer/metrics/__init__.py +++ b/composer/metrics/__init__.py @@ -5,29 +5,19 @@ from composer.metrics.map import MAP from composer.metrics.metrics import CrossEntropy, Dice, LossMetric, MIoU -from composer.metrics.nlp import (BinaryF1Score, InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, +from composer.metrics.nlp import (BinaryF1Score, InContextLearningCodeEvalAccuracy, + InContextLearningCodeExecutionPredictionAccuracy, InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError, InContextLearningMCExpectedCalibrationError, InContextLearningMetric, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy, LanguageCrossEntropy, LanguagePerplexity, MaskedAccuracy) __all__ = [ - 'MAP', - 'MIoU', - 'Dice', - 'CrossEntropy', - 'LossMetric', - 'BinaryF1Score', - 'LanguageCrossEntropy', - 'MaskedAccuracy', - 'LanguagePerplexity', - 'InContextLearningLMAccuracy', - 'InContextLearningMultipleChoiceAccuracy', - 'InContextLearningQAAccuracy', - 'InContextLearningMCExpectedCalibrationError', - 'InContextLearningLMExpectedCalibrationError', - 'InContextLearningMetric', - 'InContextLearningCodeEvalAccuracy', + 'MAP', 'MIoU', 'Dice', 'CrossEntropy', 'LossMetric', 'BinaryF1Score', 'LanguageCrossEntropy', 'MaskedAccuracy', + 'LanguagePerplexity', 'InContextLearningLMAccuracy', 'InContextLearningMultipleChoiceAccuracy', + 'InContextLearningQAAccuracy', 'InContextLearningMCExpectedCalibrationError', + 'InContextLearningLMExpectedCalibrationError', 'InContextLearningMetric', 'InContextLearningCodeEvalAccuracy', + 'InContextLearningCodeExecutionPredictionAccuracy' ] METRIC_DEFAULT_CTORS = { @@ -35,4 +25,5 @@ 'InContextLearningMultipleChoiceAccuracy': InContextLearningMultipleChoiceAccuracy, 'InContextLearningQAAccuracy': InContextLearningQAAccuracy, 'InContextLearningCodeEvalAccuracy': InContextLearningCodeEvalAccuracy, + 'InContextLearningCodeExecutionPredictionAccuracy': InContextLearningCodeExecutionPredictionAccuracy, } diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index bf1292f778..c22960c2d7 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -4,6 +4,7 @@ """A collection of common torchmetrics for NLP tasks.""" import 
logging +import multiprocessing import os import re import string @@ -32,6 +33,7 @@ 'InContextLearningMultipleChoiceAccuracy', 'InContextLearningQAAccuracy', 'InContextLearningCodeEvalAccuracy', + 'InContextLearningCodeExecutionPredictionAccuracy', 'BinaryF1Score', 'LanguageCrossEntropy', 'MaskedAccuracy', @@ -642,3 +644,97 @@ def compute(self): assert isinstance(self.correct, Tensor) assert isinstance(self.total, Tensor) return self.correct / self.total + + +class InContextLearningCodeExecutionPredictionAccuracy(InContextLearningCodeEvalAccuracy): + r"""Computes accuracy for In-context learning (ICL) code evaluation tasks. + + ICL code eval tasks consist of some number of example code eval tasks (referred to as the 'context'), followed by a test task where the model must + complete the code, where we term the code completion a 'continuation'. + + In each case, the model constructs a given number of continuations (termed pass@K for K continuations), and each continuation is run against a set of test cases. The model is considered + correct if at least one of the proposed continuations passes all the test cases. + + Runs on AWS Lambdas by default. + + Adds metric state variables: + correct (float): The number of instances where the predictions passed all the test cases. + total (float): The number of total instances that were predicted. + + Args: + dist_sync_on_step (bool, optional): Synchronize metric state across processes at + each forward() before returning the value at the step. Default: ``False``. + """ + + # Make torchmetrics call update only once + full_state_update = False + + def __init__(self, dist_sync_on_step: bool = False): + # state from multiple processes + super().__init__(dist_sync_on_step=dist_sync_on_step) + # this is used to make ExecutionPrediction compatible with regular HumanEval + self.dummy_entrypoint = """def dummy_entrypoint(dummy_inpt):\n return\n""" '' + + def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]): + """Updates the pass@k accuracy of code generation. + + Given a batch of prompts, test cases, and code generations, evaluates the code generations + against the test cases and adds the pass@k accuracy of the batch to the values so far. + + Args: + batch (Dict[str, Any]): A batch of data produced by the InContextLearningCodeEvalDataset, with + the prompt, test cases, and entry points. This will be a dictionary that must have the following + arguments: + { + 'prompts': List[str], + 'languages': List[str], + 'generation_kwargs': Dict[str, Any] + } + outputs (List[str]): A list of code generations in the format of HF generate with beam search, + which is a list of strings in groups of beam_size, e.g. for beam size 2 and batch size 2, the list + will be of the format [prompt 1 gen 1, prompt 1 gen 2, prompt 2 gen 1, prompt 2 gen 2] + labels (List[str]): A list of the correct code generations, for compatibility with existing HF generate + functionalities. This is not used.
+ """ + del labels # never used + client = self.get_client() + + pass_at_k = batch['pass_at_k'] + num_generations = batch['generation_kwargs']['num_return_sequences'] + processed_outputs = [ + outputs[i * num_generations:(i + 1) * num_generations] for i in range(len(batch['prompts'])) + ] + payloads = [] + for sample_outputs, sample_prompt, language in zip(processed_outputs, batch['prompts'], batch['languages']): + self.total += torch.tensor(1.0) + prompt_payload = [] + for code_gen in sample_outputs: + code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0] # remove everything after function ends + final_code = self.dummy_entrypoint + sample_prompt + code_gen # combine prompt with the code generation + generation_payload = [] + + payload = { + 'code': final_code, + 'input': 'None,', + 'output': 'None', + 'entry_point': 'dummy_entrypoint', + 'language': language, + } + generation_payload.append(payload) + + prompt_payload.append(generation_payload) + payloads.append(prompt_payload) + + results = client.invoke(payloads) + + for prompt in results: + num_correct = 0 + for generation in prompt: + correct = all(generation) + if correct: + num_correct += 1 + + pass_at_k_rate = self.estimator(num_generations, num_correct, pass_at_k) + self.correct += torch.tensor(pass_at_k_rate) + + client.close() # pyright: ignore [reportOptionalMemberAccess] diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index c0a31c3e19..bd935c945b 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -16,11 +16,13 @@ from composer import Evaluator from composer.core import DataSpec from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, + InContextLearningExecutionPredictionTaskDataset, _get_fewshot_sample_idxs, _make_padded_input, get_icl_task_dataloader) from composer.loggers import InMemoryLogger -from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, - InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) +from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningCodeExecutionPredictionAccuracy, + InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, + InContextLearningQAAccuracy) from composer.models import HuggingFaceModel from composer.trainer import Trainer from composer.utils import dist, reproducibility @@ -848,82 +850,47 @@ def test_code_execution_prediction_task_dataloader(dataset_uri, tmp_path, num_fe batch_size = 9 seqlen = 2048 - prompt_string = """Below is a list of code snippets, followed by a python function indicated by `python_fn`, a dictionary of arguments indicated by `inputs`, and the model outputs indicated by `outputs`. Your task is to predict the outputs that would be obtained from executing the final `python_fn` on the `inputs`.\n""" + prompt_string = """Below is a list of python functions each followed by a correct assert statement testing its behavior. 
The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.""" - dl = get_icl_task_dataloader('code_execution_prediction', - dataset_uri, - tokenizer, - batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n\n####\n\n', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - extra_delimiters={ - 'fn_delimiter': '\n####\n\npython_fn=', - 'input_delimiter': '\ninputs=', - 'output_delimiter': '\noutputs=' - }) + dl = get_icl_task_dataloader( + 'code_execution_prediction', + dataset_uri, + tokenizer, + batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter='\n\n####\n\n', + destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright batch = next(dl.dataloader._get_iterator()) - max_prompt_length = 0 - if isinstance(dl.dataloader.dataset, InContextLearningCodeEvalDataset): + if isinstance(dl.dataloader.dataset, InContextLearningExecutionPredictionTaskDataset): max_prompt_length = dl.dataloader.dataset.max_prompt_length + max_answer_length = dl.dataloader.dataset.max_answer_length assert tuple(batch['input_ids'].shape) == (batch_size, max_prompt_length) assert tuple(batch['attention_mask'].shape) == (batch_size, max_prompt_length) assert batch['mode'] == 'generate' # the maximum generation length from the small test data - assert batch['generation_length'] == seqlen - max_prompt_length - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left + assert batch['generation_length'] == max_answer_length decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) + assert all((item.count('assert') - prompt_string.count('assert')) == num_fewshot + 1 for item in decoded_batch) - if len(prompt_string) > 0: - assert all(item.count('Please code:\n') == 1 for item in decoded_batch) - - assert batch['labels'] == [ - " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", - ' for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n', - ' return number % 1.0\n', - ' balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n', - ' mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n', - ' if not numbers:\n return []\n\n result = []\n\n for n in numbers[:-1]:\n result.append(n)\n result.append(delimeter)\n\n result.append(numbers[-1])\n\n return result\n', - " def parse_paren_group(s):\n depth = 0\n max_depth = 0\n for c in s:\n if c == '(':\n depth += 1\n max_depth = max(depth, max_depth)\n else:\n depth -= 1\n\n return max_depth\n\n return [parse_paren_group(x) for x in paren_string.split(' ') if x]\n", - ' return [x for x in strings if substring in x]\n', - ' sum_value = 0\n prod_value = 1\n\n for n in numbers:\n sum_value += n\n prod_value *= n\n 
return sum_value, prod_value\n' - ] + assert batch['labels'][:3] == ['True', 'True', 'False'] assert decoded_batch[0].endswith( - "Code start: \nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n" + '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\nassert separate_paren_groups("( ) (( )) (( )( ))") == [\'()\', \'(())\', \'(()())\']\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\nassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\nassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) ==' ) assert decoded_batch[1].endswith( - "Code start: \nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" + '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. 
The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\nassert separate_paren_groups("( ) (( )) (( )( ))") == [\'()\', \'(())\', \'(()())\']\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\nassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\nassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) ==' ) assert decoded_batch[2].endswith( - "Code start: \n\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n" - ) - assert decoded_batch[3].endswith( - "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" - ) - assert decoded_batch[4].endswith( - "Code start: \nfrom typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n" - ) - assert decoded_batch[5].endswith( - "Code start: \nfrom typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n \"\"\" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n \"\"\"\n" - ) - assert decoded_batch[6].endswith( - "Code start: \nfrom typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n \"\"\" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. (()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n \"\"\"\n" - ) - assert decoded_batch[7].endswith( - "Code start: \nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n \"\"\"\n" - ) - assert decoded_batch[8].endswith( - "from typing import List, Tuple\n\n\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n \"\"\" For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)\n \"\"\"\n" + '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\nassert separate_paren_groups("( ) (( )) (( )( ))") == [\'()\', \'(())\', \'(()())\']\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\nassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\nassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) ==' ) @@ -1428,6 +1395,57 @@ def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeEvalAccuracy'][0][1].item() == 0 +@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) +@device('gpu') +@world_size(1, 2) +@pytest.mark.parametrize('num_fewshot', [0, 2]) +@pytest.mark.parametrize('generations_per_sample', [1]) +@pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') +def test_code_execution_prediction_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, + tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path, + generations_per_sample): + pytest.importorskip('datasets') + torch.cuda.empty_cache() + monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') + in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/{dataset_uri}' + tokenizer = tiny_gpt2_tokenizer + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + dl = get_icl_task_dataloader( + 'code_execution_prediction', + dataset_uri, + tokenizer, + 2, + max_seq_len=150 if num_fewshot == 0 else 450, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=': ', + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + generations_per_sample=generations_per_sample, + ) + + evaluator = Evaluator(label='humaneval', + dataloader=dl, + metric_names=['InContextLearningCodeExecutionPredictionAccuracy']) + model = 
HuggingFaceModel( + model=tiny_gpt2_model, + tokenizer=tiny_gpt2_tokenizer, + eval_metrics=[InContextLearningCodeExecutionPredictionAccuracy()], + use_logits=True, + ) + + trainer = Trainer(model=model, max_duration='1ba', loggers=in_memory_logger) + torch.use_deterministic_algorithms(False) + trainer.eval(eval_dataloader=evaluator, subset_num_batches=2) + torch.use_deterministic_algorithms(True) + assert 'metrics/humaneval/InContextLearningCodeExecutionPredictionAccuracy' in in_memory_logger.data.keys() + assert in_memory_logger.data['metrics/humaneval/InContextLearningCodeExecutionPredictionAccuracy'][0][1].item() == 0 + + @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') From 591a617ad6ab124177ad3f82318a630d101d13f4 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Tue, 10 Oct 2023 12:37:24 -0400 Subject: [PATCH 04/10] pre commit --- composer/datasets/in_context_learning_evaluation.py | 2 +- composer/metrics/nlp.py | 1 - tests/datasets/test_in_context_learning_datasets.py | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index cb28a5f5c5..1f46dfd273 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -370,7 +370,7 @@ def __init__( @staticmethod def stringify_input(input_tuple): tup = eval(input_tuple) - res = ', '.join([f'{json.dumps(x)}' for i, x in enumerate(tup)]) + res = ', '.join([f'{json.dumps(x)}' for x in tup]) return res @staticmethod diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index c22960c2d7..ce1b8f01d8 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -4,7 +4,6 @@ """A collection of common torchmetrics for NLP tasks.""" import logging -import multiprocessing import os import re import string diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index bd935c945b..5ac3f0b56b 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -869,6 +869,7 @@ def test_code_execution_prediction_task_dataloader(dataset_uri, tmp_path, num_fe assert isinstance(dl.dataloader, DataLoader) # pyright batch = next(dl.dataloader._get_iterator()) max_prompt_length = 0 + max_answer_length = 0 if isinstance(dl.dataloader.dataset, InContextLearningExecutionPredictionTaskDataset): max_prompt_length = dl.dataloader.dataset.max_prompt_length max_answer_length = dl.dataloader.dataset.max_answer_length From 1b6c45eb9c753375af1330aece58daf2e10dbf28 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Mon, 16 Oct 2023 13:53:10 -0400 Subject: [PATCH 05/10] fix merge --- composer/datasets/in_context_learning_evaluation.py | 4 ++-- composer/metrics/nlp.py | 9 ++++----- tests/datasets/test_in_context_learning_datasets.py | 9 ++++----- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 12f05f2a22..035ddae177 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -428,9 +428,9 @@ def stringify_input(input_tuple): def _write_assert_statement(language, fn_name, input_val, output_val): if language == 'python': if output_val is not None: - 
return f'\n\nassert {fn_name}({input_val}) == {output_val}' + return f'\n\ndef test():\n\tassert {fn_name}({input_val}) == {output_val}' else: - return f'\n\nassert {fn_name}({input_val}) ==' + return f'\n\ndef test():\n\tassert {fn_name}({input_val}) ==' else: raise ValueError(f'Unsupported language: {language}') diff --git a/composer/metrics/nlp.py b/composer/metrics/nlp.py index 7c8866cf3d..0b4d79b591 100644 --- a/composer/metrics/nlp.py +++ b/composer/metrics/nlp.py @@ -679,8 +679,6 @@ class InContextLearningCodeExecutionPredictionAccuracy(InContextLearningCodeEval def __init__(self, dist_sync_on_step: bool = False): # state from multiple processes super().__init__(dist_sync_on_step=dist_sync_on_step) - # this is used to make ExecutionPrediction compatible with regular HumanEval - self.dummy_entrypoint = """def dummy_entrypoint(dummy_inpt):\n return\n""" '' def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]): """Updates the pass@k accuracy of code generation. @@ -717,16 +715,17 @@ def update(self, batch: Dict[str, Any], outputs: List[str], labels: List[str]): prompt_payload = [] for code_gen in sample_outputs: code_gen = re.split(r'\n[A-Za-z0-9#`]', code_gen)[0] # remove everything after function ends - final_code = self.dummy_entrypoint + sample_prompt + code_gen # combine prompt with the code generation + final_code = sample_prompt + code_gen # combine prompt with the code generation generation_payload = [] payload = { 'code': final_code, - 'input': 'None,', + 'input': '""', 'output': 'None', - 'entry_point': 'dummy_entrypoint', + 'entry_point': 'test', 'language': language, } + generation_payload.append(payload) prompt_payload.append(generation_payload) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 6b32a6f78b..2bc981b6a6 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1548,14 +1548,13 @@ def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@device('gpu') -@world_size(1, 2) +@device('cpu') +# @world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [0, 2]) @pytest.mark.parametrize('generations_per_sample', [1]) @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') -def test_code_execution_prediction_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, - tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path, - generations_per_sample): +def test_code_execution_prediction_task_evaluation(monkeypatch, device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, + tiny_gpt2_model, tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') From e4b6fdae73ec14c93ef2bd3c22c7f7b8dbfd432a Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Tue, 31 Oct 2023 14:55:21 -0400 Subject: [PATCH 06/10] restore data --- tests/datasets/local_data/human_eval_small.jsonl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/datasets/local_data/human_eval_small.jsonl b/tests/datasets/local_data/human_eval_small.jsonl index a2b33bebe1..93fb289093 100644 --- a/tests/datasets/local_data/human_eval_small.jsonl +++ b/tests/datasets/local_data/human_eval_small.jsonl @@ -1,5 +1,5 @@ -{"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> 
bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n", "test_inputs": ["([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3)", "([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.95)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.8)", "([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1)", "([1.1, 2.2, 3.1, 4.1, 5.1], 1.0)", "([1.1, 2.2, 3.1, 4.1, 5.1], 0.5)"], "test_outputs": ["True", "False", "True", "False", "True", "True", "False"], "language": "python"} {"task_id": "HumanEval/1", "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n", "entry_point": "separate_paren_groups", "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n", "test_inputs": ["('(()()) ((())) () ((())()())',)", "('() (()) ((())) (((())))',)", "('(()(())((())))',)", "('( ) (( )) (( )( ))',)"], "test_outputs": ["['(()())', '((()))', '()', '((())()())']", "['()', '(())', '((()))', '(((())))']", "['(()(())((())))']", "['()', '(())', '(()())']"], "language": "python"} +{"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in 
enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n", "test_inputs": ["([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3)", "([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.95)", "([1.0, 2.0, 5.9, 4.0, 5.0], 0.8)", "([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1)", "([1.1, 2.2, 3.1, 4.1, 5.1], 1.0)", "([1.1, 2.2, 3.1, 4.1, 5.1], 0.5)"], "test_outputs": ["True", "False", "True", "False", "True", "True", "False"], "language": "python"} {"task_id": "HumanEval/2", "prompt": "\n\ndef truncate_number(number: float) -> float:\n \"\"\" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n \"\"\"\n", "entry_point": "truncate_number", "canonical_solution": " return number % 1.0\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate(3.5) == 0.5\n assert abs(candidate(1.33) - 0.33) < 1e-6\n assert abs(candidate(123.456) - 0.456) < 1e-6\n", "test_inputs": ["(3.5,)", "(1.33,)", "(123.456,)"], "test_outputs": ["0.5", "0.33000000000000007", "0.45600000000000307"], "language": "python"} {"task_id": "HumanEval/3", "prompt": "from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n", "entry_point": "below_zero", "canonical_solution": " balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([]) == False\n assert candidate([1, 2, -3, 1, 2, -3]) == False\n assert candidate([1, 2, -4, 5, 6]) == True\n assert candidate([1, -1, 2, -2, 5, -5, 4, -4]) == False\n assert candidate([1, -1, 2, -2, 5, -5, 4, -5]) == True\n assert candidate([1, -2, 2, -2, 5, -5, 4, -4]) == True\n", "test_inputs": ["([],)", "([1, 2, -3, 1, 2, -3],)", "([1, 2, -4, 5, 6],)", "([1, -1, 2, -2, 5, -5, 4, -4],)", "([1, -1, 2, -2, 5, -5, 4, -5],)", "([1, -2, 2, -2, 5, -5, 4, -4],)"], "test_outputs": ["False", "False", "True", "False", "True", "True"], "language": "python"} {"task_id": "HumanEval/4", "prompt": "from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n \"\"\" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n \"\"\"\n", "entry_point": "mean_absolute_deviation", "canonical_solution": " mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert abs(candidate([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6\n assert abs(candidate([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6\n\n", "test_inputs": ["([1.0, 2.0, 3.0],)", "([1.0, 2.0, 3.0, 4.0],)", "([1.0, 2.0, 3.0, 4.0, 5.0],)"], "test_outputs": ["0.6666666666666666", "1.0", "1.2"], "language": "python"} From 0188d1c1d28206a10905369d9e46a7e54ebd6d88 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Tue, 31 Oct 2023 15:09:20 -0400 Subject: [PATCH 07/10] fix bug --- .../datasets/in_context_learning_evaluation.py | 3 ++- .../datasets/test_in_context_learning_datasets.py | 15 +++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 035ddae177..e0b30d8f37 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -474,6 +474,7 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: example = f"""{example_delimiter}{prompt}{soln}{assert_stmt}""" preamble += example + prompt, soln, entry_point, test_in, test_out, language = ( self.samples[sample_idx]['prompt'], self.samples[sample_idx]['canonical_solution'], @@ -511,7 +512,7 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: max_prompt_length = max( max_prompt_length, len(encoded_example['preamble']['input_ids'] + encoded_example['prompt']['input_ids'])) - examples.append(encoded_example) + self.max_answer_length = max_answer_length self.max_prompt_length = max_prompt_length return examples diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 2bc981b6a6..4c4f69929d 100644 --- 
a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -911,7 +911,7 @@ def test_code_execution_prediction_task_dataloader(dataset_uri, tmp_path, num_fe tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 9 + batch_size = 8 seqlen = 2048 prompt_string = """Below is a list of python functions each followed by a correct assert statement testing its behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.""" @@ -945,17 +945,16 @@ def test_code_execution_prediction_task_dataloader(dataset_uri, tmp_path, num_fe decoded_batch = tokenizer.batch_decode(batch['input_ids']) assert all((item.count('assert') - prompt_string.count('assert')) == num_fewshot + 1 for item in decoded_batch) - - assert batch['labels'][:3] == ['True', 'True', 'False'] + assert batch['labels'] == [ + "['(()())', '((()))', '()', '((())()())']", "['()', '(())', '((()))', '(((())))']", "['(()(())((())))']", + "['()', '(())', '(()())']", 'True', 'False', 'True', 'False' + ] assert decoded_batch[0].endswith( - '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\nassert separate_paren_groups("( ) (( )) (( )( ))") == [\'()\', \'(())\', \'(()())\']\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\nassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\nassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) ==' + '"""\nBelow is a list of python functions each followed by a correct assert statement testing its 
behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\ndef test():\n\tassert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\ndef test():\n\tassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\ndef test():\n\tassert separate_paren_groups("(()()) ((())) () ((())()())") ==' ) assert decoded_batch[1].endswith( - '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\nassert separate_paren_groups("( ) (( )) (( )( ))") == [\'()\', \'(())\', \'(()())\']\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\nassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\nassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) ==' - ) - assert decoded_batch[2].endswith( - '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\nassert separate_paren_groups("( ) (( )) (( )( ))") == [\'()\', \'(())\', \'(()())\']\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\nassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\nassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) ==' + '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\ndef test():\n\tassert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\ndef test():\n\tassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. 
Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\ndef test():\n\tassert separate_paren_groups("() (()) ((())) (((())))") ==' ) From e215f20588d82832eae45b550209541958a63106 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Fri, 3 Nov 2023 15:23:56 -0700 Subject: [PATCH 08/10] fix indexing --- composer/datasets/in_context_learning_evaluation.py | 10 +++++----- tests/datasets/test_in_context_learning_datasets.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index e0b30d8f37..1bcb573b4b 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -425,12 +425,12 @@ def stringify_input(input_tuple): return res @staticmethod - def _write_assert_statement(language, fn_name, input_val, output_val): + def _write_assert_statement(language, fn_name, input_val, output_val, fewshot_idx = ""): if language == 'python': if output_val is not None: - return f'\n\ndef test():\n\tassert {fn_name}({input_val}) == {output_val}' + return f'\n\ndef test{fewshot_idx}():\n\tassert {fn_name}({input_val}) == {output_val}' else: - return f'\n\ndef test():\n\tassert {fn_name}({input_val}) ==' + return f'\n\ndef test{fewshot_idx}():\n\tassert {fn_name}({input_val}) ==' else: raise ValueError(f'Unsupported language: {language}') @@ -459,7 +459,7 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: if num_fewshot > 0: fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) - for fewshot_idx in fewshot_idxs: + for idx, fewshot_idx in enumerate(fewshot_idxs): prompt, soln, entry_point, test_in, test_out = ( self.samples[fewshot_idx]['prompt'], self.samples[fewshot_idx]['canonical_solution'], @@ -470,7 +470,7 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: test_idx = random.choice(range(0, len(test_in))) assert_stmt = self._write_assert_statement(self.samples[sample_idx]['language'], entry_point, self.stringify_input(test_in[test_idx]), - test_out[test_idx]) + test_out[test_idx], idx) example = f"""{example_delimiter}{prompt}{soln}{assert_stmt}""" preamble += example diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index d72d0b2479..d1aadfc396 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1559,12 +1559,12 @@ def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@device('cpu') -# @world_size(1, 2) +@device('gpu') +@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [0, 2]) @pytest.mark.parametrize('generations_per_sample', [1]) @pytest.mark.filterwarnings(r'ignore: Input length 
of input_ids is') -def test_code_execution_prediction_task_evaluation(monkeypatch, device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, +def test_code_execution_prediction_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() From 03793665f047e127ff506bc065f1d8aaa8020b68 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Mon, 4 Dec 2023 14:55:35 -0500 Subject: [PATCH 09/10] fix rng --- composer/datasets/in_context_learning_evaluation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 1bcb573b4b..561eb8f6b1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -467,7 +467,8 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: self.samples[fewshot_idx]['test_inputs'], self.samples[fewshot_idx]['test_outputs'], ) - test_idx = random.choice(range(0, len(test_in))) + + test_idx = fewshot_rng.choice(range(0, len(test_in))) assert_stmt = self._write_assert_statement(self.samples[sample_idx]['language'], entry_point, self.stringify_input(test_in[test_idx]), test_out[test_idx], idx) From 0ec201e21013222e57db8ea206b9a8762d60591e Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Tue, 5 Dec 2023 14:49:21 -0500 Subject: [PATCH 10/10] finish --- composer/datasets/in_context_learning_evaluation.py | 6 +++--- tests/datasets/test_in_context_learning_datasets.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 561eb8f6b1..2b43a18455 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -425,7 +425,7 @@ def stringify_input(input_tuple): return res @staticmethod - def _write_assert_statement(language, fn_name, input_val, output_val, fewshot_idx = ""): + def _write_assert_statement(language, fn_name, input_val, output_val, fewshot_idx=''): if language == 'python': if output_val is not None: return f'\n\ndef test{fewshot_idx}():\n\tassert {fn_name}({input_val}) == {output_val}' @@ -467,11 +467,11 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: self.samples[fewshot_idx]['test_inputs'], self.samples[fewshot_idx]['test_outputs'], ) - + test_idx = fewshot_rng.choice(range(0, len(test_in))) assert_stmt = self._write_assert_statement(self.samples[sample_idx]['language'], entry_point, self.stringify_input(test_in[test_idx]), - test_out[test_idx], idx) + test_out[test_idx], str(idx)) example = f"""{example_delimiter}{prompt}{soln}{assert_stmt}""" preamble += example diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index d1aadfc396..4d86e0a575 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -951,10 +951,10 @@ def test_code_execution_prediction_task_dataloader(dataset_uri, tmp_path, num_fe ] assert decoded_batch[0].endswith( - '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. 
The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\ndef test():\n\tassert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\ndef test():\n\tassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\ndef test():\n\tassert separate_paren_groups("(()()) ((())) () ((())()())") ==' + '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. 
The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\ndef test0():\n\tassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\ndef test1():\n\tassert filter_by_substring([], "john") == []\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\ndef test():\n\tassert separate_paren_groups("(()()) ((())) () ((())()())") ==' ) assert decoded_batch[1].endswith( - '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. 
The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\ndef test():\n\tassert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\ndef test():\n\tassert filter_by_substring(["grunt", "trumpet", "prune", "gruesome"], "run") == [\'grunt\', \'prune\']\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\ndef test():\n\tassert separate_paren_groups("() (()) ((())) (((())))") ==' + '"""\nBelow is a list of python functions each followed by a correct assert statement testing its behavior. 
The final assert statement is incomplete; your task is to complete the final assert statement so that it passes.\n"""\n\n####\n\nfrom typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx!= idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n\n\ndef test0():\n\tassert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n\n####\n\nfrom typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """ Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], \'a\')\n []\n >>> filter_by_substring([\'abc\', \'bacd\', \'cde\', \'array\'], \'a\')\n [\'abc\', \'bacd\', \'array\']\n """\n return [x for x in strings if substring in x]\n\n\ndef test1():\n\tassert filter_by_substring([], "john") == []\n\n####\n\nfrom typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups(\'( ) (( )) (( )( ))\')\n [\'()\', \'(())\', \'(()())\']\n """\n result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == \'(\':\n current_depth += 1\n current_string.append(c)\n elif c == \')\':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(\'\'.join(current_string))\n current_string.clear()\n\n return result\n\n\ndef test():\n\tassert separate_paren_groups("() (()) ((())) (((())))") ==' ) @@ -1564,8 +1564,9 @@ def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, @pytest.mark.parametrize('num_fewshot', [0, 2]) @pytest.mark.parametrize('generations_per_sample', [1]) @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') -def test_code_execution_prediction_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, - tiny_gpt2_model, tmp_path, generations_per_sample): +def test_code_execution_prediction_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, + tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path, + generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL')