From a49db7df6cff50f01bf970231376637d0145b0be Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 27 Oct 2023 23:34:31 +0000 Subject: [PATCH 001/116] extremely wip commit w/ ICLdataset class --- .../in_context_learning_evaluation.py | 325 ++++++++++++++---- 1 file changed, 249 insertions(+), 76 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index a7f87d95d1..1e913f18e8 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -102,8 +102,226 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in fewshot_idxs.add(replacement_sample) return fewshot_idxs +class InContextLearningDataset(Dataset): + def __init__( + self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + question_prelimiter: str, + fewshot_random_seed: int, + cot_delimiter: str = '', + ): + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self.pad_tok_id = pad_tok_id + self.padding_side = 'left' + self.max_answer_length = 0 + fewshot_rng = random.Random(fewshot_random_seed) + + self.samples = self._read_dataset(dataset_uri, destination_path) + self.samples = strip_data(self.samples) + self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, + continuation_delimiter, question_prelimiter, fewshot_rng, + cot_delimiter) + + def __getitem__(self, index): + return self.encoded_dataset[index] + + def __len__(self): + return len(self.encoded_dataset) + + def get_num_samples_in_batch(self, batch) -> int: + return batch['input_ids'].shape[0] + + def _read_dataset(self, dataset_uri, destination_path, **kwargs): + try: + from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] + except ImportError as e: + raise MissingConditionalImportError(extra_deps_group='nlp', + conda_package='datasets', + conda_channel='conda-forge') from e + if "hf://" in dataset_uri: + dataset_uri = dataset_uri.replace("hf://", "") + # TODO: I'm sure this is not correct + dataset = load_dataset(dataset_uri, split="train", **kwargs) + else: + with dist.local_rank_zero_download_and_wait(destination_path): + if dist.get_local_rank() == 0: + get_file(dataset_uri, destination_path, overwrite=True) + dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) + dataset = self._parse_dataset(dataset) + return dataset + + # abstractmethod? + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + result = [] + for example in dataset: + result.append({ + 'context': example['context'], + 'answer': example['answer'], + 'aliases': set([example['answer']] + example.get('aliases', [])), + 'chain_of_thought': example.get('chain_of_thought', '') + }) + + # self.samples = list( + # dataset.map(lambda examples: { + # 'continuation': examples['continuation'], + # 'context': examples['context'], + # })) + return result + + def generate_few_shot_text(self, num_fewshot: int, prompt_string: str, example_delimiter: str, + continuation_delimiter: str, question_prelimiter: str, cot_delimiter: str, + fewshot_rng: random.Random, sample_idx: int) -> str: + """Formats the prompt fewshot examples for test sample `sample_idx`. 
+ + Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format + them each as follows `{example_delimiter}{question_prelimiter}{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}`. + + `chain_of_thought` will default to empty if not present in the dataset but `context` and `answer` must be present. + + Returns the formatted prompt_string + concatenated list of formatted few shot examples. + """ + prompt_and_fewshot = prompt_string + + if num_fewshot > 0: + fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) + for fewshot_idx in fewshot_idxs: + context = self.samples[fewshot_idx]['context'] + chain_of_thought = self.samples[fewshot_idx].get('chain_of_thought', '') + answer = self.samples[fewshot_idx]['answer'] + + if len(chain_of_thought) == 0: + cot_delimiter = '' + # TODO: question_perlimiter needs a question - is it already in context? + context = f'{question_prelimiter}{context}' + # add the example delimiter between few_shot examples + if len(prompt_and_fewshot) > 0: + context = f'{example_delimiter}{context}' + prompt_and_fewshot += f'{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}' + + return prompt_and_fewshot + + def _prep_examples(self, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + question_prelimiter: str, + fewshot_rng: random.Random, + cot_delimiter: str = '') -> List[Dict[str, Any]]: + """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. + + Each task consists of a context and a continuation as well as an optional prompt and optional list of + example context/continuation pairs which precede the test context/continuation pair. + + Args: + num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair + prompt_string (str): The prompt to prepend to all inputs + example_delimiter (str): The delimiter used to separate each individual context/continuation pair + continuation_delimiter (str): The delimiter used to separate each context from its continuation + question_prelimiter (str): The text to prepend to each question + fewshot_rng (random.Random): Random number generator to use for fewshot sampling + cot_delimiter (str): The delimiter used to separate the chain-of-thought (if present) from the final model response. + + + Returns: + dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) + """ + examples = [] + for sample_idx in tqdm(range(len(self.samples))): -class InContextLearningQATaskDataset(Dataset): + # TODO: This will make the few_shot examples different for some examples + prompt_and_fewshot = self.add_few_shot(num_fewshot, prompt_string, example_delimiter, + continuation_delimiter, question_prelimiter, + cot_delimiter, fewshot_rng, sample_idx) + + ctxt = self.samples[sample_idx]['context'] + ctxt = f'{question_prelimiter}{ctxt}' + if len(prompt_and_fewshot) > 0: + ctxt = f'{example_delimiter}{ctxt}' + + # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output + continuation_delimiter_stripped = continuation_delimiter.rstrip() + ctxt = f'{ctxt}{continuation_delimiter_stripped}' + + encoded_example = {} + # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. 
OPT tokenizer) + encoded_example['preamble'] = self.tokenizer(prompt_and_fewshot) + # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt + if (self.tokenizer.eos_token_id is not None and + len(encoded_example['preamble']['input_ids']) > 1 and + encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id): + encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] + + encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) + encoded_example['aliases'] = list(self.samples[sample_idx]['aliases']) + encoded_example['cot_delimiter'] = cot_delimiter + examples.append(encoded_example) + + max_answer_length = self.get_max_answer_length(cot_delimiter) + # TODO: this is only a QA task thing + has_cot = self.check_for_cot() + + self.max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if has_cot else 0) + return examples + + def check_for_cot(self): + for sample in self.samples: + cot = sample.get('chain_of_thought', '') + if len(cot) > 0: + return True + return False + + def get_max_answer_length(self, cot_delimiter): + max_answer_length = 0 + for sample in self.samples: + for answer in sample['aliases']: + response = f"{sample['chain_of_thought']}{cot_delimiter}{answer}" + max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) + return max_answer_length + + # TODO: implement abc + # @abstractmethod + def collate_fn(self, data): + pass + + def split_batch(self, batch: Any, microbatch_size: int): + # Don't split kwargs that don't change + # Normally split torch tensors + # List split lists of strings + # no_split = ['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'] + # normal_split = ['input_ids', 'attention_mask'] + # list_split = [ + # 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', + # 'languages' + # ] + chunked = {} + for k, v in batch.items(): + if k in self.dont_split_keys: + # Defer broadcasting until we know num_chunks + pass + elif k in self.list_split_keys: + chunked[k] = _split_list(v, microbatch_size) + elif k in self.normal_split_keys: + chunked[k] = _default_split_batch(v, microbatch_size) + else: + raise ValueError(f'Unexpected key {k}') + num_chunks = len(chunked['input_ids']) + for k, v in batch.items(): + if isinstance(v, (int, float, str, bool, dict)): + chunked[k] = [v] * num_chunks + + return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] + +class InContextLearningQATaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning question answering evaluation The input format is expected to be a jsonl file with the following fields: @@ -128,7 +346,7 @@ class InContextLearningQATaskDataset(Dataset): fewshot_random_seed (int): Random seed to use for fewshot sampling """ - def _read_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: result = [] for example in dataset: result.append({ @@ -154,17 +372,7 @@ def __init__( fewshot_random_seed: int, cot_delimiter: str = '', ): - try: - from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge') from e - with dist.local_rank_zero_download_and_wait(destination_path): - if dist.get_local_rank() == 0: - get_file(dataset_uri, 
destination_path, overwrite=True) - dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - self.samples = self._read_dataset(dataset) + self.samples = self._read_dataset(dataset_uri, destination_path) self.samples = strip_data(self.samples) self.tokenizer = tokenizer self.max_seq_len = max_seq_len @@ -273,12 +481,6 @@ def _prep_examples(self, self.max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if has_cot else 0) return examples - def __getitem__(self, index): - return self.encoded_dataset[index] - - def __len__(self): - return len(self.encoded_dataset) - def collate_fn(self, data): inputs, answers = [], [] cot_delimiter = '' @@ -313,9 +515,6 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def get_num_samples_in_batch(self, batch) -> int: - return batch['input_ids'].shape[0] - def split_batch(self, batch: Any, microbatch_size: int): # Don't split kwargs that don't change # Normally split torch tensors @@ -341,7 +540,7 @@ def split_batch(self, batch: Any, microbatch_size: int): return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] -class InContextLearningLMTaskDataset(Dataset): +class InContextLearningLMTaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning language modeling evaluation Args: @@ -373,21 +572,12 @@ def __init__( destination_path: str, fewshot_random_seed: int, ): - try: - from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge') from e - with dist.local_rank_zero_download_and_wait(destination_path): - if dist.get_local_rank() == 0: - get_file(dataset_uri, destination_path, overwrite=True) - dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - self.samples = list( - dataset.map(lambda examples: { - 'continuation': examples['continuation'], - 'context': examples['context'], - })) + self.samples = self._read_dataset(dataset_uri, destination_path) + # self.samples = list( + # dataset.map(lambda examples: { + # 'continuation': examples['continuation'], + # 'context': examples['context'], + # })) self.samples = strip_data(self.samples) self.tokenizer = tokenizer @@ -397,10 +587,10 @@ def __init__( self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) - self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, + self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, fewshot_rng) - def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, + def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, fewshot_rng: random.Random): """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. 
@@ -421,8 +611,17 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: for sample_idx in tqdm(range(len(self.samples))): encoded_example = {} - preamble = prompt_string - + few_shot_text = self.generate_few_shot_text( + num_fewshot, + prompt_string, + example_delimiter, + continuation_delimiter, + question_prelimiter, + cot_delimiter, + fewshot_rng, + sample_idx + ) + preamble = prompt_string + few_shot_text if num_fewshot > 0: fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) for fewshot_idx in fewshot_idxs: @@ -441,7 +640,6 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: if self.prefix_space and not cont.startswith(' '): cont = f' {cont}' ctxt += continuation_delimiter_stripped - encoded_example['preamble'] = self.tokenizer( preamble ) # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) @@ -457,12 +655,6 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: return examples - def __getitem__(self, index): - return self.encoded_dataset[index] - - def __len__(self): - return len(self.encoded_dataset) - def collate_fn(self, data): inputs = [] continuation_indices = [] @@ -488,11 +680,8 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def get_num_samples_in_batch(self, batch) -> int: - return batch['input_ids'].shape[0] - -class InContextLearningMultipleChoiceTaskDataset(Dataset): +class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning multiple choice evaluation If each question has N answer choices, we construct N distinct inputs per question. In order to ensure @@ -564,10 +753,10 @@ def __init__( self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) - self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, + self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, fewshot_rng) - def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, + def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, fewshot_rng: random.Random): """Prepares a set of multiple choice questions into tokenized format with prompt and few shot examples. 
@@ -632,12 +821,6 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: return examples - def __getitem__(self, index): - return self.encoded_dataset[index] - - def __len__(self): - return len(self.encoded_dataset) - def collate_fn(self, data): inputs = [] continuation_indices = [] @@ -793,10 +976,10 @@ def __init__( self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) - self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, + self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, fewshot_rng) - def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, + def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, fewshot_rng: random.Random): """Prepares a set of schema questions into tokenized format with prompt and few shot examples. Each question consists of a set of possible contexts followed by a continuation, only one of the contexts would logically permit the continuation. @@ -900,7 +1083,7 @@ def collate_fn(self, data): return batch -class InContextLearningCodeEvalDataset(Dataset): +class InContextLearningCodeEvalDataset(InContextLearningDataset): """ A dataset that constructs batches for in-context learning code evaluation The input format is expected to be a jsonl file with the following fields: @@ -987,10 +1170,10 @@ def __init__( self.top_p = top_p self.top_k = top_k fewshot_rng = random.Random(fewshot_random_seed) - self.encoded_dataset = self.prep_examples(num_fewshot, prompt_string, example_delimiter, code_prelimiter, + self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, code_prelimiter, fewshot_rng) - def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, code_prelimiter: str, + def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, code_prelimiter: str, fewshot_rng: random.Random): """Prepares a set of code evaluation tasks into tokenized format with prompt and fewshot examples. 
@@ -1054,12 +1237,6 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: self.max_prompt_length = max_prompt_length return examples - def __getitem__(self, index): - return self.encoded_dataset[index] - - def __len__(self): - return len(self.encoded_dataset) - def collate_fn(self, data): inputs, prompts, tests, canonical_solutions, entry_points, test_inputs, test_outputs, languages = [], [], [], [], [], [], [], [] for sample in data: @@ -1115,10 +1292,6 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def get_num_samples_in_batch(self, batch) -> int: - # Count number of inputs in the batch - return batch['input_ids'].shape[0] - def split_batch(self, batch: Any, microbatch_size: int): # Don't split kwargs that don't change # Normally split torch tensors From 914f2071cd720a5f27e091b2839e0330f2a85e2f Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 30 Oct 2023 23:19:23 +0000 Subject: [PATCH 002/116] more extremely broken wip --- .../in_context_learning_evaluation.py | 145 ++++++++---------- 1 file changed, 68 insertions(+), 77 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 1e913f18e8..105b7bc9b5 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -116,7 +116,6 @@ def __init__( destination_path: str, question_prelimiter: str, fewshot_random_seed: int, - cot_delimiter: str = '', ): self.tokenizer = tokenizer self.max_seq_len = max_seq_len @@ -124,12 +123,14 @@ def __init__( self.padding_side = 'left' self.max_answer_length = 0 fewshot_rng = random.Random(fewshot_random_seed) + self.example_delimiter = example_delimiter + self.continuation_delimiter = continuation_delimiter + self.question_prelimiter = question_prelimiter self.samples = self._read_dataset(dataset_uri, destination_path) self.samples = strip_data(self.samples) - self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, - continuation_delimiter, question_prelimiter, fewshot_rng, - cot_delimiter) + self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, fewshot_rng) + # TODO: define split keys def __getitem__(self, index): return self.encoded_dataset[index] @@ -177,9 +178,10 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: # })) return result - def generate_few_shot_text(self, num_fewshot: int, prompt_string: str, example_delimiter: str, - continuation_delimiter: str, question_prelimiter: str, cot_delimiter: str, - fewshot_rng: random.Random, sample_idx: int) -> str: + def generate_few_shot_text(self, + num_fewshot: int, + sample_idx: int, + question_prelimiter: str = "") -> str: """Formats the prompt fewshot examples for test sample `sample_idx`. Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format @@ -189,34 +191,45 @@ def generate_few_shot_text(self, num_fewshot: int, prompt_string: str, example_d Returns the formatted prompt_string + concatenated list of formatted few shot examples. 
""" - prompt_and_fewshot = prompt_string + few_shot_text = '' if num_fewshot > 0: fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) for fewshot_idx in fewshot_idxs: - context = self.samples[fewshot_idx]['context'] - chain_of_thought = self.samples[fewshot_idx].get('chain_of_thought', '') + ctxt = self.samples[fewshot_idx]['context'] answer = self.samples[fewshot_idx]['answer'] - - if len(chain_of_thought) == 0: - cot_delimiter = '' - # TODO: question_perlimiter needs a question - is it already in context? - context = f'{question_prelimiter}{context}' - # add the example delimiter between few_shot examples - if len(prompt_and_fewshot) > 0: - context = f'{example_delimiter}{context}' - prompt_and_fewshot += f'{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}' - - return prompt_and_fewshot + ctxt = self.get_context(ctxt, few_shot_text) + few_shot_text += f'{ctxt}{self.continuation_delimiter}{answer}' + + return few_shot_text + + def get_context(self, ctxt, prompt_and_fewshot: str = ""): + ctxt = f'{self.question_prelimiter}{ctxt}' + if len(prompt_and_fewshot) > 0: + ctxt = f'{self.example_delimiter}{ctxt}' + return ctxt + + def fix_eos_on_preamble(self, preamble): + # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) + # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt + if (self.tokenizer.eos_token_id is not None and + len(preamble['input_ids']) > 1 and + preamble['input_ids'][-1] == self.tokenizer.eos_token_id): + preamble['input_ids'] = preamble['input_ids'][:-1] + return preamble + + def tokenize_example(self, prompt_and_fewshot, ctxt): + tokenized_example = {} + preamble = self.tokenizer(prompt_and_fewshot) + preamble = self.fix_eos_on_preamble(preamble) + tokenized_example['preamble'] = preamble + tokenized_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) + return tokenized_example def _prep_examples(self, num_fewshot: int, prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - question_prelimiter: str, - fewshot_rng: random.Random, - cot_delimiter: str = '') -> List[Dict[str, Any]]: + fewshot_rng: random.Random) -> List[Dict[str, Any]]: """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. Each task consists of a context and a continuation as well as an optional prompt and optional list of @@ -237,57 +250,18 @@ def _prep_examples(self, """ examples = [] for sample_idx in tqdm(range(len(self.samples))): + few_shot_text = self.generate_few_shot_text(num_fewshot, prompt_string, fewshot_rng, sample_idx) + prompt_and_fewshot = prompt_string + few_shot_text - # TODO: This will make the few_shot examples different for some examples - prompt_and_fewshot = self.add_few_shot(num_fewshot, prompt_string, example_delimiter, - continuation_delimiter, question_prelimiter, - cot_delimiter, fewshot_rng, sample_idx) - - ctxt = self.samples[sample_idx]['context'] - ctxt = f'{question_prelimiter}{ctxt}' - if len(prompt_and_fewshot) > 0: - ctxt = f'{example_delimiter}{ctxt}' - + ctxt = self.get_context(self.samples[sample_idx]['context'], prompt_and_fewshot) + # TODO: put this in get_context? 
# rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = continuation_delimiter.rstrip() + continuation_delimiter_stripped = self.continuation_delimiter.rstrip() ctxt = f'{ctxt}{continuation_delimiter_stripped}' - - encoded_example = {} - # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) - encoded_example['preamble'] = self.tokenizer(prompt_and_fewshot) - # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt - if (self.tokenizer.eos_token_id is not None and - len(encoded_example['preamble']['input_ids']) > 1 and - encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id): - encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] - - encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) - encoded_example['aliases'] = list(self.samples[sample_idx]['aliases']) - encoded_example['cot_delimiter'] = cot_delimiter - examples.append(encoded_example) - - max_answer_length = self.get_max_answer_length(cot_delimiter) - # TODO: this is only a QA task thing - has_cot = self.check_for_cot() - - self.max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if has_cot else 0) + tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt) + examples.append(tokenized_example) return examples - def check_for_cot(self): - for sample in self.samples: - cot = sample.get('chain_of_thought', '') - if len(cot) > 0: - return True - return False - - def get_max_answer_length(self, cot_delimiter): - max_answer_length = 0 - for sample in self.samples: - for answer in sample['aliases']: - response = f"{sample['chain_of_thought']}{cot_delimiter}{answer}" - max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) - return max_answer_length - # TODO: implement abc # @abstractmethod def collate_fn(self, data): @@ -481,6 +455,28 @@ def _prep_examples(self, self.max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if has_cot else 0) return examples + # max_answer_length = self.get_max_answer_length(cot_delimiter) + # # TODO: this is only a QA task thing + # # has_cot = self.check_for_cot() + + # self.max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if has_cot else 0) + + # def check_for_cot(self): + # for sample in self.samples: + # cot = sample.get('chain_of_thought', '') + # if len(cot) > 0: + # return True + # return False + + # def get_max_answer_length(self, cot_delimiter): + # max_answer_length = 0 + # for sample in self.samples: + # for answer in sample['aliases']: + # response = f"{sample['chain_of_thought']}{cot_delimiter}{answer}" + # max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) + # return max_answer_length + + def collate_fn(self, data): inputs, answers = [], [] cot_delimiter = '' @@ -579,7 +575,6 @@ def __init__( # 'context': examples['context'], # })) self.samples = strip_data(self.samples) - self.tokenizer = tokenizer self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id @@ -616,8 +611,6 @@ def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter prompt_string, example_delimiter, continuation_delimiter, - question_prelimiter, - cot_delimiter, fewshot_rng, sample_idx ) @@ -650,7 +643,6 @@ def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter 
encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) encoded_example['continuation'] = self.tokenizer(cont, add_special_tokens=False) - examples.append(encoded_example) return examples @@ -1000,7 +992,6 @@ def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter examples = [] for sample_idx in tqdm(range(len(self.samples))): - preamble = prompt_string if num_fewshot > 0: fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) From 330a97ed635d29e76dbe05f32954d45f751fbe71 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 3 Nov 2023 23:48:49 +0000 Subject: [PATCH 003/116] add split keys --- composer/datasets/in_context_learning_evaluation.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 105b7bc9b5..33ea8df0e1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -116,6 +116,9 @@ def __init__( destination_path: str, question_prelimiter: str, fewshot_random_seed: int, + dont_split_keys: List[str], + normal_split_keys: List[str], + list_split_keys: List[str], ): self.tokenizer = tokenizer self.max_seq_len = max_seq_len @@ -130,7 +133,10 @@ def __init__( self.samples = self._read_dataset(dataset_uri, destination_path) self.samples = strip_data(self.samples) self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, fewshot_rng) - # TODO: define split keys + + self.dont_split_keys = dont_split_keys + self.normal_split_keys = normal_split_keys + self.list_split_keys = list_split_keys def __getitem__(self, index): return self.encoded_dataset[index] From 453df28169b26deeaee0d48db66f2403be5f59d0 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 6 Nov 2023 22:10:24 +0000 Subject: [PATCH 004/116] first pass at moving QA to new format --- .../in_context_learning_evaluation.py | 215 +++++------------- 1 file changed, 61 insertions(+), 154 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 33ea8df0e1..459ea0c8d1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -114,11 +114,10 @@ def __init__( example_delimiter: str, continuation_delimiter: str, destination_path: str, - question_prelimiter: str, fewshot_random_seed: int, - dont_split_keys: List[str], - normal_split_keys: List[str], - list_split_keys: List[str], + dont_split_keys: List[str] = None, + normal_split_keys: List[str] = None, + list_split_keys: List[str] = None, ): self.tokenizer = tokenizer self.max_seq_len = max_seq_len @@ -128,7 +127,6 @@ def __init__( fewshot_rng = random.Random(fewshot_random_seed) self.example_delimiter = example_delimiter self.continuation_delimiter = continuation_delimiter - self.question_prelimiter = question_prelimiter self.samples = self._read_dataset(dataset_uri, destination_path) self.samples = strip_data(self.samples) @@ -186,8 +184,8 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: def generate_few_shot_text(self, num_fewshot: int, - sample_idx: int, - question_prelimiter: str = "") -> str: + sample_idx: int, + fewshot_rng: random.Random) -> str: """Formats the prompt fewshot examples for test sample `sample_idx`. 
Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format @@ -210,9 +208,12 @@ def generate_few_shot_text(self, return few_shot_text def get_context(self, ctxt, prompt_and_fewshot: str = ""): - ctxt = f'{self.question_prelimiter}{ctxt}' + # ctxt = f'{self.question_prelimiter}{ctxt}' if len(prompt_and_fewshot) > 0: ctxt = f'{self.example_delimiter}{ctxt}' + # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output + continuation_delimiter_stripped = self.continuation_delimiter.rstrip() + ctxt = f'{ctxt}{continuation_delimiter_stripped}' return ctxt def fix_eos_on_preamble(self, preamble): @@ -246,7 +247,6 @@ def _prep_examples(self, prompt_string (str): The prompt to prepend to all inputs example_delimiter (str): The delimiter used to separate each individual context/continuation pair continuation_delimiter (str): The delimiter used to separate each context from its continuation - question_prelimiter (str): The text to prepend to each question fewshot_rng (random.Random): Random number generator to use for fewshot sampling cot_delimiter (str): The delimiter used to separate the chain-of-thought (if present) from the final model response. @@ -256,17 +256,17 @@ def _prep_examples(self, """ examples = [] for sample_idx in tqdm(range(len(self.samples))): - few_shot_text = self.generate_few_shot_text(num_fewshot, prompt_string, fewshot_rng, sample_idx) + few_shot_text = self.generate_few_shot_text(num_fewshot, fewshot_rng, sample_idx) prompt_and_fewshot = prompt_string + few_shot_text ctxt = self.get_context(self.samples[sample_idx]['context'], prompt_and_fewshot) - # TODO: put this in get_context? - # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = self.continuation_delimiter.rstrip() - ctxt = f'{ctxt}{continuation_delimiter_stripped}' tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt) + tokenized_example = self.additional_processing_for_example(tokenized_example, sample_idx) examples.append(tokenized_example) return examples + + def additional_processing_for_example(self, tokenized_example, sample_idx): + return tokenized_example # TODO: implement abc # @abstractmethod @@ -339,34 +339,30 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: def __init__( self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, question_prelimiter: str, - fewshot_random_seed: int, cot_delimiter: str = '', + *args, + **kwargs ): - self.samples = self._read_dataset(dataset_uri, destination_path) - self.samples = strip_data(self.samples) - self.tokenizer = tokenizer - self.max_seq_len = max_seq_len - self.pad_tok_id = pad_tok_id - self.padding_side = 'left' - self.max_answer_length = 0 - fewshot_rng = random.Random(fewshot_random_seed) - self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, - continuation_delimiter, question_prelimiter, fewshot_rng, - cot_delimiter) + super().__init__(*args, **kwargs) + self.question_prelimiter = question_prelimiter + self.cot_delimiter = cot_delimiter + self.max_answer_length = self.get_max_answer_length() + self.set_split_keys() + + def set_split_keys(self): + if not self.dont_split_keys: + 
self.dont_split_keys = ['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'] + if not self.normal_split_keys: + self.normal_split_keys = ['input_ids', 'attention_mask'] + if not self.list_split_keys: + self.list_split_keys = ['labels'] - def _format_prompt_and_fewshot(self, num_fewshot: int, prompt_string: str, example_delimiter: str, - continuation_delimiter: str, question_prelimiter: str, cot_delimiter: str, - fewshot_rng: random.Random, sample_idx: int) -> str: + + def generate_few_shot_text(self, + num_fewshot: int, + fewshot_rng: random.Random, + sample_idx: int) -> str: """Formats the prompt fewshot examples for test sample `sample_idx`. Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format @@ -376,112 +372,47 @@ def _format_prompt_and_fewshot(self, num_fewshot: int, prompt_string: str, examp Returns the formatted prompt_string + concatenated list of formatted few shot examples. """ - prompt_and_fewshot = prompt_string + prompt_and_fewshot = '' if num_fewshot > 0: fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) for fewshot_idx in fewshot_idxs: - context = self.samples[fewshot_idx]['context'] chain_of_thought = self.samples[fewshot_idx].get('chain_of_thought', '') answer = self.samples[fewshot_idx]['answer'] - if len(chain_of_thought) == 0: cot_delimiter = '' - context = f'{question_prelimiter}{context}' - if len(prompt_and_fewshot) > 0: - context = f'{example_delimiter}{context}' - prompt_and_fewshot += f'{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}' + else: + cot_delimiter = self.cot_delimiter + + context = self.samples[fewshot_idx]['context'] + # TODO: might _not_ want to rstrip cont_delim in get_context here + context = self.get_context(context, prompt_and_fewshot=prompt_and_fewshot) + prompt_and_fewshot += f'{context}{self.continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}' return prompt_and_fewshot - def _prep_examples(self, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - question_prelimiter: str, - fewshot_rng: random.Random, - cot_delimiter: str = '') -> List[Dict[str, Any]]: - """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. - - Each task consists of a context and a continuation as well as an optional prompt and optional list of - example context/continuation pairs which precede the test context/continuation pair. - - Args: - num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair - prompt_string (str): The prompt to prepend to all inputs - example_delimiter (str): The delimiter used to separate each individual context/continuation pair - continuation_delimiter (str): The delimiter used to separate each context from its continuation - question_prelimiter (str): The text to prepend to each question - fewshot_rng (random.Random): Random number generator to use for fewshot sampling - cot_delimiter (str): The delimiter used to separate the chain-of-thought (if present) from the final model response. 
- + def get_context(self, ctxt, prompt_and_fewshot: str = ""): + ctxt = f'{self.question_prelimiter}{ctxt}' + if len(prompt_and_fewshot) > 0: + ctxt = f'{self.example_delimiter}{ctxt}' + # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output + continuation_delimiter_stripped = self.continuation_delimiter.rstrip() + ctxt = f'{ctxt}{continuation_delimiter_stripped}' + return ctxt + + def addiontional_processing_for_example(self, tokenized_example, sample_idx): + tokenized_example['aliases'] = list(self.samples[sample_idx]['aliases']) + tokenized_example['cot_delimiter'] = self.cot_delimiter + return tokenized_example - Returns: - dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) - """ + def get_max_answer_length(self): max_answer_length = 0 - has_cot = False - examples = [] - for sample_idx in tqdm(range(len(self.samples))): - encoded_example = {} - - prompt_and_fewshot = self._format_prompt_and_fewshot(num_fewshot, prompt_string, example_delimiter, - continuation_delimiter, question_prelimiter, - cot_delimiter, fewshot_rng, sample_idx) - - ctxt = self.samples[sample_idx]['context'] - ctxt = f'{question_prelimiter}{ctxt}' - if len(prompt_and_fewshot) > 0: - ctxt = f'{example_delimiter}{ctxt}' - - # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = continuation_delimiter.rstrip() - ctxt = f'{ctxt}{continuation_delimiter_stripped}' - - # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) - encoded_example['preamble'] = self.tokenizer(prompt_and_fewshot) - # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt - if self.tokenizer.eos_token_id is not None and len( - encoded_example['preamble'] - ['input_ids']) > 1 and encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id: - encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] - - encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) - encoded_example['aliases'] = list(self.samples[sample_idx]['aliases']) - encoded_example['cot_delimiter'] = cot_delimiter - examples.append(encoded_example) - for answer in self.samples[sample_idx]['aliases']: - response = f"{self.samples[sample_idx]['chain_of_thought']}{cot_delimiter}{answer}" + for sample in self.samples: + for answer in sample['aliases']: + response = f"{sample['chain_of_thought']}{self.cot_delimiter}{answer}" max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) - - if len(self.samples[sample_idx]['chain_of_thought']) > 0: - has_cot = True - - self.max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if has_cot else 0) - return examples - - # max_answer_length = self.get_max_answer_length(cot_delimiter) - # # TODO: this is only a QA task thing - # # has_cot = self.check_for_cot() - - # self.max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if has_cot else 0) - - # def check_for_cot(self): - # for sample in self.samples: - # cot = sample.get('chain_of_thought', '') - # if len(cot) > 0: - # return True - # return False - - # def get_max_answer_length(self, cot_delimiter): - # max_answer_length = 0 - # for sample in self.samples: - # for answer in sample['aliases']: - # response = f"{sample['chain_of_thought']}{cot_delimiter}{answer}" - # max_answer_length 
= max(max_answer_length, len(self.tokenizer(response)['input_ids'])) - # return max_answer_length - + max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) + return max_answer_length def collate_fn(self, data): inputs, answers = [], [] @@ -517,30 +448,6 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def split_batch(self, batch: Any, microbatch_size: int): - # Don't split kwargs that don't change - # Normally split torch tensors - # List split lists of strings - no_split = ['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'] - normal_split = ['input_ids', 'attention_mask'] - list_split = ['labels'] - chunked = {} - for k, v in batch.items(): - if k in no_split: - # Defer broadcasting until we know num_chunks - pass - elif k in list_split: - chunked[k] = _split_list(v, microbatch_size) - elif k in normal_split: - chunked[k] = _default_split_batch(v, microbatch_size) - else: - raise ValueError(f'Unexpected key {k}') - num_chunks = len(chunked['input_ids']) - for k, v in batch.items(): - if isinstance(v, (int, float, str, bool, dict)): - chunked[k] = [v] * num_chunks - return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] - class InContextLearningLMTaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning language modeling evaluation From 89cf3f4e5b247c12ba80ecf6b1dbd73310f2fbd0 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 8 Nov 2023 02:50:17 +0000 Subject: [PATCH 005/116] linting --- .../in_context_learning_evaluation.py | 801 ++++++------------ 1 file changed, 245 insertions(+), 556 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 459ea0c8d1..a1f620c255 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -102,7 +102,9 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in fewshot_idxs.add(replacement_sample) return fewshot_idxs + class InContextLearningDataset(Dataset): + def __init__( self, dataset_uri: str, @@ -115,128 +117,137 @@ def __init__( continuation_delimiter: str, destination_path: str, fewshot_random_seed: int, - dont_split_keys: List[str] = None, - normal_split_keys: List[str] = None, - list_split_keys: List[str] = None, + icl_hf_loading_vars: dict = {}, + icl_hf_parsing_vars: dict = {}, + context_key: str = 'context', + answer_key: str = 'answer', + prelimiter: str = '', + dont_split_keys: List[str] = [], + normal_split_keys: List[str] = [], + list_split_keys: List[str] = [], ): self.tokenizer = tokenizer + self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) + self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id self.padding_side = 'left' - self.max_answer_length = 0 - fewshot_rng = random.Random(fewshot_random_seed) + + self.prelimiter = prelimiter self.example_delimiter = example_delimiter - self.continuation_delimiter = continuation_delimiter + self.continuation_delimiter = continuation_delimiter + self.context_key = context_key + self.answer_key = answer_key - self.samples = self._read_dataset(dataset_uri, destination_path) + self.samples = self._read_dataset(dataset_uri, destination_path, icl_hf_loading_vars, icl_hf_parsing_vars) self.samples = strip_data(self.samples) + + fewshot_rng = random.Random(fewshot_random_seed) self.encoded_dataset = 
self._prep_examples(num_fewshot, prompt_string, fewshot_rng) self.dont_split_keys = dont_split_keys self.normal_split_keys = normal_split_keys self.list_split_keys = list_split_keys - def __getitem__(self, index): + def __getitem__(self, index: int): return self.encoded_dataset[index] def __len__(self): return len(self.encoded_dataset) - def get_num_samples_in_batch(self, batch) -> int: + def get_num_samples_in_batch(self, batch: dict) -> int: return batch['input_ids'].shape[0] - def _read_dataset(self, dataset_uri, destination_path, **kwargs): + def _read_dataset(self, + dataset_uri: str, + destination_path: str, + icl_hf_loading_vars: dict = None, + icl_hf_parsing_vars: dict = None): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='datasets', conda_channel='conda-forge') from e - if "hf://" in dataset_uri: - dataset_uri = dataset_uri.replace("hf://", "") - # TODO: I'm sure this is not correct - dataset = load_dataset(dataset_uri, split="train", **kwargs) + if 'hf://' in dataset_uri: + dataset_uri = dataset_uri.replace('hf://', '') + dataset = load_dataset(dataset_uri, split='train', **icl_hf_loading_vars) else: with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - dataset = self._parse_dataset(dataset) + if icl_hf_parsing_vars: + dataset = self._parse_hf_dataset(dataset, icl_hf_parsing_vars) + else: + dataset = self._parse_dataset(dataset) + return dataset + + def _parse_hf_dataset(self, dataset, icl_hf_parsing_vars): + dataset = dataset.map( + lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in icl_hf_parsing_vars}) return dataset - # abstractmethod? def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - result = [] - for example in dataset: - result.append({ - 'context': example['context'], - 'answer': example['answer'], - 'aliases': set([example['answer']] + example.get('aliases', [])), - 'chain_of_thought': example.get('chain_of_thought', '') - }) - - # self.samples = list( - # dataset.map(lambda examples: { - # 'continuation': examples['continuation'], - # 'context': examples['context'], - # })) - return result - - def generate_few_shot_text(self, - num_fewshot: int, - sample_idx: int, - fewshot_rng: random.Random) -> str: + return list( + dataset.map(lambda examples: { + self.context_key: examples['context'], + self.answer_key: examples['answer'], + })) + + def generate_few_shot_text(self, num_fewshot: int, sample_idx: int, preamble: str, + fewshot_rng: random.Random) -> str: """Formats the prompt fewshot examples for test sample `sample_idx`. Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format - them each as follows `{example_delimiter}{question_prelimiter}{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}`. + them each as follows `{example_delimiter}{prelimiter}{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}`. `chain_of_thought` will default to empty if not present in the dataset but `context` and `answer` must be present. Returns the formatted prompt_string + concatenated list of formatted few shot examples. 
""" - few_shot_text = '' + few_shot_text = preamble if num_fewshot > 0: fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) for fewshot_idx in fewshot_idxs: - ctxt = self.samples[fewshot_idx]['context'] - answer = self.samples[fewshot_idx]['answer'] - ctxt = self.get_context(ctxt, few_shot_text) - few_shot_text += f'{ctxt}{self.continuation_delimiter}{answer}' + ctxt = self.construct_context(self.samples[fewshot_idx], few_shot_text, add_answer=True) + few_shot_text += ctxt return few_shot_text - def get_context(self, ctxt, prompt_and_fewshot: str = ""): - # ctxt = f'{self.question_prelimiter}{ctxt}' - if len(prompt_and_fewshot) > 0: - ctxt = f'{self.example_delimiter}{ctxt}' - # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = self.continuation_delimiter.rstrip() - ctxt = f'{ctxt}{continuation_delimiter_stripped}' + def construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False): + ctxt = sample[self.context_key] + ctxt = f'{self.prelimiter}{ctxt}' + if len(preceding_text) > 0: + ctxt = f'{preceding_text}{self.example_delimiter}{ctxt}' + ctxt = f'{ctxt}{self.continuation_delimiter}' + if add_answer: + ctxt = f'{ctxt}{self.get_answer_from_sample(sample)}' return ctxt - def fix_eos_on_preamble(self, preamble): + def get_answer_from_sample(self, sample: dict): + return sample[self.answer_key] + + def fix_eos_on_preamble(self, preamble: dict): # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt - if (self.tokenizer.eos_token_id is not None and - len(preamble['input_ids']) > 1 and - preamble['input_ids'][-1] == self.tokenizer.eos_token_id): + if (self.tokenizer.eos_token_id is not None and len(preamble['input_ids']) > 1 and + preamble['input_ids'][-1] == self.tokenizer.eos_token_id): preamble['input_ids'] = preamble['input_ids'][:-1] - return preamble + return preamble - def tokenize_example(self, prompt_and_fewshot, ctxt): + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str): tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self.fix_eos_on_preamble(preamble) - tokenized_example['preamble'] = preamble - tokenized_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) + tokenized_example['preamble'] = preamble + # rstrip context because a prompt ending in a space results in degenerate output + #TODO: use diff key for this? + tokenized_example['context'] = self.tokenizer(ctxt.rstrip(), add_special_tokens=False) return tokenized_example - def _prep_examples(self, - num_fewshot: int, - prompt_string: str, - fewshot_rng: random.Random) -> List[Dict[str, Any]]: + def _prep_examples(self, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random) -> List[Dict[str, Any]]: """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. 
Each task consists of a context and a continuation as well as an optional prompt and optional list of @@ -256,20 +267,16 @@ def _prep_examples(self, """ examples = [] for sample_idx in tqdm(range(len(self.samples))): - few_shot_text = self.generate_few_shot_text(num_fewshot, fewshot_rng, sample_idx) - prompt_and_fewshot = prompt_string + few_shot_text - - ctxt = self.get_context(self.samples[sample_idx]['context'], prompt_and_fewshot) + prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, sample_idx, prompt_string, fewshot_rng) + ctxt = self.construct_context(self.samples[sample_idx], prompt_and_fewshot, add_answer=False) tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt) - tokenized_example = self.additional_processing_for_example(tokenized_example, sample_idx) + tokenized_example = self.additional_processing_for_example(tokenized_example, self.samples[sample_idx]) examples.append(tokenized_example) return examples - def additional_processing_for_example(self, tokenized_example, sample_idx): + def additional_processing_for_example(self, tokenized_example: dict, sample: dict): return tokenized_example - - # TODO: implement abc - # @abstractmethod + def collate_fn(self, data): pass @@ -301,6 +308,7 @@ def split_batch(self, batch: Any, microbatch_size: int): return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] + class InContextLearningQATaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning question answering evaluation @@ -322,93 +330,48 @@ class InContextLearningQATaskDataset(InContextLearningDataset): example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: ') destination_path (str): Temporary path to store downloaded datasets - question_prelimiter (str): String to put before each question (e.g. 'Q: ') + prelimiter (str): String to put before each question (e.g. 'Q: ') fewshot_random_seed (int): Random seed to use for fewshot sampling """ - def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - result = [] - for example in dataset: - result.append({ - 'context': example['context'], - 'answer': example['answer'], - 'aliases': set([example['answer']] + example.get('aliases', [])), - 'chain_of_thought': example.get('chain_of_thought', '') - }) - return result - - def __init__( - self, - question_prelimiter: str, - cot_delimiter: str = '', - *args, - **kwargs - ): - super().__init__(*args, **kwargs) - self.question_prelimiter = question_prelimiter + def __init__(self, cot_delimiter: str = '', *args, **kwargs): + super().__init__(dont_split_keys=['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'], + normal_split_keys=['input_ids', 'attention_mask'], + list_split_keys=['labels'], + *args, + **kwargs) self.cot_delimiter = cot_delimiter self.max_answer_length = self.get_max_answer_length() - self.set_split_keys() - - def set_split_keys(self): - if not self.dont_split_keys: - self.dont_split_keys = ['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'] - if not self.normal_split_keys: - self.normal_split_keys = ['input_ids', 'attention_mask'] - if not self.list_split_keys: - self.list_split_keys = ['labels'] - - - def generate_few_shot_text(self, - num_fewshot: int, - fewshot_rng: random.Random, - sample_idx: int) -> str: - """Formats the prompt fewshot examples for test sample `sample_idx`. 
- - Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format - them each as follows `{example_delimiter}{question_prelimiter}{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}`. - `chain_of_thought` will default to empty if not present in the dataset but `context` and `answer` must be present. + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + return list( + dataset.map( + lambda examples: { + 'context': examples['context'], + 'answer': examples['answer'], + 'aliases': set([examples['answer']] + examples.get('aliases', [])), + 'chain_of_thought': examples.get('chain_of_thought', '') + })) - Returns the formatted prompt_string + concatenated list of formatted few shot examples. - """ - prompt_and_fewshot = '' + def get_answer_from_sample(self, sample): + # If we add the answer, we need to also add COT + chain_of_thought = sample.get('chain_of_thought', '') + if len(chain_of_thought) == 0: + cot_delimiter = '' + else: + cot_delimiter = self.cot_delimiter + return f'{self.continuation_delimiter}{chain_of_thought}{cot_delimiter}{sample[self.answer_key]}' - if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) - for fewshot_idx in fewshot_idxs: - chain_of_thought = self.samples[fewshot_idx].get('chain_of_thought', '') - answer = self.samples[fewshot_idx]['answer'] - if len(chain_of_thought) == 0: - cot_delimiter = '' - else: - cot_delimiter = self.cot_delimiter - - context = self.samples[fewshot_idx]['context'] - # TODO: might _not_ want to rstrip cont_delim in get_context here - context = self.get_context(context, prompt_and_fewshot=prompt_and_fewshot) - prompt_and_fewshot += f'{context}{self.continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}' - - return prompt_and_fewshot - - def get_context(self, ctxt, prompt_and_fewshot: str = ""): - ctxt = f'{self.question_prelimiter}{ctxt}' - if len(prompt_and_fewshot) > 0: - ctxt = f'{self.example_delimiter}{ctxt}' - # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = self.continuation_delimiter.rstrip() - ctxt = f'{ctxt}{continuation_delimiter_stripped}' - return ctxt - - def addiontional_processing_for_example(self, tokenized_example, sample_idx): - tokenized_example['aliases'] = list(self.samples[sample_idx]['aliases']) + def additional_processing_for_example(self, tokenized_example: dict, sample: dict): + tokenized_example['aliases'] = list(sample['aliases']) tokenized_example['cot_delimiter'] = self.cot_delimiter - return tokenized_example + return tokenized_example def get_max_answer_length(self): max_answer_length = 0 for sample in self.samples: for answer in sample['aliases']: + # DRY says I should use get_answer_from_sample somehow here response = f"{sample['chain_of_thought']}{self.cot_delimiter}{answer}" max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) @@ -468,97 +431,22 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): fewshot_random_seed (int): Random seed used to select fewshot examples """ - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: 
str, - continuation_delimiter: str, - destination_path: str, - fewshot_random_seed: int, - ): - self.samples = self._read_dataset(dataset_uri, destination_path) - # self.samples = list( - # dataset.map(lambda examples: { - # 'continuation': examples['continuation'], - # 'context': examples['context'], - # })) - self.samples = strip_data(self.samples) - self.tokenizer = tokenizer - self.max_seq_len = max_seq_len - self.pad_tok_id = pad_tok_id - fewshot_rng = random.Random(fewshot_random_seed) - - self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) - - self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, - fewshot_rng) - - def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, - fewshot_rng: random.Random): - """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. - - Each task consists of a context and a continuation as well as an optional prompt and optional list of - example context/continuation pairs which precede the test context/continuation pair. - - Args: - num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair - prompt_string (str): The prompt to prepend to all inputs - example_delimiter (str): The delimiter used to separate each individual context/continuation pair - continuation_delimiter (str): The delimiter used to separate each context from its continuation - fewshot_rng (random.Random): Random number generator used to select fewshot examples + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) - Returns: - dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) - """ - examples = [] - for sample_idx in tqdm(range(len(self.samples))): - encoded_example = {} - - few_shot_text = self.generate_few_shot_text( - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - fewshot_rng, - sample_idx - ) - preamble = prompt_string + few_shot_text - if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) - for fewshot_idx in fewshot_idxs: - ctxt, cont = self.samples[fewshot_idx]['context'], self.samples[fewshot_idx]['continuation'] - if len(preamble) > 0: - ctxt = f'{example_delimiter}{ctxt}' - preamble += f'{ctxt}{continuation_delimiter}{cont}' - - ctxt, cont = self.samples[sample_idx]['context'], self.samples[sample_idx]['continuation'] - if len(preamble) > 0: - ctxt = f'{example_delimiter}{ctxt}' - - # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = continuation_delimiter.rstrip() - - if self.prefix_space and not cont.startswith(' '): - cont = f' {cont}' - ctxt += continuation_delimiter_stripped - encoded_example['preamble'] = self.tokenizer( - preamble - ) # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. 
OPT tokenizer) - if self.tokenizer.eos_token_id is not None and len( - encoded_example['preamble'] - ['input_ids']) > 1 and encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id: - encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] - - encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) - encoded_example['continuation'] = self.tokenizer(cont, add_special_tokens=False) - examples.append(encoded_example) + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + return list( + dataset.map(lambda examples: { + 'continuation': examples['continuation'], + 'context': examples['context'], + })) - return examples + def additional_processing_for_example(self, tokenized_example: dict, sample: dict): + cont = sample['continuation'] + if self.prefix_space and not cont.startswith(' '): + cont = f' {cont}' + tokenized_example['continuation'] = self.tokenizer(cont, add_special_tokens=False) + return tokenized_example def collate_fn(self, data): inputs = [] @@ -618,113 +506,38 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): fewshot_random_seed (int): Random seed used to select fewshot examples """ - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - fewshot_random_seed: int, - ): - try: - from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge') from e + def __init__(self, *args, **kwargs): + super().__init__(dont_split_keys=['mode'], + real_split_keys=['input_ids', 'labels', 'attention_mask'], + normal_split_key=['gold_indices'], + *args, + **kwargs) + self.num_choices = len(self.samples[0]['choices']) + self.context_key = 'query' + # self.dont_split_keys = ['mode'] + # self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] + # self.normal_split_keys = ['gold_indices'] - with dist.local_rank_zero_download_and_wait(destination_path): - if dist.get_local_rank() == 0: - get_file(dataset_uri, destination_path, overwrite=True) - dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - self.samples = list( + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + return list( dataset.map(lambda examples: { 'query': examples['query'], 'choices': examples['choices'], 'gold': examples['gold'] })) - self.samples = strip_data(self.samples) - - self.num_choices = len(self.samples[0]['choices']) - self.tokenizer = tokenizer - self.max_seq_len = max_seq_len - self.pad_tok_id = pad_tok_id - fewshot_rng = random.Random(fewshot_random_seed) - - self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) - - self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, - fewshot_rng) - - def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, - fewshot_rng: random.Random): - """Prepares a set of multiple choice questions into tokenized format with prompt and few shot examples. - - Each question consists of a query and set of answer choices, only one of which is correct. 
At inference time - we construct individual inference examples consisting of the query + a single choice, as well as an optional (prompt) and optional list - of example query + correct answers, which precede the test query + choice. - - For multiple choice, this method provides information relaying which of the answer choices is the correct one. This - information is used for computing accuracy metrics. - - Args: - num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair - prompt_string (str): The prompt to prepend to all inputs - example_delimiter (str): The delimiter used to separate each example query/answer pair - continuation_delimiter (str): The delimiter used to separate each query from its answer - fewshot_rng (random.Random): Random number generator used to select fewshot examples - - Returns: - dict: Contains the query, the list of encoded potential answer choices, the preamble (prompt + fewshot examples), and - the index of the correct answer choice. - """ - examples = [] - for sample_idx in tqdm(range(len(self.samples))): - preamble = prompt_string - if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) - for fewshot_idx in fewshot_idxs: - query, choices, gold_idx = self.samples[fewshot_idx]['query'], self.samples[fewshot_idx][ - 'choices'], self.samples[fewshot_idx]['gold'] - if len(preamble) > 0: - query = f'{example_delimiter}{query}' - assert isinstance(gold_idx, int) - preamble += f'{query}{continuation_delimiter}{choices[gold_idx]}' - encoded_example = {} - query, choices, gold_idx = self.samples[sample_idx]['query'], self.samples[sample_idx][ - 'choices'], self.samples[sample_idx]['gold'], - if len(preamble) > 0: - query = f'{example_delimiter}{query}' - - # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = continuation_delimiter.rstrip() - - if self.prefix_space: - choices = [(f' {choice}' if not choice.startswith(' ') else choice) for choice in choices] - query += continuation_delimiter_stripped - encoded_example['preamble'] = self.tokenizer( - preamble - ) # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. 
OPT tokenizer) - - if self.tokenizer.eos_token_id is not None and len( - encoded_example['preamble'] - ['input_ids']) > 1 and encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id: - encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] - - encoded_example['gold_idx'] = gold_idx - - encoded_example['query'] = self.tokenizer(query, add_special_tokens=False) - encoded_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices] - - examples.append(encoded_example) - - return examples + def get_answer_from_sample(self, sample: dict): + choices = sample['choices'] + gold_idx = sample['gold'] + return choices[gold_idx] + + def additional_processing_for_example(self, tokenized_example: dict, sample: dict): + choices = sample['choices'] + if self.prefix_space: + choices = [(f' {choice}' if not choice.startswith(' ') else choice) for choice in choices] + tokenized_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices] + tokenized_example['gold_idx'] = sample['gold'] + return tokenized_example def collate_fn(self, data): inputs = [] @@ -780,16 +593,10 @@ def split_batch(self, batch: Any, microbatch_size: int): microbatch_size are tracked in logical samples, we split logical attributes by microbatch_size and real attributes by microbatch_size * num_choices. """ - # Don't split kwargs that don't change - # Normally split torch tensors - # List split lists of strings - no_split = ['mode'] - # Real - real = ['input_ids', 'labels', 'attention_mask'] - logical = ['gold_indices'] + # There are extra split options in this func for multiple choice chunked = {} for k, v in batch.items(): - if k in no_split: + if k in self.dont_split_keys: # Defer broadcasting primitives until we know num_chunks pass elif k == 'continuation_indices': @@ -798,9 +605,9 @@ def split_batch(self, batch: Any, microbatch_size: int): elif k == 'choice_groupings': # List of list, so we have to directly call _split_list chunked[k] = _split_list(v, microbatch_size) - elif k in real: + elif k in self.real_split_keys: chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices) - elif k in logical: + elif k in self.normal_split_keys: chunked[k] = _default_split_batch(v, microbatch_size) else: raise ValueError(f'Unexpected key {k}') @@ -840,52 +647,48 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData fewshot_random_seed (int): Random seed used to select fewshot examples """ - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - fewshot_random_seed: int, - ): - try: - from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] - except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge') from e + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) - with dist.local_rank_zero_download_and_wait(destination_path): - if dist.get_local_rank() == 0: - get_file(dataset_uri, destination_path, overwrite=True) - dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - self.samples = list( + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + return list( 
dataset.map( lambda examples: { 'context_options': examples['context_options'], 'continuation': examples['continuation'], 'gold': examples['gold'] })) - self.samples = strip_data(self.samples) - self.num_choices = len(self.samples[0]['context_options']) - self.tokenizer = tokenizer - self.max_seq_len = max_seq_len - self.pad_tok_id = pad_tok_id - fewshot_rng = random.Random(fewshot_random_seed) + def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): + context_options = sample['context_options'] + gold_idx = sample['gold'] + continuation = sample['continuation'] + assert isinstance(gold_idx, int) + # TODO: fix this? + if add_answer: + context = context_options[gold_idx] + if len(preceding_text) > 0: + context = f'{self.example_delimiter}{context}' + context = f'{context}{self.continuation_delimiter}{continuation}' + else: + context_options = sample['context_options'] + if len(preceding_text) > 0: + context_options = [f'{self.example_delimiter}{c}{self.continuation_delimiter}' for c in context_options] - self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) + return context - self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, - fewshot_rng) + def generate_few_shot_text(self, num_fewshot: int, sample_idx: int, preamble: str, + fewshot_rng: random.Random) -> str: + preamble = preamble + if num_fewshot > 0: + fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) + for fewshot_idx in fewshot_idxs: + context = self.construct_context(self.samples[fewshot_idx]) + preamble += context + return preamble def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, - fewshot_rng: random.Random): + fewshot_rng: random.Random): """Prepares a set of schema questions into tokenized format with prompt and few shot examples. Each question consists of a set of possible contexts followed by a continuation, only one of the contexts would logically permit the continuation. At inference time we construct individual inference examples consisting of a single context option + the continuation, @@ -903,47 +706,26 @@ def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter the index of the correct answer choice. 
""" + # TODO: fix this examples = [] for sample_idx in tqdm(range(len(self.samples))): - preamble = prompt_string - if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) - for fewshot_idx in fewshot_idxs: - context_options, continuation, gold_idx = self.samples[fewshot_idx][ - 'context_options'], self.samples[fewshot_idx]['continuation'], self.samples[fewshot_idx]['gold'] - assert isinstance(gold_idx, int) - context = context_options[gold_idx] - if len(preamble) > 0: - context = f'{example_delimiter}{context}' - preamble += f'{context}{continuation_delimiter}{continuation}' - - encoded_example = {} - context_options, continuation, gold_idx = self.samples[sample_idx]['context_options'], self.samples[ - sample_idx]['continuation'], self.samples[sample_idx]['gold'], - - # rstrip the continuation delimiter, because the prompt ending in a space results in degenerate output - continuation_delimiter_stripped = continuation_delimiter.rstrip() - - if len(preamble) > 0: - context_options = [f'{example_delimiter}{c}{continuation_delimiter_stripped}' for c in context_options] - encoded_example['preamble'] = self.tokenizer( - preamble - ) # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) - if self.tokenizer.eos_token_id is not None and len( - encoded_example['preamble'] - ['input_ids']) > 1 and encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id: - encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] - - encoded_example['gold_idx'] = gold_idx - encoded_example['context_options'] = [self.tokenizer(c, add_special_tokens=False) for c in context_options] - - if self.prefix_space: - continuation = f' {continuation}' if not continuation.startswith(' ') else continuation - encoded_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False) - examples.append(encoded_example) + prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, sample_idx, prompt_string, fewshot_rng) + tokenized_example = self.tokenize_example(prompt_and_fewshot, '') + tokenized_example['context_options'] = [ + self.tokenizer(c, add_special_tokens=False) for c in context_options + ] + examples.append(tokenized_example) return examples + def additional_processing_for_example(self, tokenized_example: dict, sample: dict): + continuation = sample['continuation'] + if self.prefix_space: + continuation = f' {continuation}' if not continuation.startswith(' ') else continuation + tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False) + tokenized_example['gold_idx'] = sample['gold_idx'] + return tokenized_example + def collate_fn(self, data): inputs = [] continuation_indices = [] @@ -1018,128 +800,63 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): top_k: top_k sampling parameter for number of samples to consider """ - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - destination_path: str, - code_prelimiter: str, - fewshot_random_seed: int, - generations_per_sample: int, - pass_at_k: int = 1, - top_p: Optional[float] = 0.95, - top_k: Optional[int] = 40, - ): - try: - from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] - except ImportError 
as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge') from e - with dist.local_rank_zero_download_and_wait(destination_path): - if dist.get_local_rank() == 0: - get_file(dataset_uri, destination_path, overwrite=True) - dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - self.samples = list( - dataset.map( - lambda examples: { - 'task_id': examples['task_id'], - 'prompt': examples['prompt'], - 'canonical_solution': examples['canonical_solution'], - 'test': examples['test'], - 'entry_point': examples['entry_point'], - 'test_inputs': examples['test_inputs'], - 'test_outputs': examples['test_outputs'], - 'language': examples['language'], - })) - + def __init__(self, + generations_per_sample: int, + pass_at_k: int = 1, + top_p: Optional[float] = 0.95, + top_k: Optional[int] = 40, + *args, + **kwargs): if generations_per_sample < pass_at_k: raise ValueError( f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' ) + super().__init__(context_key='prompt', answer_key='canonical_solution', *args, **kwargs) self.pass_at_k = pass_at_k self.generations_per_sample = generations_per_sample - - self.tokenizer = tokenizer - self.max_seq_len = max_seq_len - self.pad_tok_id = pad_tok_id - self.padding_side = 'left' - self.max_prompt_length = 0 + self.max_prompt_length = self.get_max_prompt_length() self.top_p = top_p self.top_k = top_k - fewshot_rng = random.Random(fewshot_random_seed) - self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, code_prelimiter, - fewshot_rng) - def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, code_prelimiter: str, - fewshot_rng: random.Random): - """Prepares a set of code evaluation tasks into tokenized format with prompt and fewshot examples. - - Each task consists of a context as well as an optional prompt and optional list of - example context/continuation pairs which precede the test context/continuation pair. 
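
For context on the `generations_per_sample >= pass_at_k` check above, the sketch below shows the standard unbiased pass@k estimator used for HumanEval-style code evaluation; it is illustrative only, not part of this patch, and assumes `n` generations per problem with `c` of them passing the tests.

    from math import comb

    def estimate_pass_at_k(n: int, c: int, k: int) -> float:
        # n = generations per problem, c = generations that pass the tests.
        # pass@k = 1 - C(n - c, k) / C(n, k); only defined when n >= k,
        # which is why generations_per_sample must be >= pass_at_k.
        if n < k:
            raise ValueError('need at least k generations per problem')
        if n - c < k:
            return 1.0
        return 1.0 - comb(n - c, k) / comb(n, k)

    # e.g. with 10 generations and 3 passing: pass@1 == 0.3, pass@5 ~= 0.917
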
+ self.dont_split_keys = ['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'] + self.normal_split_keys = ['input_ids', 'attention_mask'] + self.list_split_keys = [ + 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', + 'languages' + ] - Args: - num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair - prompt_string (str): The prompt to prepend to all inputs - example_delimiter (str): The delimiter used to separate each individual context/continuation pair - code_prelimiter (str): The text to prepend to each code prompt - fewshot_rng (random.Random): Random number generator to use for fewshot sampling + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + return list( + dataset.map( + lambda examples: { + 'task_id': examples['task_id'], + 'prompt': examples['prompt'], + 'canonical_solution': examples['canonical_solution'], + 'test': examples['test'], + 'entry_point': examples['entry_point'], + 'test_inputs': examples['test_inputs'], + 'test_outputs': examples['test_outputs'], + 'language': examples['language'], + })) - Returns: - dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) - """ + def get_max_prompt_length(self): max_prompt_length = 0 - examples = [] - for sample_idx in tqdm(range(len(self.samples))): - encoded_example = {} - - preamble = prompt_string - - if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) - for fewshot_idx in fewshot_idxs: - ctxt, cont = self.samples[fewshot_idx]['prompt'], self.samples[fewshot_idx]['canonical_solution'] - ctxt = f'{code_prelimiter}{ctxt}' - if len(preamble) > 0: - ctxt = f'{example_delimiter}{ctxt}' - preamble += f'{ctxt}{cont}' - - ctxt = self.samples[sample_idx]['prompt'] - ctxt = f'{code_prelimiter}{ctxt}' - if len(preamble) > 0: - ctxt = f'{example_delimiter}{ctxt}' - - # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. 
OPT tokenizer) - encoded_example['preamble'] = self.tokenizer(preamble) - # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt - if self.tokenizer.eos_token_id is not None and len( - encoded_example['preamble'] - ['input_ids']) > 1 and encoded_example['preamble']['input_ids'][-1] == self.tokenizer.eos_token_id: - encoded_example['preamble']['input_ids'] = encoded_example['preamble']['input_ids'][:-1] - - encoded_example['prompt'] = self.tokenizer(ctxt, add_special_tokens=False) - encoded_example['prompt_text'] = self.samples[sample_idx]['prompt'] - encoded_example['task_id'] = self.samples[sample_idx]['task_id'] - encoded_example['canonical_solution'] = self.samples[sample_idx]['canonical_solution'] - encoded_example['test'] = self.samples[sample_idx]['test'] - encoded_example['entry_point'] = self.samples[sample_idx]['entry_point'] - encoded_example['test_inputs'] = self.samples[sample_idx]['test_inputs'] - encoded_example['test_outputs'] = self.samples[sample_idx]['test_outputs'] - encoded_example['language'] = self.samples[sample_idx]['language'] - - examples.append(encoded_example) - max_prompt_length = max( - max_prompt_length, - len(encoded_example['preamble']['input_ids'] + encoded_example['prompt']['input_ids'])) - - self.max_prompt_length = max_prompt_length - return examples + for sample in self.samples: + max_prompt_length = max(max_prompt_length, + len(sample['preamble']['input_ids'] + sample['prompt']['input_ids'])) + return max_prompt_length + + def additional_processing_for_example(self, tokenized_example: dict, sample: dict): + tokenized_example['prompt_text'] = sample['prompt'] + tokenized_example['task_id'] = sample['task_id'] + tokenized_example['canonical_solution'] = sample['canonical_solution'] + tokenized_example['test'] = sample['test'] + tokenized_example['entry_point'] = sample['entry_point'] + tokenized_example['test_inputs'] = sample['test_inputs'] + tokenized_example['test_outputs'] = sample['test_outputs'] + tokenized_example['language'] = sample['language'] + return tokenized_example def collate_fn(self, data): inputs, prompts, tests, canonical_solutions, entry_points, test_inputs, test_outputs, languages = [], [], [], [], [], [], [], [] @@ -1196,34 +913,6 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def split_batch(self, batch: Any, microbatch_size: int): - # Don't split kwargs that don't change - # Normally split torch tensors - # List split lists of strings - no_split = ['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'] - normal_split = ['input_ids', 'attention_mask'] - list_split = [ - 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', - 'languages' - ] - chunked = {} - for k, v in batch.items(): - if k in no_split: - # Defer broadcasting until we know num_chunks - pass - elif k in list_split: - chunked[k] = _split_list(v, microbatch_size) - elif k in normal_split: - chunked[k] = _default_split_batch(v, microbatch_size) - else: - raise ValueError(f'Unexpected key {k}') - num_chunks = len(chunked['input_ids']) - for k, v in batch.items(): - if isinstance(v, (int, float, str, bool, dict)): - chunked[k] = [v] * num_chunks - - return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] - def build_icl_dataloader( icl_task_type: str, @@ -1237,7 +926,7 @@ def build_icl_dataloader( example_delimiter: str, # e.g. '\n' continuation_delimiter: str, # e.g. 
'' destination_path: str, - question_prelimiter: str = '', # e.g. 'Question: ' + prelimiter: str = '', # e.g. 'Question: ' cot_delimiter: str = '', fewshot_random_seed: int = 1234, pass_at_k: int = 1, @@ -1291,7 +980,7 @@ def build_icl_dataloader( example_delimiter, continuation_delimiter, destination_path=destination_path, - question_prelimiter=question_prelimiter, + prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, cot_delimiter=cot_delimiter) effective_batchsize = batch_size @@ -1304,7 +993,7 @@ def build_icl_dataloader( prompt_string, example_delimiter, destination_path=destination_path, - code_prelimiter=question_prelimiter, + prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample) @@ -1388,7 +1077,7 @@ def get_icl_task_dataloader( example_delimiter: str, # e.g. '\n' continuation_delimiter: str = '', destination_path: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' + prelimiter: str = '', # e.g. 'Question: ' fewshot_random_seed: int = 1234, pass_at_k: int = 1, generations_per_sample: int = 1, @@ -1433,7 +1122,7 @@ def get_icl_task_dataloader( example_delimiter (str): Separator that goes between individual examples (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') destination_path: (str): This is the local file where remote datasets will be saved. - question_prelimiter: (str): For QA tasks, this will be prepended to each question. + prelimiter: (str): For QA tasks, this will be prepended to each question. has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. 
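
A minimal usage sketch of the renamed `prelimiter` argument, mirroring the updated QA tests later in this series; the `dataset_uri` and `tokenizer` values are assumed to be defined by the caller.

    dl = get_icl_task_dataloader(
        icl_task_type='question_answering',
        dataset_uri=dataset_uri,          # local .jsonl path or remote URI
        tokenizer=tokenizer,
        batch_size=8,
        max_seq_len=1024,
        pad_tok_id=tokenizer.eos_token_id,
        num_fewshot=2,
        prompt_string='',
        example_delimiter='\n',
        prelimiter='Q: ',                 # formerly question_prelimiter
        continuation_delimiter='\nA:',
        destination_path='icl_qa_tmp.jsonl',
    )
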
Returns: @@ -1458,7 +1147,7 @@ def get_icl_task_dataloader( example_delimiter, continuation_delimiter, partition_uri + '_tmp', - question_prelimiter, + prelimiter, cot_delimiter, fewshot_random_seed, pass_at_k, @@ -1478,7 +1167,7 @@ def get_icl_task_dataloader( example_delimiter, continuation_delimiter, destination_path, - question_prelimiter, + prelimiter, cot_delimiter, fewshot_random_seed, pass_at_k, From e585e1f499fdc07d6e55e194651b944a7d00ce16 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 8 Nov 2023 23:11:19 +0000 Subject: [PATCH 006/116] linting --- .../in_context_learning_evaluation.py | 75 ++++++++----------- .../test_in_context_learning_datasets.py | 12 +-- 2 files changed, 39 insertions(+), 48 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index a1f620c255..6ad6e4153f 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -117,14 +117,12 @@ def __init__( continuation_delimiter: str, destination_path: str, fewshot_random_seed: int, + strip_data: bool = True, icl_hf_loading_vars: dict = {}, icl_hf_parsing_vars: dict = {}, context_key: str = 'context', answer_key: str = 'answer', prelimiter: str = '', - dont_split_keys: List[str] = [], - normal_split_keys: List[str] = [], - list_split_keys: List[str] = [], ): self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -140,15 +138,13 @@ def __init__( self.answer_key = answer_key self.samples = self._read_dataset(dataset_uri, destination_path, icl_hf_loading_vars, icl_hf_parsing_vars) - self.samples = strip_data(self.samples) + if strip_data: + self.samples = strip_data(self.samples) fewshot_rng = random.Random(fewshot_random_seed) + self.num_fewshot = num_fewshot self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, fewshot_rng) - self.dont_split_keys = dont_split_keys - self.normal_split_keys = normal_split_keys - self.list_split_keys = list_split_keys - def __getitem__(self, index: int): return self.encoded_dataset[index] @@ -220,7 +216,7 @@ def construct_context(self, sample: dict, preceding_text: str = '', add_answer: ctxt = sample[self.context_key] ctxt = f'{self.prelimiter}{ctxt}' if len(preceding_text) > 0: - ctxt = f'{preceding_text}{self.example_delimiter}{ctxt}' + ctxt = f'{self.example_delimiter}{ctxt}' ctxt = f'{ctxt}{self.continuation_delimiter}' if add_answer: ctxt = f'{ctxt}{self.get_answer_from_sample(sample)}' @@ -243,8 +239,7 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str): preamble = self.fix_eos_on_preamble(preamble) tokenized_example['preamble'] = preamble # rstrip context because a prompt ending in a space results in degenerate output - #TODO: use diff key for this? 
- tokenized_example['context'] = self.tokenizer(ctxt.rstrip(), add_special_tokens=False) + tokenized_example[self.context_key] = self.tokenizer(ctxt.rstrip(), add_special_tokens=False) return tokenized_example def _prep_examples(self, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random) -> List[Dict[str, Any]]: @@ -335,13 +330,12 @@ class InContextLearningQATaskDataset(InContextLearningDataset): """ def __init__(self, cot_delimiter: str = '', *args, **kwargs): - super().__init__(dont_split_keys=['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'], - normal_split_keys=['input_ids', 'attention_mask'], - list_split_keys=['labels'], - *args, - **kwargs) + super().__init__(*args, **kwargs) self.cot_delimiter = cot_delimiter self.max_answer_length = self.get_max_answer_length() + self.dont_split_keys = ['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'] + self.normal_split_keys = ['input_ids', 'attention_mask'] + self.list_split_keys = ['labels'] def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: return list( @@ -432,7 +426,7 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): """ def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(answer_key='continuation', *args, **kwargs) def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: return list( @@ -507,16 +501,13 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ def __init__(self, *args, **kwargs): - super().__init__(dont_split_keys=['mode'], - real_split_keys=['input_ids', 'labels', 'attention_mask'], - normal_split_key=['gold_indices'], - *args, - **kwargs) + super().__init__(context_key='query', *args, **kwargs) self.num_choices = len(self.samples[0]['choices']) - self.context_key = 'query' - # self.dont_split_keys = ['mode'] - # self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] - # self.normal_split_keys = ['gold_indices'] + + # TODO: set all these keys like this or what? + self.dont_split_keys = ['mode'] + self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] + self.normal_split_keys = ['gold_indices'] def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: return list( @@ -664,7 +655,6 @@ def construct_context(self, sample, preceding_text: str = '', add_answer: bool = gold_idx = sample['gold'] continuation = sample['continuation'] assert isinstance(gold_idx, int) - # TODO: fix this? if add_answer: context = context_options[gold_idx] if len(preceding_text) > 0: @@ -812,7 +802,7 @@ def __init__(self, f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' 
) - super().__init__(context_key='prompt', answer_key='canonical_solution', *args, **kwargs) + super().__init__(context_key='prompt', answer_key='canonical_solution', strip_data=False, *args, **kwargs) self.pass_at_k = pass_at_k self.generations_per_sample = generations_per_sample self.max_prompt_length = self.get_max_prompt_length() @@ -826,6 +816,13 @@ def __init__(self, 'languages' ] + def get_max_prompt_length(self): + max_prompt_length = 0 + for sample in self.encoded_dataset: + max_prompt_length = max(max_prompt_length, + len(sample['preamble']['input_ids'] + sample['prompt']['input_ids'])) + return max_prompt_length + def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: return list( dataset.map( @@ -840,13 +837,6 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: 'language': examples['language'], })) - def get_max_prompt_length(self): - max_prompt_length = 0 - for sample in self.samples: - max_prompt_length = max(max_prompt_length, - len(sample['preamble']['input_ids'] + sample['prompt']['input_ids'])) - return max_prompt_length - def additional_processing_for_example(self, tokenized_example: dict, sample: dict): tokenized_example['prompt_text'] = sample['prompt'] tokenized_example['task_id'] = sample['task_id'] @@ -985,13 +975,14 @@ def build_icl_dataloader( cot_delimiter=cot_delimiter) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': - dataset = InContextLearningCodeEvalDataset(dataset_uri, - tokenizer, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, + dataset = InContextLearningCodeEvalDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, destination_path=destination_path, prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 8ebb7816af..058734f60b 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -533,7 +533,7 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', - question_prelimiter='Q: ', + prelimiter='Q: ', continuation_delimiter='\nA:', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) assert isinstance(dl, DataSpec) @@ -583,7 +583,7 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot=num_fewshot, prompt_string='', example_delimiter='\n', - question_prelimiter='Q: ', + prelimiter='Q: ', continuation_delimiter="\nA: Let's think step by step. 
", cot_delimiter=' #### ', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) @@ -760,7 +760,7 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', - question_prelimiter='Code start: \n', + prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=generations_per_sample) assert isinstance(dl, DataSpec) @@ -845,7 +845,7 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): num_fewshot=0, prompt_string='', example_delimiter='\n', - question_prelimiter='Code start: \n', + prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_.jsonl'), generations_per_sample=1) assert isinstance(dl, DataSpec) @@ -894,7 +894,7 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): num_fewshot=0, prompt_string='', example_delimiter='\n', - question_prelimiter='Code start: \n', + prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_.jsonl'), pass_at_k=10, generations_per_sample=1) @@ -923,7 +923,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', - question_prelimiter='Code start: \n', + prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=generations_per_sample) assert isinstance(dl, DataSpec) From 50783738b46d296dc9d524806b4d023f07fa8547 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 9 Nov 2023 01:35:27 +0000 Subject: [PATCH 007/116] tests pass! --- .../in_context_learning_evaluation.py | 151 ++++++++++-------- .../test_in_context_learning_datasets.py | 14 +- 2 files changed, 89 insertions(+), 76 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 6ad6e4153f..20403b73a0 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -117,7 +117,7 @@ def __init__( continuation_delimiter: str, destination_path: str, fewshot_random_seed: int, - strip_data: bool = True, + strip_dataset: bool = True, icl_hf_loading_vars: dict = {}, icl_hf_parsing_vars: dict = {}, context_key: str = 'context', @@ -138,7 +138,8 @@ def __init__( self.answer_key = answer_key self.samples = self._read_dataset(dataset_uri, destination_path, icl_hf_loading_vars, icl_hf_parsing_vars) - if strip_data: + self.strip_data = strip_dataset + if self.strip_data: self.samples = strip_data(self.samples) fewshot_rng = random.Random(fewshot_random_seed) @@ -238,8 +239,11 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str): preamble = self.tokenizer(prompt_and_fewshot) preamble = self.fix_eos_on_preamble(preamble) tokenized_example['preamble'] = preamble - # rstrip context because a prompt ending in a space results in degenerate output - tokenized_example[self.context_key] = self.tokenizer(ctxt.rstrip(), add_special_tokens=False) + if self.strip_data: + # TODO: probably shouldn't use self.strip_data for this + # rstrip context because a prompt ending in a space results in degenerate output + ctxt = ctxt.rstrip() + tokenized_example[self.context_key] = self.tokenizer(ctxt, add_special_tokens=False) return tokenized_example def _prep_examples(self, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random) -> List[Dict[str, Any]]: @@ -330,8 +334,8 @@ class 
InContextLearningQATaskDataset(InContextLearningDataset): """ def __init__(self, cot_delimiter: str = '', *args, **kwargs): - super().__init__(*args, **kwargs) self.cot_delimiter = cot_delimiter + super().__init__(*args, **kwargs) self.max_answer_length = self.get_max_answer_length() self.dont_split_keys = ['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'] self.normal_split_keys = ['input_ids', 'attention_mask'] @@ -354,7 +358,7 @@ def get_answer_from_sample(self, sample): cot_delimiter = '' else: cot_delimiter = self.cot_delimiter - return f'{self.continuation_delimiter}{chain_of_thought}{cot_delimiter}{sample[self.answer_key]}' + return f'{chain_of_thought}{cot_delimiter}{sample[self.answer_key]}' def additional_processing_for_example(self, tokenized_example: dict, sample: dict): tokenized_example['aliases'] = list(sample['aliases']) @@ -365,7 +369,6 @@ def get_max_answer_length(self): max_answer_length = 0 for sample in self.samples: for answer in sample['aliases']: - # DRY says I should use get_answer_from_sample somehow here response = f"{sample['chain_of_thought']}{self.cot_delimiter}{answer}" max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) @@ -500,9 +503,9 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): fewshot_random_seed (int): Random seed used to select fewshot examples """ - def __init__(self, *args, **kwargs): + def __init__(self, choices_key: str = 'choices', *args, **kwargs): super().__init__(context_key='query', *args, **kwargs) - self.num_choices = len(self.samples[0]['choices']) + self.num_choices = len(self.samples[0][choices_key]) # TODO: set all these keys like this or what? 
self.dont_split_keys = ['mode'] @@ -638,8 +641,9 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData fewshot_random_seed (int): Random seed used to select fewshot examples """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, choices_key='context_options', *args, **kwargs): + super().__init__(choices_key=choices_key, *args, **kwargs) + # self.num_choices = len(self.samples[0]['context_options']) def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: return list( @@ -651,6 +655,7 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: })) def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): + # TODO this is a bad monkey patch context_options = sample['context_options'] gold_idx = sample['gold'] continuation = sample['continuation'] @@ -660,25 +665,14 @@ def construct_context(self, sample, preceding_text: str = '', add_answer: bool = if len(preceding_text) > 0: context = f'{self.example_delimiter}{context}' context = f'{context}{self.continuation_delimiter}{continuation}' - else: - context_options = sample['context_options'] - if len(preceding_text) > 0: - context_options = [f'{self.example_delimiter}{c}{self.continuation_delimiter}' for c in context_options] + # else: + # context_options = sample['context_options'] + # if len(preceding_text) > 0: + # context_options = [f'{self.example_delimiter}{c}{self.continuation_delimiter}' for c in context_options] return context - def generate_few_shot_text(self, num_fewshot: int, sample_idx: int, preamble: str, - fewshot_rng: random.Random) -> str: - preamble = preamble - if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) - for fewshot_idx in fewshot_idxs: - context = self.construct_context(self.samples[fewshot_idx]) - preamble += context - return preamble - - def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter: str, continuation_delimiter: str, - fewshot_rng: random.Random): + def _prep_examples(self, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random): """Prepares a set of schema questions into tokenized format with prompt and few shot examples. Each question consists of a set of possible contexts followed by a continuation, only one of the contexts would logically permit the continuation. At inference time we construct individual inference examples consisting of a single context option + the continuation, @@ -696,24 +690,41 @@ def _prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter the index of the correct answer choice. 
""" - # TODO: fix this examples = [] for sample_idx in tqdm(range(len(self.samples))): prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, sample_idx, prompt_string, fewshot_rng) - tokenized_example = self.tokenize_example(prompt_and_fewshot, '') - tokenized_example['context_options'] = [ - self.tokenizer(c, add_special_tokens=False) for c in context_options - ] + # This is different bcus the context has multiple options for scheme problems + ctxt_options = self.construct_context_options(self.samples[sample_idx], prompt_and_fewshot) + tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt_options) + tokenized_example = self.additional_processing_for_example(tokenized_example, self.samples[sample_idx]) examples.append(tokenized_example) - return examples + def construct_context_options(self, sample, preceding_text): + context_options = sample['context_options'] + if len(preceding_text) > 0: + if self.strip_data: + cont_del = self.continuation_delimiter.rstrip() + else: + cont_del = self.continuation_delimiter + context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] + return context_options + + def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str]): + tokenized_example = {} + preamble = self.tokenizer(prompt_and_fewshot) + preamble = self.fix_eos_on_preamble(preamble) + tokenized_example['preamble'] = preamble + tokenized_example['context_options'] = [self.tokenizer(c, add_special_tokens=False) for c in context_options] + return tokenized_example + def additional_processing_for_example(self, tokenized_example: dict, sample: dict): continuation = sample['continuation'] if self.prefix_space: continuation = f' {continuation}' if not continuation.startswith(' ') else continuation tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False) - tokenized_example['gold_idx'] = sample['gold_idx'] + # TODO: make this just "gold" not 'gold_idx' + tokenized_example['gold_idx'] = sample['gold'] return tokenized_example def collate_fn(self, data): @@ -722,7 +733,6 @@ def collate_fn(self, data): gold_idxs = [] choice_groupings = [] for data_pair in data: - continuation_start_idx = len(continuation_indices) preamble, context_options, continuation, gold_idx = (data_pair['preamble'], data_pair['context_options'], data_pair['continuation'], data_pair['gold_idx']) @@ -758,6 +768,9 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch + # def get_num_samples_in_batch(self, batch) -> int: + # return batch['input_ids'].shape[0] // self.num_choices + class InContextLearningCodeEvalDataset(InContextLearningDataset): """ A dataset that constructs batches for in-context learning code evaluation @@ -776,7 +789,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "task_id", "prompt", "entry_point", "canonical_solution", "test", "test_inputs", and "test_outputs". See tests/datasets/local_data/human_eval_small.jsonl. tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to map between strings and token ids - batch_size (int): Size of a batch used for eval + ? 
batch_size (int): Size of a batch used for eval max_seq_len (int): The maximum sequence length supported by the model pad_tok_id (int): The special token reserved for padding batches num_fewshot (int): The number of complete fewshot examples to prepend before each test example @@ -802,7 +815,7 @@ def __init__(self, f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' ) - super().__init__(context_key='prompt', answer_key='canonical_solution', strip_data=False, *args, **kwargs) + super().__init__(context_key='prompt', answer_key='canonical_solution', strip_dataset=False, *args, **kwargs) self.pass_at_k = pass_at_k self.generations_per_sample = generations_per_sample self.max_prompt_length = self.get_max_prompt_length() @@ -923,52 +936,52 @@ def build_icl_dataloader( generations_per_sample: int = 1, ) -> DataSpec: if icl_task_type == 'multiple_choice': - dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, - tokenizer, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, + dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, destination_path=destination_path, fewshot_random_seed=fewshot_random_seed) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'schema': - dataset = InContextLearningSchemaTaskDataset(dataset_uri, - tokenizer, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, + dataset = InContextLearningSchemaTaskDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, destination_path=destination_path, fewshot_random_seed=fewshot_random_seed) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'language_modeling': - dataset = InContextLearningLMTaskDataset(dataset_uri, - tokenizer, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, + dataset = InContextLearningLMTaskDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, destination_path=destination_path, fewshot_random_seed=fewshot_random_seed) effective_batchsize = batch_size elif icl_task_type == 'question_answering': - dataset = InContextLearningQATaskDataset(dataset_uri, - tokenizer, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, + dataset = InContextLearningQATaskDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, destination_path=destination_path, prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, diff --git 
a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 058734f60b..52a1ca6bb2 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -468,10 +468,10 @@ def test_qa_split_batch(dataset_uri, tmp_path): tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( - 'question_answering', - dataset_uri, - tokenizer, - 8, + icl_task_type='question_answering', + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=8, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=0, @@ -525,9 +525,9 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews # empirical number from the small test dataset maximum_answer_length = 9 dl = get_icl_task_dataloader('question_answering', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, From b4e00e4a0b7de2982ac013010e9a57d4fb762801 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 9 Nov 2023 01:48:44 +0000 Subject: [PATCH 008/116] fix repeated defaults, gold_idx --> gold --- .../in_context_learning_evaluation.py | 91 +++++++++---------- 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 20403b73a0..7825e46430 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -507,7 +507,6 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): super().__init__(context_key='query', *args, **kwargs) self.num_choices = len(self.samples[0][choices_key]) - # TODO: set all these keys like this or what? 
self.dont_split_keys = ['mode'] self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] self.normal_split_keys = ['gold_indices'] @@ -530,7 +529,7 @@ def additional_processing_for_example(self, tokenized_example: dict, sample: dic if self.prefix_space: choices = [(f' {choice}' if not choice.startswith(' ') else choice) for choice in choices] tokenized_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices] - tokenized_example['gold_idx'] = sample['gold'] + tokenized_example['gold'] = sample['gold'] return tokenized_example def collate_fn(self, data): @@ -542,7 +541,7 @@ def collate_fn(self, data): choice_start_idx = len(continuation_indices) preamble, context, choices, gold_idx = (data_pair['preamble'], data_pair['query'], data_pair['choices'], - data_pair['gold_idx']) + data_pair['gold']) for choice in choices: context_enc = preamble['input_ids'] + context['input_ids'] @@ -643,7 +642,6 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, *args, **kwargs) - # self.num_choices = len(self.samples[0]['context_options']) def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: return list( @@ -655,7 +653,7 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: })) def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): - # TODO this is a bad monkey patch + # TODO this is bad context_options = sample['context_options'] gold_idx = sample['gold'] continuation = sample['continuation'] @@ -723,8 +721,7 @@ def additional_processing_for_example(self, tokenized_example: dict, sample: dic if self.prefix_space: continuation = f' {continuation}' if not continuation.startswith(' ') else continuation tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False) - # TODO: make this just "gold" not 'gold_idx' - tokenized_example['gold_idx'] = sample['gold'] + tokenized_example['gold'] = sample['gold'] return tokenized_example def collate_fn(self, data): @@ -735,7 +732,7 @@ def collate_fn(self, data): for data_pair in data: continuation_start_idx = len(continuation_indices) preamble, context_options, continuation, gold_idx = (data_pair['preamble'], data_pair['context_options'], - data_pair['continuation'], data_pair['gold_idx']) + data_pair['continuation'], data_pair['gold']) for ctxt in context_options: context_enc = preamble['input_ids'] + ctxt['input_ids'] @@ -768,9 +765,6 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - # def get_num_samples_in_batch(self, batch) -> int: - # return batch['input_ids'].shape[0] // self.num_choices - class InContextLearningCodeEvalDataset(InContextLearningDataset): """ A dataset that constructs batches for in-context learning code evaluation @@ -929,11 +923,11 @@ def build_icl_dataloader( example_delimiter: str, # e.g. '\n' continuation_delimiter: str, # e.g. '' destination_path: str, - prelimiter: str = '', # e.g. 'Question: ' - cot_delimiter: str = '', - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, + prelimiter: str, # e.g. 
'Question: ' + cot_delimiter: str, + fewshot_random_seed: int, + pass_at_k: int, + generations_per_sample: int, ) -> DataSpec: if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri=dataset_uri, @@ -1068,6 +1062,7 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Di output_files[cat] = cat_dest return output_files +#TODO: Where do we want to set our defaults? def get_icl_task_dataloader( icl_task_type: str, @@ -1140,40 +1135,40 @@ def get_icl_task_dataloader( for category in categories: partition_uri = output_files[category] result_dls[category] = build_icl_dataloader( - icl_task_type, - partition_uri, - tokenizer, - batch_size, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - partition_uri + '_tmp', - prelimiter, - cot_delimiter, - fewshot_random_seed, - pass_at_k, - generations_per_sample, + icl_task_type=icl_task_type, + dataset_uri=partition_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=partition_uri + '_tmp', + prelimiter=prelimiter, + cot_delimiter=cot_delimiter, + fewshot_random_seed=fewshot_random_seed, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, ) return result_dls else: return build_icl_dataloader( - icl_task_type, - dataset_uri, - tokenizer, - batch_size, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - destination_path, - prelimiter, - cot_delimiter, - fewshot_random_seed, - pass_at_k, - generations_per_sample, + icl_task_type=icl_task_type, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + cot_delimiter=cot_delimiter, + fewshot_random_seed=fewshot_random_seed, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, ) From a588767326f0e500894943c19488138de8770212 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 9 Nov 2023 19:08:27 +0000 Subject: [PATCH 009/116] basic HF parsing but test not passing --- .../in_context_learning_evaluation.py | 64 +++++++++++++------ .../test_in_context_learning_datasets.py | 57 +++++++++++++++++ 2 files changed, 103 insertions(+), 18 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 7825e46430..c0e8b2eb11 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -88,6 +88,10 @@ def _make_padded_input(context_enc, continuation_enc, max_seq_len, pad_tok_id, p def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: int, rng: random.Random): # samples without replacement. 
if num_fewshot exceeds the number of unique samples, # then we will have fewer than num_fewshot examples in context + + # Simpler implementation (but will choose different actual ids which will break some tests) + # possible_fewshot_idxs = [i for i in range(0, dataset_size) if i != sample_idx] + # fewshot_idxs = set(rng.sample(possible_fewshot_idxs, num_fewshot)) num_fewshot = min(dataset_size - 1, num_fewshot) fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot)) @@ -118,8 +122,8 @@ def __init__( destination_path: str, fewshot_random_seed: int, strip_dataset: bool = True, - icl_hf_loading_vars: dict = {}, - icl_hf_parsing_vars: dict = {}, + hf_loading_vars: dict = {}, + hf_parsing_vars: dict = {}, context_key: str = 'context', answer_key: str = 'answer', prelimiter: str = '', @@ -129,6 +133,7 @@ def __init__( self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id + self.num_fewshot = num_fewshot self.padding_side = 'left' self.prelimiter = prelimiter @@ -137,13 +142,12 @@ def __init__( self.context_key = context_key self.answer_key = answer_key - self.samples = self._read_dataset(dataset_uri, destination_path, icl_hf_loading_vars, icl_hf_parsing_vars) + self.samples = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) self.strip_data = strip_dataset if self.strip_data: self.samples = strip_data(self.samples) fewshot_rng = random.Random(fewshot_random_seed) - self.num_fewshot = num_fewshot self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, fewshot_rng) def __getitem__(self, index: int): @@ -158,8 +162,8 @@ def get_num_samples_in_batch(self, batch: dict) -> int: def _read_dataset(self, dataset_uri: str, destination_path: str, - icl_hf_loading_vars: dict = None, - icl_hf_parsing_vars: dict = None): + hf_loading_vars: dict = None, + hf_parsing_vars: dict = None): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -168,21 +172,19 @@ def _read_dataset(self, conda_channel='conda-forge') from e if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') - dataset = load_dataset(dataset_uri, split='train', **icl_hf_loading_vars) + dataset = load_dataset(dataset_uri, **hf_loading_vars) + dataset = self._parse_hf_dataset(dataset, hf_parsing_vars) else: with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - if icl_hf_parsing_vars: - dataset = self._parse_hf_dataset(dataset, icl_hf_parsing_vars) - else: dataset = self._parse_dataset(dataset) return dataset - def _parse_hf_dataset(self, dataset, icl_hf_parsing_vars): + def _parse_hf_dataset(self, dataset, hf_parsing_vars): dataset = dataset.map( - lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in icl_hf_parsing_vars}) + lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items()}) return dataset def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: @@ -357,19 +359,27 @@ def get_answer_from_sample(self, sample): if len(chain_of_thought) == 0: cot_delimiter = '' else: + # TODO: cot_delimiter setting is all over the place. 
Need to choose a single way to do it cot_delimiter = self.cot_delimiter return f'{chain_of_thought}{cot_delimiter}{sample[self.answer_key]}' def additional_processing_for_example(self, tokenized_example: dict, sample: dict): - tokenized_example['aliases'] = list(sample['aliases']) + tokenized_example['aliases'] = list(sample.get('aliases', [])) tokenized_example['cot_delimiter'] = self.cot_delimiter return tokenized_example def get_max_answer_length(self): max_answer_length = 0 for sample in self.samples: - for answer in sample['aliases']: - response = f"{sample['chain_of_thought']}{self.cot_delimiter}{answer}" + all_answers = [sample[self.answer_key]] + list(sample.get('aliases', [])) + for answer in all_answers: + chain_of_thought = sample.get('chain_of_thought', '') + if len(chain_of_thought) == 0: + cot_delimiter = '' + else: + # TODO: cot_delimiter setting is all over the place. Need to choose a single way to do it + cot_delimiter = self.cot_delimiter + response = f"{chain_of_thought}{cot_delimiter}{answer}" max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length @@ -922,6 +932,8 @@ def build_icl_dataloader( prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' continuation_delimiter: str, # e.g. '' + hf_loading_vars: dict, + hf_parsing_vars: dict, destination_path: str, prelimiter: str, # e.g. 'Question: ' cot_delimiter: str, @@ -939,7 +951,9 @@ def build_icl_dataloader( example_delimiter=example_delimiter, continuation_delimiter=continuation_delimiter, destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed) + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'schema': @@ -952,7 +966,9 @@ def build_icl_dataloader( example_delimiter=example_delimiter, continuation_delimiter=continuation_delimiter, destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed) + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'language_modeling': @@ -965,7 +981,9 @@ def build_icl_dataloader( example_delimiter=example_delimiter, continuation_delimiter=continuation_delimiter, destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed) + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars) effective_batchsize = batch_size elif icl_task_type == 'question_answering': dataset = InContextLearningQATaskDataset(dataset_uri=dataset_uri, @@ -979,6 +997,8 @@ def build_icl_dataloader( destination_path=destination_path, prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, cot_delimiter=cot_delimiter) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': @@ -993,6 +1013,8 @@ def build_icl_dataloader( destination_path=destination_path, prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample) 
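# --- Illustrative aside (not part of this patch): the `pass_at_k` /
# `generations_per_sample` pair wired in above feeds the standard unbiased
# pass@k estimator popularized by the HumanEval benchmark, which is presumably
# why `generations_per_sample` must be at least `pass_at_k` (the ValueError
# enforcing this appears in InContextLearningCodeEvalDataset below). A minimal
# sketch; `estimate_pass_at_k` is a hypothetical helper written only for
# illustration:
from math import comb

def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k draws from n generations
    (of which c pass the tests) is correct."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(estimate_pass_at_k(n=10, c=3, k=1))  # 0.3 -> 10 generations, 3 correct, pass@1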
effective_batchsize = batch_size @@ -1074,6 +1096,8 @@ def get_icl_task_dataloader( num_fewshot: int, prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' + hf_loading_vars: dict = {}, + hf_parsing_vars: dict = {}, continuation_delimiter: str = '', destination_path: str = '', prelimiter: str = '', # e.g. 'Question: ' @@ -1144,6 +1168,8 @@ def get_icl_task_dataloader( num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter=example_delimiter, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', prelimiter=prelimiter, @@ -1164,6 +1190,8 @@ def get_icl_task_dataloader( num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter=example_delimiter, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, continuation_delimiter=continuation_delimiter, destination_path=destination_path, prelimiter=prelimiter, diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 52a1ca6bb2..52ef2a8459 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1525,3 +1525,60 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): assert first_batch_without_last_word.count(' UNIQUE ') == 1 assert second_batch_without_last_word.count(' UNIQUE ') == 1 + +@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/long_context_eval']) +@pytest.mark.parametrize('num_fewshot', [0, 1, 2]) +@pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) +@pytest.mark.parametrize('hf_loading_vars', [{"split":"test","name":"kv_pairs", "context_length":2048, "section":"middle"}]) +@pytest.mark.parametrize('hf_parsing_vars', [{"inputs":["context"], "outputs":["answer"]}]) +def test_hf_dataloading(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, hf_parsing_vars): + pytest.importorskip('datasets') + + # local_data = os.path.join(os.path.dirname(__file__), 'local_data') + + tokenizer = tiny_gpt2_tokenizer + # dataset_uri = f'{local_data}/{dataset_uri}' + batch_size = 2 + seqlen = 2048 + # empirical number from the small test dataset + maximum_answer_length = 9 + dl = get_icl_task_dataloader('question_answering', + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Q: ', + continuation_delimiter='\nA:', + destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars) + assert isinstance(dl, DataSpec) + + assert isinstance(dl.dataloader, DataLoader) # pyright + batch = next(dl.dataloader._get_iterator()) + decoded_batch = tokenizer.batch_decode(batch['input_ids']) + import IPython; IPython.embed() + + assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) + assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) + assert batch['mode'] == 'generate' + # the maximum generation length from the small test data + assert batch['generation_length'] == maximum_answer_length + assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) + + decoded_batch = tokenizer.batch_decode(batch['input_ids']) + import IPython; IPython.embed() 
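# --- Illustrative aside (not part of this test): `hf_parsing_vars` maps each
# output column name to a list of source columns whose stringified values get
# concatenated, mirroring the dict comprehension in `_parse_hf_dataset` above.
# The sample row is made up, borrowing the question/answer pair asserted
# further down in this test.
hf_parsing_vars = {'inputs': ['context'], 'outputs': ['answer']}
row = {'context': 'Who was the man behind The Chipmunks?', 'answer': 'David Seville'}
parsed = {k: ''.join(str(row[col]) for col in cols) for k, cols in hf_parsing_vars.items()}
print(parsed)  # {'inputs': 'Who was the man behind The Chipmunks?', 'outputs': 'David Seville'}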
+ assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) + assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) + + if len(prompt_string) > 0: + assert all(item.count('I am a prompt') == 1 for item in decoded_batch) + assert all( + set(found) == set(expected) + for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']])) + assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:') + assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:') From cea8ff8a9b8a6fb7043a24fbaf99c54066641c00 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 10 Nov 2023 20:11:01 +0000 Subject: [PATCH 010/116] fix cot. wip --- .../in_context_learning_evaluation.py | 46 +++++++++---------- .../test_in_context_learning_datasets.py | 13 ++++-- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index c0e8b2eb11..b779753640 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -88,9 +88,9 @@ def _make_padded_input(context_enc, continuation_enc, max_seq_len, pad_tok_id, p def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: int, rng: random.Random): # samples without replacement. if num_fewshot exceeds the number of unique samples, # then we will have fewer than num_fewshot examples in context - - # Simpler implementation (but will choose different actual ids which will break some tests) - # possible_fewshot_idxs = [i for i in range(0, dataset_size) if i != sample_idx] + + # Simpler implementation (but will choose different actual ids which will break some tests) + # possible_fewshot_idxs = [i for i in range(0, dataset_size) if i != sample_idx] # fewshot_idxs = set(rng.sample(possible_fewshot_idxs, num_fewshot)) num_fewshot = min(dataset_size - 1, num_fewshot) fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot)) @@ -337,13 +337,17 @@ class InContextLearningQATaskDataset(InContextLearningDataset): def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter + self.has_cot = False super().__init__(*args, **kwargs) + self.max_answer_length = self.get_max_answer_length() self.dont_split_keys = ['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'] self.normal_split_keys = ['input_ids', 'attention_mask'] self.list_split_keys = ['labels'] def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: + # TODO: I hate that this is here - I really just don't want to ever have a list + self.has_cot = 'chain_of_thought' in dataset.features return list( dataset.map( lambda examples: { @@ -354,18 +358,14 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: })) def get_answer_from_sample(self, sample): - # If we add the answer, we need to also add COT - chain_of_thought = sample.get('chain_of_thought', '') - if len(chain_of_thought) == 0: - cot_delimiter = '' + if self.has_cot: + return f'{sample["chain_of_thought"]}{self.cot_delimiter}{sample[self.answer_key]}' else: - # TODO: cot_delimiter setting is all over the place. 
Need to choose a single way to do it - cot_delimiter = self.cot_delimiter - return f'{chain_of_thought}{cot_delimiter}{sample[self.answer_key]}' + return sample[self.answer_key] def additional_processing_for_example(self, tokenized_example: dict, sample: dict): tokenized_example['aliases'] = list(sample.get('aliases', [])) - tokenized_example['cot_delimiter'] = self.cot_delimiter + # tokenized_example['cot_delimiter'] = self.cot_delimiter return tokenized_example def get_max_answer_length(self): @@ -373,20 +373,16 @@ def get_max_answer_length(self): for sample in self.samples: all_answers = [sample[self.answer_key]] + list(sample.get('aliases', [])) for answer in all_answers: - chain_of_thought = sample.get('chain_of_thought', '') - if len(chain_of_thought) == 0: - cot_delimiter = '' + if self.has_cot: + response = f'{sample["chain_of_thought"]}{self.cot_delimiter}{answer}' else: - # TODO: cot_delimiter setting is all over the place. Need to choose a single way to do it - cot_delimiter = self.cot_delimiter - response = f"{chain_of_thought}{cot_delimiter}{answer}" + response = answer max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length def collate_fn(self, data): inputs, answers = [], [] - cot_delimiter = '' for sample in data: preamble, context, aliases = (sample['preamble'], sample['context'], sample['aliases']) @@ -401,13 +397,13 @@ def collate_fn(self, data): # We will search for the answer within the portion of the model response # beginning with `cot_delimiter` - cot_delimiter = sample['cot_delimiter'] + # cot_delimiter = sample['cot_delimiter'] batch = { 'input_ids': torch.stack(inputs), 'mode': 'generate', 'labels': answers, - 'cot_delimiter': cot_delimiter, + 'cot_delimiter': self.cot_delimiter, 'generation_length': self.max_answer_length, 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, @@ -663,7 +659,7 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: })) def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): - # TODO this is bad + # TODO this is bad context_options = sample['context_options'] gold_idx = sample['gold'] continuation = sample['continuation'] @@ -932,7 +928,7 @@ def build_icl_dataloader( prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' continuation_delimiter: str, # e.g. '' - hf_loading_vars: dict, + hf_loading_vars: dict, hf_parsing_vars: dict, destination_path: str, prelimiter: str, # e.g. 'Question: ' @@ -1084,8 +1080,10 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Di output_files[cat] = cat_dest return output_files + #TODO: Where do we want to set our defaults? 
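# --- Illustrative usage sketch (not part of this patch): one way a caller might
# invoke `get_icl_task_dataloader` for the question-answering task, mirroring the
# new HF-loading test above. The tokenizer, dataset URI, and destination path are
# placeholders for illustration, not proposed defaults.
import transformers
from composer.datasets.in_context_learning_evaluation import get_icl_task_dataloader

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
dl = get_icl_task_dataloader(
    icl_task_type='question_answering',
    dataset_uri='hf://maxisawesome/long_context_eval',  # an 'hf://' URI or a local/remote jsonl
    tokenizer=tokenizer,
    batch_size=2,
    max_seq_len=2048,
    pad_tok_id=tokenizer.eos_token_id,
    num_fewshot=1,
    prompt_string='',
    example_delimiter='\n',
    continuation_delimiter='\nA:',
    destination_path='/tmp/icl_qa.jsonl',
    prelimiter='Q: ',
    hf_loading_vars={'split': 'test', 'name': 'kv_pairs', 'context_length': 2048, 'section': 'middle'},
    hf_parsing_vars={'inputs': ['context'], 'outputs': ['answer']},
)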
+ def get_icl_task_dataloader( icl_task_type: str, dataset_uri: str, @@ -1168,7 +1166,7 @@ def get_icl_task_dataloader( num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter=example_delimiter, - hf_loading_vars=hf_loading_vars, + hf_loading_vars=hf_loading_vars, hf_parsing_vars=hf_parsing_vars, continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', @@ -1190,7 +1188,7 @@ def get_icl_task_dataloader( num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter=example_delimiter, - hf_loading_vars=hf_loading_vars, + hf_loading_vars=hf_loading_vars, hf_parsing_vars=hf_parsing_vars, continuation_delimiter=continuation_delimiter, destination_path=destination_path, diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 52ef2a8459..5be8cc368c 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1526,12 +1526,19 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): assert first_batch_without_last_word.count(' UNIQUE ') == 1 assert second_batch_without_last_word.count(' UNIQUE ') == 1 + @pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/long_context_eval']) @pytest.mark.parametrize('num_fewshot', [0, 1, 2]) @pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) -@pytest.mark.parametrize('hf_loading_vars', [{"split":"test","name":"kv_pairs", "context_length":2048, "section":"middle"}]) -@pytest.mark.parametrize('hf_parsing_vars', [{"inputs":["context"], "outputs":["answer"]}]) -def test_hf_dataloading(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, hf_parsing_vars): +@pytest.mark.parametrize('hf_loading_vars', [{ + 'split': 'test', + 'name': 'kv_pairs', + 'context_length': 2048, + 'section': 'middle' +}]) +@pytest.mark.parametrize('hf_parsing_vars', [{'inputs': ['context'], 'outputs': ['answer']}]) +def test_hf_dataloading(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, + hf_parsing_vars): pytest.importorskip('datasets') # local_data = os.path.join(os.path.dirname(__file__), 'local_data') From ae4c6bca257d20bf6d4a16efb3a61b084d717333 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sat, 11 Nov 2023 01:52:49 +0000 Subject: [PATCH 011/116] del device and world_size from tests --- .../in_context_learning_evaluation.py | 6 +-- .../test_in_context_learning_datasets.py | 40 +++++-------------- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index b779753640..e8e95a6f39 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -183,9 +183,9 @@ def _read_dataset(self, return dataset def _parse_hf_dataset(self, dataset, hf_parsing_vars): - dataset = dataset.map( - lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items()}) - return dataset + return dataset.map( + lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items()} + ) def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: return list( diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 5be8cc368c..5e0039ee06 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ 
b/tests/datasets/test_in_context_learning_datasets.py @@ -990,8 +990,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -@device('gpu') -def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path): +def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1073,10 +1072,8 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -@device('gpu') -@world_size(1, 2) @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model, +def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_model, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger @@ -1126,9 +1123,8 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl']) -@device('gpu') @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model): +def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1172,10 +1168,8 @@ def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenize @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) -@device('gpu') -@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_qa_task_evaluation_opt_tokenizer(device, world_size, num_fewshot, dataset_uri, tmp_path): +def test_qa_task_evaluation_opt_tokenizer(num_fewshot, dataset_uri, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1214,10 +1208,8 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, num_fewshot, datas @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) -@device('gpu') -@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [5]) -def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, num_fewshot, dataset_uri, tmp_path): +def test_qa_task_evaluation_with_cot_opt_tokenizer(num_fewshot, dataset_uri, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1257,10 +1249,8 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, num_fewsh @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) -@device('gpu') -@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_qa_task_evaluation(device, world_size, num_fewshot, 
dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, +def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger @@ -1300,10 +1290,8 @@ def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) -@device('gpu') -@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [5]) -def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, +def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger @@ -1344,11 +1332,9 @@ def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_ur @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@device('gpu') -@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [0]) @pytest.mark.parametrize('generations_per_sample', range(1, 3)) -def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, dataset_uri, tmp_path, +def test_code_eval_microbatching(monkeypatch, num_fewshot, dataset_uri, tmp_path, generations_per_sample): pytest.importorskip('datasets') monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') @@ -1394,11 +1380,9 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, d @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@device('gpu') -@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [0]) @pytest.mark.parametrize('generations_per_sample', range(1, 3)) -def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer, +def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_t5_tokenizer, tiny_t5_model, tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() @@ -1441,12 +1425,10 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_few @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) -@device('gpu') -@world_size(1, 2) @pytest.mark.parametrize('num_fewshot', [0, 2]) @pytest.mark.parametrize('generations_per_sample', [1]) @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') -def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, +def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() @@ -1567,8 +1549,6 @@ def test_hf_dataloading(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, assert isinstance(dl.dataloader, DataLoader) # pyright batch = next(dl.dataloader._get_iterator()) - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - import IPython; IPython.embed() assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) From 76a3f336026a6b47c0dcf8d11d04735d3e890243 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sat, 11 Nov 2023 02:23:57 +0000 Subject: [PATCH 012/116] change to .map --- .../in_context_learning_evaluation.py | 541 +++++++++++------- 1 file changed, 332 
insertions(+), 209 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index e8e95a6f39..ac20439bf3 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -33,8 +33,10 @@ ] -def strip_data(samples): - return [{k: v.strip() if isinstance(v, str) else v for k, v in entry.items()} for entry in samples] +# def strip_data(samples): +# return [{k: v.strip() if isinstance(v, str) else v for k, v in entry.items()} for entry in samples] +def strip_data(sample): + return {k: v.strip() if isinstance(v, str) else v for k, v in sample.items()} def _tokenizer_needs_prefix_space(tokenizer) -> bool: @@ -142,13 +144,22 @@ def __init__( self.context_key = context_key self.answer_key = answer_key - self.samples = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) + self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) self.strip_data = strip_dataset if self.strip_data: - self.samples = strip_data(self.samples) + self.dataset = self.dataset.map(strip_data) fewshot_rng = random.Random(fewshot_random_seed) - self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, fewshot_rng) + self.encoded_dataset = self.dataset.map( + self._prep_example, + with_indices=True, + fn_kwargs={ + 'num_fewshot': num_fewshot, + 'prompt_string': prompt_string, + 'fewshot_rng': fewshot_rng, + }, + ) + # self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, fewshot_rng) def __getitem__(self, index: int): return self.encoded_dataset[index] @@ -159,17 +170,21 @@ def __len__(self): def get_num_samples_in_batch(self, batch: dict) -> int: return batch['input_ids'].shape[0] - def _read_dataset(self, - dataset_uri: str, - destination_path: str, - hf_loading_vars: dict = None, - hf_parsing_vars: dict = None): + def _read_dataset( + self, + dataset_uri: str, + destination_path: str, + hf_loading_vars: dict = None, + hf_parsing_vars: dict = None, + ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge') from e + raise MissingConditionalImportError( + extra_deps_group='nlp', + conda_package='datasets', + conda_channel='conda-forge', + ) from e if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) @@ -184,18 +199,21 @@ def _read_dataset(self, def _parse_hf_dataset(self, dataset, hf_parsing_vars): return dataset.map( - lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items()} - ) + lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items()}) def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return list( - dataset.map(lambda examples: { - self.context_key: examples['context'], - self.answer_key: examples['answer'], - })) - - def generate_few_shot_text(self, num_fewshot: int, sample_idx: int, preamble: str, - fewshot_rng: random.Random) -> str: + return dataset.map(lambda examples: { + self.context_key: examples['context'], + self.answer_key: examples['answer'], + }) + + def generate_few_shot_text( + self, + num_fewshot: int, + sample_idx: int, + preamble: str, + fewshot_rng: random.Random, + ) -> str: """Formats the prompt fewshot examples for test 
sample `sample_idx`. Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format @@ -208,9 +226,9 @@ def generate_few_shot_text(self, num_fewshot: int, sample_idx: int, preamble: st few_shot_text = preamble if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.samples), num_fewshot, sample_idx, fewshot_rng) + fewshot_idxs = _get_fewshot_sample_idxs(len(self.dataset), num_fewshot, sample_idx, fewshot_rng) for fewshot_idx in fewshot_idxs: - ctxt = self.construct_context(self.samples[fewshot_idx], few_shot_text, add_answer=True) + ctxt = self.construct_context(self.dataset[fewshot_idx], few_shot_text, add_answer=True) few_shot_text += ctxt return few_shot_text @@ -248,7 +266,14 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str): tokenized_example[self.context_key] = self.tokenizer(ctxt, add_special_tokens=False) return tokenized_example - def _prep_examples(self, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random) -> List[Dict[str, Any]]: + def _prep_example( + self, + example, + example_idx: int, + num_fewshot: int, + prompt_string: str, + fewshot_rng: random.Random, + ) -> List[Dict[str, Any]]: """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. Each task consists of a context and a continuation as well as an optional prompt and optional list of @@ -266,31 +291,50 @@ def _prep_examples(self, num_fewshot: int, prompt_string: str, fewshot_rng: rand Returns: dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) """ - examples = [] - for sample_idx in tqdm(range(len(self.samples))): - prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, sample_idx, prompt_string, fewshot_rng) - ctxt = self.construct_context(self.samples[sample_idx], prompt_and_fewshot, add_answer=False) - tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt) - tokenized_example = self.additional_processing_for_example(tokenized_example, self.samples[sample_idx]) - examples.append(tokenized_example) - return examples + prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) + ctxt = self.construct_context(example, prompt_and_fewshot, add_answer=False) + tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt) + tokenized_example = self.additional_processing_for_example(tokenized_example, example) + return tokenized_example def additional_processing_for_example(self, tokenized_example: dict, sample: dict): return tokenized_example def collate_fn(self, data): - pass + batch = self.default_batch + batch = { + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [], + } + for data_pair in data: + preamble, context, continuation = ( + data_pair['preamble'], + data_pair['context'], + data_pair['continuation'], + ) + + context_enc = preamble['input_ids'] + context['input_ids'] + continuation_enc = continuation['input_ids'] + + inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, + self.pad_tok_id) + + batch['input_ids'].append(inp) + batch['continuate_indicies'].append(continuation_span) + batch['labels'].append(inp) + + for key in self.stack_keys: + batch[key] = torch.stack(batch[key]) + + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + return batch def split_batch(self, batch: Any, microbatch_size: int): # Don't split kwargs that don't change # Normally split torch tensors # List split 
lists of strings - # no_split = ['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'] - # normal_split = ['input_ids', 'attention_mask'] - # list_split = [ - # 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', - # 'languages' - # ] chunked = {} for k, v in batch.items(): if k in self.dont_split_keys: @@ -339,23 +383,27 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False super().__init__(*args, **kwargs) - + self.max_answer_length = self.get_max_answer_length() - self.dont_split_keys = ['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'] + self.dont_split_keys = [ + 'mode', + 'generation_length', + 'generation_kwargs', + 'cot_delimiter', + ] self.normal_split_keys = ['input_ids', 'attention_mask'] self.list_split_keys = ['labels'] def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: # TODO: I hate that this is here - I really just don't want to ever have a list self.has_cot = 'chain_of_thought' in dataset.features - return list( - dataset.map( - lambda examples: { - 'context': examples['context'], - 'answer': examples['answer'], - 'aliases': set([examples['answer']] + examples.get('aliases', [])), - 'chain_of_thought': examples.get('chain_of_thought', '') - })) + return dataset.map( + lambda examples: { + 'context': examples['context'], + 'answer': examples['answer'], + 'aliases': set([examples['answer']] + examples.get('aliases', [])), + 'chain_of_thought': examples.get('chain_of_thought', ''), + }) def get_answer_from_sample(self, sample): if self.has_cot: @@ -370,11 +418,11 @@ def additional_processing_for_example(self, tokenized_example: dict, sample: dic def get_max_answer_length(self): max_answer_length = 0 - for sample in self.samples: + for sample in self.dataset: all_answers = [sample[self.answer_key]] + list(sample.get('aliases', [])) for answer in all_answers: if self.has_cot: - response = f'{sample["chain_of_thought"]}{self.cot_delimiter}{answer}' + response = (f'{sample["chain_of_thought"]}{self.cot_delimiter}{answer}') else: response = answer max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) @@ -385,12 +433,19 @@ def collate_fn(self, data): inputs, answers = [], [] for sample in data: - preamble, context, aliases = (sample['preamble'], sample['context'], sample['aliases']) + preamble, context, aliases = ( + sample['preamble'], + sample['context'], + sample['aliases'], + ) context_enc = preamble['input_ids'] + context['input_ids'] - inp, _ = _make_padded_input(context_enc, [], - self.max_seq_len - self.max_answer_length, - self.pad_tok_id, - padding_side=self.padding_side) + inp, _ = _make_padded_input( + context_enc, + [], + self.max_seq_len - self.max_answer_length, + self.pad_tok_id, + padding_side=self.padding_side, + ) inputs.append(inp) answers.append(aliases) @@ -408,7 +463,7 @@ def collate_fn(self, data): 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, 'use_cache': True - } + }, } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) @@ -438,11 +493,10 @@ def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', *args, **kwargs) def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return list( - dataset.map(lambda examples: { - 'continuation': examples['continuation'], - 'context': examples['context'], - })) + return dataset.map(lambda examples: { + 'continuation': examples['continuation'], + 'context': 
examples['context'], + }) def additional_processing_for_example(self, tokenized_example: dict, sample: dict): cont = sample['continuation'] @@ -455,7 +509,11 @@ def collate_fn(self, data): inputs = [] continuation_indices = [] for data_pair in data: - preamble, context, continuation = (data_pair['preamble'], data_pair['context'], data_pair['continuation']) + preamble, context, continuation = ( + data_pair['preamble'], + data_pair['context'], + data_pair['continuation'], + ) context_enc = preamble['input_ids'] + context['input_ids'] continuation_enc = continuation['input_ids'] @@ -511,19 +569,18 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): def __init__(self, choices_key: str = 'choices', *args, **kwargs): super().__init__(context_key='query', *args, **kwargs) - self.num_choices = len(self.samples[0][choices_key]) + self.num_choices = len(self.dataset[0][choices_key]) self.dont_split_keys = ['mode'] self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] self.normal_split_keys = ['gold_indices'] def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return list( - dataset.map(lambda examples: { - 'query': examples['query'], - 'choices': examples['choices'], - 'gold': examples['gold'] - })) + return dataset.map(lambda examples: { + 'query': examples['query'], + 'choices': examples['choices'], + 'gold': examples['gold'], + }) def get_answer_from_sample(self, sample: dict): choices = sample['choices'] @@ -544,10 +601,13 @@ def collate_fn(self, data): gold_idxs = [] choice_groupings = [] for data_pair in data: - choice_start_idx = len(continuation_indices) - preamble, context, choices, gold_idx = (data_pair['preamble'], data_pair['query'], data_pair['choices'], - data_pair['gold']) + preamble, context, choices, gold_idx = ( + data_pair['preamble'], + data_pair['query'], + data_pair['choices'], + data_pair['gold'], + ) for choice in choices: context_enc = preamble['input_ids'] + context['input_ids'] @@ -575,7 +635,7 @@ def collate_fn(self, data): 'mode': 'icl_task', 'labels': torch.stack(inputs), 'gold_indices': gold_idxs, - 'choice_groupings': choice_groupings + 'choice_groupings': choice_groupings, } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -650,13 +710,12 @@ def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, *args, **kwargs) def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return list( - dataset.map( - lambda examples: { - 'context_options': examples['context_options'], - 'continuation': examples['continuation'], - 'gold': examples['gold'] - })) + return dataset.map( + lambda examples: { + 'context_options': examples['context_options'], + 'continuation': examples['continuation'], + 'gold': examples['gold'], + }) def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): # TODO this is bad @@ -725,7 +784,7 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str]): def additional_processing_for_example(self, tokenized_example: dict, sample: dict): continuation = sample['continuation'] if self.prefix_space: - continuation = f' {continuation}' if not continuation.startswith(' ') else continuation + continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation) tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False) tokenized_example['gold'] = sample['gold'] return tokenized_example @@ -737,8 +796,12 @@ def 
collate_fn(self, data): choice_groupings = [] for data_pair in data: continuation_start_idx = len(continuation_indices) - preamble, context_options, continuation, gold_idx = (data_pair['preamble'], data_pair['context_options'], - data_pair['continuation'], data_pair['gold']) + preamble, context_options, continuation, gold_idx = ( + data_pair['preamble'], + data_pair['context_options'], + data_pair['continuation'], + data_pair['gold'], + ) for ctxt in context_options: context_enc = preamble['input_ids'] + ctxt['input_ids'] @@ -766,14 +829,14 @@ def collate_fn(self, data): 'mode': 'icl_task', 'labels': torch.stack(inputs), 'gold_indices': gold_idxs, - 'choice_groupings': choice_groupings + 'choice_groupings': choice_groupings, } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch class InContextLearningCodeEvalDataset(InContextLearningDataset): - """ A dataset that constructs batches for in-context learning code evaluation + """A dataset that constructs batches for in-context learning code evaluation The input format is expected to be a jsonl file with the following fields: - task_id: label of given task @@ -803,52 +866,72 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): top_k: top_k sampling parameter for number of samples to consider """ - def __init__(self, - generations_per_sample: int, - pass_at_k: int = 1, - top_p: Optional[float] = 0.95, - top_k: Optional[int] = 40, - *args, - **kwargs): + def __init__( + self, + generations_per_sample: int, + pass_at_k: int = 1, + top_p: Optional[float] = 0.95, + top_k: Optional[int] = 40, + *args, + **kwargs, + ): if generations_per_sample < pass_at_k: raise ValueError( f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' 
) - super().__init__(context_key='prompt', answer_key='canonical_solution', strip_dataset=False, *args, **kwargs) + super().__init__( + context_key='prompt', + answer_key='canonical_solution', + strip_dataset=False, + *args, + **kwargs, + ) self.pass_at_k = pass_at_k self.generations_per_sample = generations_per_sample self.max_prompt_length = self.get_max_prompt_length() self.top_p = top_p self.top_k = top_k - self.dont_split_keys = ['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'] + self.dont_split_keys = [ + 'mode', + 'generation_length', + 'pass_at_k', + 'generation_kwargs', + ] self.normal_split_keys = ['input_ids', 'attention_mask'] self.list_split_keys = [ - 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', - 'languages' + 'labels', + 'tests', + 'canonical_solutions', + 'entry_points', + 'test_inputs', + 'test_outputs', + 'prompts', + 'languages', ] def get_max_prompt_length(self): max_prompt_length = 0 for sample in self.encoded_dataset: - max_prompt_length = max(max_prompt_length, - len(sample['preamble']['input_ids'] + sample['prompt']['input_ids'])) + max_prompt_length = max( + max_prompt_length, + len(sample['preamble']['input_ids'] + sample['prompt']['input_ids']), + ) return max_prompt_length def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return list( - dataset.map( - lambda examples: { - 'task_id': examples['task_id'], - 'prompt': examples['prompt'], - 'canonical_solution': examples['canonical_solution'], - 'test': examples['test'], - 'entry_point': examples['entry_point'], - 'test_inputs': examples['test_inputs'], - 'test_outputs': examples['test_outputs'], - 'language': examples['language'], - })) + return dataset.map( + lambda examples: { + 'task_id': examples['task_id'], + 'prompt': examples['prompt'], + 'canonical_solution': examples['canonical_solution'], + 'test': examples['test'], + 'entry_point': examples['entry_point'], + 'test_inputs': examples['test_inputs'], + 'test_outputs': examples['test_outputs'], + 'language': examples['language'], + }) def additional_processing_for_example(self, tokenized_example: dict, sample: dict): tokenized_example['prompt_text'] = sample['prompt'] @@ -862,9 +945,28 @@ def additional_processing_for_example(self, tokenized_example: dict, sample: dic return tokenized_example def collate_fn(self, data): - inputs, prompts, tests, canonical_solutions, entry_points, test_inputs, test_outputs, languages = [], [], [], [], [], [], [], [] + ( + inputs, + prompts, + tests, + canonical_solutions, + entry_points, + test_inputs, + test_outputs, + languages, + ) = ([], [], [], [], [], [], [], []) for sample in data: - preamble, prompt, text_prompt, canonical_solution, test, entry_point, test_input, test_output, language = ( + ( + preamble, + prompt, + text_prompt, + canonical_solution, + test, + entry_point, + test_input, + test_output, + language, + ) = ( sample['preamble'], sample['prompt'], sample['prompt_text'], @@ -876,10 +978,13 @@ def collate_fn(self, data): sample['language'], ) context_enc = preamble['input_ids'] + prompt['input_ids'] - inp, _ = _make_padded_input(context_enc, [], - self.max_prompt_length, - self.pad_tok_id, - padding_side=self.padding_side) + inp, _ = _make_padded_input( + context_enc, + [], + self.max_prompt_length, + self.pad_tok_id, + padding_side=self.padding_side, + ) inputs.append(inp) tests.append(test) @@ -911,7 +1016,7 @@ def collate_fn(self, data): 'top_p': self.top_p, 'top_k': self.top_k, 'use_cache': True, - } + }, } 
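# --- Illustrative aside (not part of this patch): the attention mask assigned on
# the next line is derived purely from the left-padded input ids -- every position
# equal to `pad_tok_id` is masked out. A minimal standalone demonstration with
# made-up token ids:
import torch

pad_tok_id = 50256  # e.g. GPT-2's EOS id, often reused as the pad token
input_ids = torch.tensor([[pad_tok_id, pad_tok_id, 11, 22],
                          [pad_tok_id, 5, 7, 9]])
attention_mask = ~(input_ids == pad_tok_id)
# -> tensor([[False, False,  True,  True],
#            [False,  True,  True,  True]])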
batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -938,81 +1043,91 @@ def build_icl_dataloader( generations_per_sample: int, ) -> DataSpec: if icl_task_type == 'multiple_choice': - dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars) + dataset = InContextLearningMultipleChoiceTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'schema': - dataset = InContextLearningSchemaTaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars) + dataset = InContextLearningSchemaTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'language_modeling': - dataset = InContextLearningLMTaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars) + dataset = InContextLearningLMTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + ) effective_batchsize = batch_size elif icl_task_type == 'question_answering': - dataset = InContextLearningQATaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - 
destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - cot_delimiter=cot_delimiter) + dataset = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + cot_delimiter=cot_delimiter, + ) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': - dataset = InContextLearningCodeEvalDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample) + dataset = InContextLearningCodeEvalDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + ) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1022,7 +1137,12 @@ def build_icl_dataloader( split_batch = None if isinstance( dataset, - (InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset, InContextLearningCodeEvalDataset)): + ( + InContextLearningMultipleChoiceTaskDataset, + InContextLearningQATaskDataset, + InContextLearningCodeEvalDataset, + ), + ): split_batch = dataset.split_batch return DataSpec( @@ -1054,9 +1174,11 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Di try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge') from e + raise MissingConditionalImportError( + extra_deps_group='nlp', + conda_package='datasets', + conda_channel='conda-forge', + ) from e with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) @@ -1081,29 +1203,30 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Di return output_files -#TODO: Where do we want to set our defaults? +# TODO: Where do we want to set our defaults? def get_icl_task_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. 
'\n' - hf_loading_vars: dict = {}, - hf_parsing_vars: dict = {}, - continuation_delimiter: str = '', - destination_path: str = '', - prelimiter: str = '', # e.g. 'Question: ' - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, - cot_delimiter: str = '', - has_categories: bool = False) -> Union[DataSpec, Dict[str, DataSpec]]: + icl_task_type: str, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + hf_loading_vars: dict = {}, + hf_parsing_vars: dict = {}, + continuation_delimiter: str = '', + destination_path: str = '', + prelimiter: str = '', # e.g. 'Question: ' + fewshot_random_seed: int = 1234, + pass_at_k: int = 1, + generations_per_sample: int = 1, + cot_delimiter: str = '', + has_categories: bool = False, +) -> Union[DataSpec, Dict[str, DataSpec]]: """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: >>> dl = get_icl_task_dataloader( From 0090b829c06f24c39c0a7859ebb9b8c3c160bda4 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sat, 11 Nov 2023 03:25:10 +0000 Subject: [PATCH 013/116] fix schema --- .../in_context_learning_evaluation.py | 59 ++++--------------- 1 file changed, 13 insertions(+), 46 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index ac20439bf3..366600a43e 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -159,7 +159,6 @@ def __init__( 'fewshot_rng': fewshot_rng, }, ) - # self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, fewshot_rng) def __getitem__(self, index: int): return self.encoded_dataset[index] @@ -185,6 +184,7 @@ def _read_dataset( conda_package='datasets', conda_channel='conda-forge', ) from e + # TODO: this feels bad as well if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) @@ -718,7 +718,6 @@ def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: }) def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): - # TODO this is bad context_options = sample['context_options'] gold_idx = sample['gold'] continuation = sample['continuation'] @@ -728,50 +727,18 @@ def construct_context(self, sample, preceding_text: str = '', add_answer: bool = if len(preceding_text) > 0: context = f'{self.example_delimiter}{context}' context = f'{context}{self.continuation_delimiter}{continuation}' - # else: - # context_options = sample['context_options'] - # if len(preceding_text) > 0: - # context_options = [f'{self.example_delimiter}{c}{self.continuation_delimiter}' for c in context_options] - - return context - - def _prep_examples(self, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random): - """Prepares a set of schema questions into tokenized format with prompt and few shot examples. - Each question consists of a set of possible contexts followed by a continuation, only one of the contexts would logically permit the continuation. 
- At inference time we construct individual inference examples consisting of a single context option + the continuation, - as well as an optional (prompt) and optional list of example correct context option + continuations, which precede the test context option + continuation. - For schema, this method provides information relaying which of the answer choices is the correct one. This - information is used for computing accuracy metrics. - Args: - num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair - prompt_string (str): The prompt to prepend to all inputs - example_delimiter (str): The delimiter used to separate each example query/answer pair - continuation_delimiter (str): The delimiter used to separate each query from its answer - fewshot_rng (random.Random): Random number generator used to select fewshot examples - Returns: - dict: Contains the query, the list of encoded potential answer choices, the preamble (prompt + fewshot examples), and - the index of the correct answer choice. - """ - - examples = [] - for sample_idx in tqdm(range(len(self.samples))): - prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, sample_idx, prompt_string, fewshot_rng) - # This is different bcus the context has multiple options for scheme problems - ctxt_options = self.construct_context_options(self.samples[sample_idx], prompt_and_fewshot) - tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt_options) - tokenized_example = self.additional_processing_for_example(tokenized_example, self.samples[sample_idx]) - examples.append(tokenized_example) - return examples - - def construct_context_options(self, sample, preceding_text): - context_options = sample['context_options'] - if len(preceding_text) > 0: - if self.strip_data: - cont_del = self.continuation_delimiter.rstrip() - else: - cont_del = self.continuation_delimiter - context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] - return context_options + return context + else: + # TODO: This is a kinda code-smelly bcus we return two different types + # depending on the situation (a string if we hav add_answer=True or a + # list of strings if add_answer=False) + if len(preceding_text) > 0: + if self.strip_data: + cont_del = self.continuation_delimiter.rstrip() + else: + cont_del = self.continuation_delimiter + context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] + return context_options def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str]): tokenized_example = {} From ea033808fd9b13ab0c65223d7d181ac3d6119456 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 13 Nov 2023 08:06:17 +0000 Subject: [PATCH 014/116] tests passing w/ collate refactor --- .../in_context_learning_evaluation.py | 383 +++++++----------- .../test_in_context_learning_datasets.py | 121 +++--- 2 files changed, 199 insertions(+), 305 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 366600a43e..130097ba72 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -129,6 +129,7 @@ def __init__( context_key: str = 'context', answer_key: str = 'answer', prelimiter: str = '', + stacked_keys: List[str] = ['input_ids', 'labels'] ): self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -143,6 +144,7 @@ def __init__( self.continuation_delimiter = 
continuation_delimiter self.context_key = context_key self.answer_key = answer_key + self.stacked_keys = stacked_keys self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) self.strip_data = strip_dataset @@ -194,18 +196,12 @@ def _read_dataset( if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - dataset = self._parse_dataset(dataset) return dataset def _parse_hf_dataset(self, dataset, hf_parsing_vars): return dataset.map( lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items()}) - def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return dataset.map(lambda examples: { - self.context_key: examples['context'], - self.answer_key: examples['answer'], - }) def generate_few_shot_text( self, @@ -254,7 +250,7 @@ def fix_eos_on_preamble(self, preamble: dict): preamble['input_ids'] = preamble['input_ids'][:-1] return preamble - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str): + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self.fix_eos_on_preamble(preamble) @@ -293,15 +289,11 @@ def _prep_example( """ prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self.construct_context(example, prompt_and_fewshot, add_answer=False) - tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt) - tokenized_example = self.additional_processing_for_example(tokenized_example, example) - return tokenized_example - - def additional_processing_for_example(self, tokenized_example: dict, sample: dict): + tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example def collate_fn(self, data): - batch = self.default_batch + # batch = self.default_batch batch = { 'input_ids': [], 'continuation_indices': [], @@ -309,25 +301,20 @@ def collate_fn(self, data): 'labels': [], } for data_pair in data: - preamble, context, continuation = ( - data_pair['preamble'], - data_pair['context'], - data_pair['continuation'], - ) + context_enc = data_pair['preamble']['input_ids'] + data_pair[self.context_key]['input_ids'] - context_enc = preamble['input_ids'] + context['input_ids'] - continuation_enc = continuation['input_ids'] - - inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, - self.pad_tok_id) + inp, continuation_span = _make_padded_input( + context_enc, + data_pair['continuation']['input_ids'], + self.max_seq_len, + self.pad_tok_id + ) batch['input_ids'].append(inp) batch['continuate_indicies'].append(continuation_span) batch['labels'].append(inp) - for key in self.stack_keys: - batch[key] = torch.stack(batch[key]) - + batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -382,7 +369,7 @@ class InContextLearningQATaskDataset(InContextLearningDataset): def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False - super().__init__(*args, **kwargs) + super().__init__(stacked_keys=['input_ids'], *args, **kwargs) self.max_answer_length = self.get_max_answer_length() self.dont_split_keys = [ @@ -394,8 +381,8 @@ def __init__(self, cot_delimiter: str = '', *args, 
**kwargs): self.normal_split_keys = ['input_ids', 'attention_mask'] self.list_split_keys = ['labels'] - def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - # TODO: I hate that this is here - I really just don't want to ever have a list + def _read_dataset(self, dataset_uri: str, destination_path: str, hf_loading_vars: dict = None, hf_parsing_vars: dict = None): + dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) self.has_cot = 'chain_of_thought' in dataset.features return dataset.map( lambda examples: { @@ -411,9 +398,9 @@ def get_answer_from_sample(self, sample): else: return sample[self.answer_key] - def additional_processing_for_example(self, tokenized_example: dict, sample: dict): - tokenized_example['aliases'] = list(sample.get('aliases', [])) - # tokenized_example['cot_delimiter'] = self.cot_delimiter + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example['aliases'] = list(example.get('aliases', [])) return tokenized_example def get_max_answer_length(self): @@ -430,34 +417,10 @@ def get_max_answer_length(self): return max_answer_length def collate_fn(self, data): - inputs, answers = [], [] - - for sample in data: - preamble, context, aliases = ( - sample['preamble'], - sample['context'], - sample['aliases'], - ) - context_enc = preamble['input_ids'] + context['input_ids'] - inp, _ = _make_padded_input( - context_enc, - [], - self.max_seq_len - self.max_answer_length, - self.pad_tok_id, - padding_side=self.padding_side, - ) - - inputs.append(inp) - answers.append(aliases) - - # We will search for the answer within the portion of the model response - # beginning with `cot_delimiter` - # cot_delimiter = sample['cot_delimiter'] - batch = { - 'input_ids': torch.stack(inputs), + 'input_ids': [], 'mode': 'generate', - 'labels': answers, + 'labels': [], 'cot_delimiter': self.cot_delimiter, 'generation_length': self.max_answer_length, 'generation_kwargs': { @@ -465,7 +428,21 @@ def collate_fn(self, data): 'use_cache': True }, } + for sample in data: + aliases = sample['aliases'] + context_enc = sample['preamble']['input_ids'] + sample[self.context_key]['input_ids'] + inp, _ = _make_padded_input( + context_enc, + [], + self.max_seq_len - self.max_answer_length, + self.pad_tok_id, + padding_side=self.padding_side, + ) + batch['input_ids'].append(inp) + batch['labels'].append(aliases) + + batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -492,45 +469,33 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', *args, **kwargs) - def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return dataset.map(lambda examples: { - 'continuation': examples['continuation'], - 'context': examples['context'], - }) - - def additional_processing_for_example(self, tokenized_example: dict, sample: dict): - cont = sample['continuation'] + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + cont = example['continuation'] if self.prefix_space and not cont.startswith(' '): cont = f' {cont}' tokenized_example['continuation'] = self.tokenizer(cont, add_special_tokens=False) return tokenized_example 
def collate_fn(self, data): - inputs = [] - continuation_indices = [] + batch = { + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [] + } for data_pair in data: - preamble, context, continuation = ( - data_pair['preamble'], - data_pair['context'], - data_pair['continuation'], - ) - - context_enc = preamble['input_ids'] + context['input_ids'] - continuation_enc = continuation['input_ids'] + context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] + continuation_enc = data_pair['continuation']['input_ids'] inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, self.pad_tok_id) + batch['input_ids'].append(inp) + batch['continuation_indices'].append(continuation_span) + batch['labels'].append(inp) - inputs.append(inp) - continuation_indices.append(continuation_span) - - batch = { - 'input_ids': torch.stack(inputs), - 'continuation_indices': continuation_indices, - 'mode': 'icl_task', - 'labels': torch.stack(inputs), - } + batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -575,52 +540,46 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] self.normal_split_keys = ['gold_indices'] - def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return dataset.map(lambda examples: { - 'query': examples['query'], - 'choices': examples['choices'], - 'gold': examples['gold'], - }) - def get_answer_from_sample(self, sample: dict): choices = sample['choices'] gold_idx = sample['gold'] return choices[gold_idx] - def additional_processing_for_example(self, tokenized_example: dict, sample: dict): - choices = sample['choices'] + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + choices = example['choices'] if self.prefix_space: choices = [(f' {choice}' if not choice.startswith(' ') else choice) for choice in choices] tokenized_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices] - tokenized_example['gold'] = sample['gold'] + tokenized_example['gold'] = example['gold'] return tokenized_example def collate_fn(self, data): - inputs = [] - continuation_indices = [] - gold_idxs = [] - choice_groupings = [] + batch = { + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [], + 'gold_indices': [], + 'choice_groupings': [], + } for data_pair in data: - choice_start_idx = len(continuation_indices) - preamble, context, choices, gold_idx = ( - data_pair['preamble'], - data_pair['query'], - data_pair['choices'], - data_pair['gold'], - ) + # TODO: this line is sus idgi + choice_start_idx = len(batch['continuation_indices']) - for choice in choices: - context_enc = preamble['input_ids'] + context['input_ids'] + for choice in data_pair['choices']: + context_enc = data_pair['preamble']['input_ids'] + data_pair[self.context_key]['input_ids'] continuation_enc = choice['input_ids'] inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, self.pad_tok_id) - inputs.append(inp) - continuation_indices.append(continuation_span) + batch['input_ids'].append(inp) + batch['continuation_indices'].append(continuation_span) + batch['labels'].append(inp) - gold_idxs.append(gold_idx) - choice_end_idx = 
len(continuation_indices) - choice_groupings.append((choice_start_idx, choice_end_idx)) + batch['gold_indices'].append(data_pair['gold']) + choice_end_idx = len(batch['continuation_indices']) + batch['choice_groupings'].append((choice_start_idx, choice_end_idx)) # We run each distinct query + answer choice through the model separately and determine which # answer has the lowest per-token-perplexity. @@ -629,14 +588,7 @@ def collate_fn(self, data): # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. - batch = { - 'input_ids': torch.stack(inputs), - 'continuation_indices': continuation_indices, - 'mode': 'icl_task', - 'labels': torch.stack(inputs), - 'gold_indices': gold_idxs, - 'choice_groupings': choice_groupings, - } + batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -709,14 +661,6 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, *args, **kwargs) - def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return dataset.map( - lambda examples: { - 'context_options': examples['context_options'], - 'continuation': examples['continuation'], - 'gold': examples['gold'], - }) - def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): context_options = sample['context_options'] gold_idx = sample['gold'] @@ -740,48 +684,46 @@ def construct_context(self, sample, preceding_text: str = '', add_answer: bool = context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] return context_options - def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str]): + def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example:dict): tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self.fix_eos_on_preamble(preamble) tokenized_example['preamble'] = preamble tokenized_example['context_options'] = [self.tokenizer(c, add_special_tokens=False) for c in context_options] - return tokenized_example - - def additional_processing_for_example(self, tokenized_example: dict, sample: dict): - continuation = sample['continuation'] + continuation = example['continuation'] if self.prefix_space: continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation) tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False) - tokenized_example['gold'] = sample['gold'] + tokenized_example['gold'] = example['gold'] return tokenized_example + def collate_fn(self, data): - inputs = [] - continuation_indices = [] - gold_idxs = [] - choice_groupings = [] + batch = { + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [], + 'gold_indices': [], + 'choice_groupings': [], + } for data_pair in data: - continuation_start_idx = len(continuation_indices) - preamble, context_options, continuation, gold_idx = ( - data_pair['preamble'], - data_pair['context_options'], - data_pair['continuation'], - data_pair['gold'], - ) + continuation_start_idx = len(batch['continuation_indices']) + context_options = data_pair['context_options'] - for ctxt in 
context_options: - context_enc = preamble['input_ids'] + ctxt['input_ids'] - continuation_enc = continuation['input_ids'] + for context in context_options: + context_enc = data_pair['preamble']['input_ids'] + context['input_ids'] + continuation_enc = data_pair['continuation']['input_ids'] inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, self.pad_tok_id) - inputs.append(inp) - continuation_indices.append(continuation_span) + batch['input_ids'].append(inp) + batch['labels'].append(inp) + batch['continuation_indices'].append(continuation_span) - gold_idxs.append(gold_idx) - continuation_end_idx = len(continuation_indices) - choice_groupings.append((continuation_start_idx, continuation_end_idx)) + batch['gold_indices'].append(data_pair['gold']) + continuation_end_idx = len(batch['continuation_indices']) + batch['choice_groupings'].append((continuation_start_idx, continuation_end_idx)) # We run each distinct query + answer choice through the model separately and determine which # answer has the lowest per-token-perplexity. @@ -790,14 +732,7 @@ def collate_fn(self, data): # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. - batch = { - 'input_ids': torch.stack(inputs), - 'continuation_indices': continuation_indices, - 'mode': 'icl_task', - 'labels': torch.stack(inputs), - 'gold_indices': gold_idxs, - 'choice_groupings': choice_groupings, - } + batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -851,6 +786,7 @@ def __init__( context_key='prompt', answer_key='canonical_solution', strip_dataset=False, + stacked_keys=['input_ids'], *args, **kwargs, ) @@ -887,92 +823,30 @@ def get_max_prompt_length(self): ) return max_prompt_length - def _parse_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: - return dataset.map( - lambda examples: { - 'task_id': examples['task_id'], - 'prompt': examples['prompt'], - 'canonical_solution': examples['canonical_solution'], - 'test': examples['test'], - 'entry_point': examples['entry_point'], - 'test_inputs': examples['test_inputs'], - 'test_outputs': examples['test_outputs'], - 'language': examples['language'], - }) - - def additional_processing_for_example(self, tokenized_example: dict, sample: dict): - tokenized_example['prompt_text'] = sample['prompt'] - tokenized_example['task_id'] = sample['task_id'] - tokenized_example['canonical_solution'] = sample['canonical_solution'] - tokenized_example['test'] = sample['test'] - tokenized_example['entry_point'] = sample['entry_point'] - tokenized_example['test_inputs'] = sample['test_inputs'] - tokenized_example['test_outputs'] = sample['test_outputs'] - tokenized_example['language'] = sample['language'] + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example['prompt_text'] = example['prompt'] + tokenized_example['task_id'] = example['task_id'] + tokenized_example['canonical_solution'] = example['canonical_solution'] + tokenized_example['test'] = example['test'] + tokenized_example['entry_point'] = example['entry_point'] + tokenized_example['test_inputs'] = example['test_inputs'] + tokenized_example['test_outputs'] = 
example['test_outputs'] + tokenized_example['language'] = example['language'] return tokenized_example def collate_fn(self, data): - ( - inputs, - prompts, - tests, - canonical_solutions, - entry_points, - test_inputs, - test_outputs, - languages, - ) = ([], [], [], [], [], [], [], []) - for sample in data: - ( - preamble, - prompt, - text_prompt, - canonical_solution, - test, - entry_point, - test_input, - test_output, - language, - ) = ( - sample['preamble'], - sample['prompt'], - sample['prompt_text'], - sample['canonical_solution'], - sample['test'], - sample['entry_point'], - sample['test_inputs'], - sample['test_outputs'], - sample['language'], - ) - context_enc = preamble['input_ids'] + prompt['input_ids'] - inp, _ = _make_padded_input( - context_enc, - [], - self.max_prompt_length, - self.pad_tok_id, - padding_side=self.padding_side, - ) - - inputs.append(inp) - tests.append(test) - prompts.append(text_prompt) - canonical_solutions.append(canonical_solution) - entry_points.append(entry_point) - test_inputs.append(test_input) - test_outputs.append(test_output) - languages.append(language) - batch = { - 'input_ids': torch.stack(inputs), + 'input_ids': [], 'mode': 'generate', - 'labels': canonical_solutions, - 'prompts': prompts, # list of prompts - 'tests': tests, # list of tests - 'canonical_solutions': canonical_solutions, # list of solutions - 'entry_points': entry_points, # list of entry points - 'test_inputs': test_inputs, # list of test inputs - 'test_outputs': test_outputs, # list of test outputs - 'languages': languages, # list of languages + 'labels': [], + 'prompts': [], # list of prompts + 'tests': [], # list of tests + 'canonical_solutions': [], # list of solutions + 'entry_points': [], # list of entry points + 'test_inputs': [], # list of test inputs + 'test_outputs': [], # list of test outputs + 'languages': [], # list of languages 'pass_at_k': self.pass_at_k, 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_kwargs': { @@ -985,6 +859,27 @@ def collate_fn(self, data): 'use_cache': True, }, } + for sample in data: + context_enc = sample['preamble']['input_ids'] + sample['prompt']['input_ids'] + inp, _ = _make_padded_input( + context_enc, + [], + self.max_prompt_length, + self.pad_tok_id, + padding_side=self.padding_side, + ) + + batch['input_ids'].append(inp) + batch['canonical_solutions'].append(sample['canonical_solution']) + batch['prompts'].append(sample['prompt_text']) + batch['tests'].append(sample['test']) + batch['labels'].append(sample['canonical_solution']) + batch['entry_points'].append(sample['entry_point']) + batch['test_inputs'].append(sample['test_inputs']) + batch['test_outputs'].append(sample['test_outputs']) + batch['languages'].append(sample['language']) + + batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -1171,8 +1066,6 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Di # TODO: Where do we want to set our defaults? 
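# Illustrative sketch of calling the entry point defined below with the new hf:// dataset
# path. The dataset name, split, and column mapping are placeholders rather than a real
# HF Hub dataset; keyword names follow the signature used in this patch series.
def _example_qa_dataloader(tokenizer):
    # `tokenizer` can be any HF tokenizer with an eos token, e.g. AutoTokenizer.from_pretrained('gpt2').
    return get_icl_task_dataloader(
        'question_answering',
        dataset_uri='hf://some-org/some-qa-eval',  # hypothetical HF Hub dataset
        tokenizer=tokenizer,
        batch_size=4,
        max_seq_len=1024,
        pad_tok_id=tokenizer.eos_token_id,
        num_fewshot=2,
        prompt_string='',
        example_delimiter='\n',
        prelimiter='Q: ',
        continuation_delimiter='\nA:',
        destination_path='/tmp/qa_eval.jsonl',
        hf_loading_vars={'split': 'test'},  # forwarded to datasets.load_dataset
        hf_parsing_vars={'context': ['question'], 'answer': ['answer']},  # dataset column -> field mapping
    )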
- - def get_icl_task_dataloader( icl_task_type: str, dataset_uri: str, diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 5e0039ee06..996741bce3 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1509,63 +1509,64 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): assert second_batch_without_last_word.count(' UNIQUE ') == 1 -@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/long_context_eval']) -@pytest.mark.parametrize('num_fewshot', [0, 1, 2]) -@pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) -@pytest.mark.parametrize('hf_loading_vars', [{ - 'split': 'test', - 'name': 'kv_pairs', - 'context_length': 2048, - 'section': 'middle' -}]) -@pytest.mark.parametrize('hf_parsing_vars', [{'inputs': ['context'], 'outputs': ['answer']}]) -def test_hf_dataloading(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, - hf_parsing_vars): - pytest.importorskip('datasets') - - # local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - tokenizer = tiny_gpt2_tokenizer - # dataset_uri = f'{local_data}/{dataset_uri}' - batch_size = 2 - seqlen = 2048 - # empirical number from the small test dataset - maximum_answer_length = 9 - dl = get_icl_task_dataloader('question_answering', - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Q: ', - continuation_delimiter='\nA:', - destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars) - assert isinstance(dl, DataSpec) - - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == maximum_answer_length - assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) - - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - import IPython; IPython.embed() - assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) - assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) - - if len(prompt_string) > 0: - assert all(item.count('I am a prompt') == 1 for item in decoded_batch) - assert all( - set(found) == set(expected) - for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']])) - assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:') - assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:') +# @pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/long_context_eval']) +# @pytest.mark.parametrize('num_fewshot', [0, 1, 2]) +# @pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) +# @pytest.mark.parametrize('hf_loading_vars', [{ +# 'split': 'test', +# 'name': 'kv_pairs', +# 'context_length': 2048, +# 'section': 'middle' +# }]) +# @pytest.mark.parametrize('hf_parsing_vars', [{'inputs': ['context'], 'outputs': ['answer']}]) +# def test_hf_dataloading(dataset_uri, 
tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, +# hf_parsing_vars): +# pytest.importorskip('datasets') + +# # local_data = os.path.join(os.path.dirname(__file__), 'local_data') + +# tokenizer = tiny_gpt2_tokenizer +# # dataset_uri = f'{local_data}/{dataset_uri}' +# batch_size = 2 +# seqlen = 2048 +# # empirical number from the small test dataset +# maximum_answer_length = 9 +# dl = get_icl_task_dataloader('question_answering', +# dataset_uri=dataset_uri, +# tokenizer=tokenizer, +# batch_size=batch_size, +# max_seq_len=seqlen, +# pad_tok_id=tokenizer.eos_token_id, +# num_fewshot=num_fewshot, +# prompt_string=prompt_string, +# example_delimiter='\n', +# prelimiter='Q: ', +# continuation_delimiter='\nA:', +# destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), +# hf_loading_vars=hf_loading_vars, +# hf_parsing_vars=hf_parsing_vars) +# assert isinstance(dl, DataSpec) + +# assert isinstance(dl.dataloader, DataLoader) # pyright +# batch = next(dl.dataloader._get_iterator()) + +# assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) +# assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) +# assert batch['mode'] == 'generate' +# # the maximum generation length from the small test data +# assert batch['generation_length'] == maximum_answer_length +# assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) + +# decoded_batch = tokenizer.batch_decode(batch['input_ids']) +# import IPython +# IPython.embed() +# assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) +# assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) + +# if len(prompt_string) > 0: +# assert all(item.count('I am a prompt') == 1 for item in decoded_batch) +# assert all( +# set(found) == set(expected) +# for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']])) +# assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:') +# assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:') From bebbbda5dd7076f2afd0714b93641df5b67d4709 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 13 Nov 2023 23:56:10 +0000 Subject: [PATCH 015/116] finish HF tests --- .../in_context_learning_evaluation.py | 246 +++++++++--------- .../test_in_context_learning_datasets.py | 192 ++++++++------ 2 files changed, 235 insertions(+), 203 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 130097ba72..007bfaa289 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -7,7 +7,7 @@ import json import os import random -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import torch import transformers @@ -111,26 +111,25 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in class InContextLearningDataset(Dataset): - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - fewshot_random_seed: int, - strip_dataset: bool = True, - hf_loading_vars: dict = {}, - hf_parsing_vars: dict = {}, - context_key: str = 
'context', - answer_key: str = 'answer', - prelimiter: str = '', - stacked_keys: List[str] = ['input_ids', 'labels'] - ): + def __init__(self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + fewshot_random_seed: int, + strip_dataset: bool = True, + hf_loading_vars: dict = {}, + hf_parsing_vars: dict = {}, + hf_parsing_func: Callable = None, + context_key: str = 'context', + answer_key: str = 'answer', + prelimiter: str = '', + stacked_keys: List[str] = ['input_ids', 'labels']): self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -146,7 +145,14 @@ def __init__( self.answer_key = answer_key self.stacked_keys = stacked_keys - self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) + if hf_parsing_func is not None: + self._parse_hf_dataset = hf_parsing_func + else: + self._parse_hf_dataset = lambda example: { + k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items() + } + + self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars) self.strip_data = strip_dataset if self.strip_data: self.dataset = self.dataset.map(strip_data) @@ -176,7 +182,6 @@ def _read_dataset( dataset_uri: str, destination_path: str, hf_loading_vars: dict = None, - hf_parsing_vars: dict = None, ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] @@ -190,7 +195,7 @@ def _read_dataset( if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) - dataset = self._parse_hf_dataset(dataset, hf_parsing_vars) + dataset = dataset.map(self._parse_hf_dataset) else: with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: @@ -198,11 +203,6 @@ def _read_dataset( dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) return dataset - def _parse_hf_dataset(self, dataset, hf_parsing_vars): - return dataset.map( - lambda example: {k: ''.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items()}) - - def generate_few_shot_text( self, num_fewshot: int, @@ -303,18 +303,14 @@ def collate_fn(self, data): for data_pair in data: context_enc = data_pair['preamble']['input_ids'] + data_pair[self.context_key]['input_ids'] - inp, continuation_span = _make_padded_input( - context_enc, - data_pair['continuation']['input_ids'], - self.max_seq_len, - self.pad_tok_id - ) + inp, continuation_span = _make_padded_input(context_enc, data_pair['continuation']['input_ids'], + self.max_seq_len, self.pad_tok_id) batch['input_ids'].append(inp) batch['continuate_indicies'].append(continuation_span) batch['labels'].append(inp) - batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -381,8 +377,8 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.normal_split_keys = ['input_ids', 'attention_mask'] self.list_split_keys = ['labels'] - def _read_dataset(self, dataset_uri: str, destination_path: str, hf_loading_vars: dict = None, hf_parsing_vars: dict = None): - dataset = 
super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) + def _read_dataset(self, dataset_uri: str, destination_path: str, hf_loading_vars: dict = None): + dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars) self.has_cot = 'chain_of_thought' in dataset.features return dataset.map( lambda examples: { @@ -442,7 +438,7 @@ def collate_fn(self, data): batch['input_ids'].append(inp) batch['labels'].append(aliases) - batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -478,12 +474,7 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): return tokenized_example def collate_fn(self, data): - batch = { - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [] - } + batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} for data_pair in data: context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] continuation_enc = data_pair['continuation']['input_ids'] @@ -494,8 +485,7 @@ def collate_fn(self, data): batch['continuation_indices'].append(continuation_span) batch['labels'].append(inp) - - batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -588,7 +578,7 @@ def collate_fn(self, data): # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. - batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -684,7 +674,7 @@ def construct_context(self, sample, preceding_text: str = '', add_answer: bool = context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] return context_options - def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example:dict): + def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: dict): tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self.fix_eos_on_preamble(preamble) @@ -697,13 +687,12 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], tokenized_example['gold'] = example['gold'] return tokenized_example - def collate_fn(self, data): batch = { - 'input_ids': [], + 'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', - 'labels': [], + 'labels': [], 'gold_indices': [], 'choice_groupings': [], } @@ -732,7 +721,7 @@ def collate_fn(self, data): # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. 
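        # Illustrative sketch of how the choice_groupings / gold_indices bookkeeping built above
        # can be consumed downstream. This is a schematic, not Composer's actual metric code: it
        # assumes a per-token loss tensor of shape (rows, seq_len) computed over the stacked batch.
        #
        #   def pick_choices_by_perplexity(per_token_loss, continuation_indices, choice_groupings):
        #       preds = []
        #       for start, end in choice_groupings:               # rows belonging to one question
        #           row_losses = []
        #           for row in range(start, end):
        #               span = torch.as_tensor(list(continuation_indices[row]))
        #               row_losses.append(per_token_loss[row, span].mean())   # mean loss on continuation only
        #           preds.append(int(torch.stack(row_losses).argmin()))       # lowest perplexity wins
        #       return preds
        #
        #   accuracy = mean(p == g for p, g in zip(preds, batch['gold_indices']))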
- batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -879,7 +868,7 @@ def collate_fn(self, data): batch['test_outputs'].append(sample['test_outputs']) batch['languages'].append(sample['language']) - batch = {k : torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -903,93 +892,89 @@ def build_icl_dataloader( fewshot_random_seed: int, pass_at_k: int, generations_per_sample: int, + hf_parsing_func: Callable = None, ) -> DataSpec: if icl_task_type == 'multiple_choice': - dataset = InContextLearningMultipleChoiceTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - ) + dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + hf_parsing_func=hf_parsing_func) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'schema': - dataset = InContextLearningSchemaTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - ) + dataset = InContextLearningSchemaTaskDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + hf_parsing_func=hf_parsing_func) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'language_modeling': - dataset = InContextLearningLMTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - ) + dataset = InContextLearningLMTaskDataset(dataset_uri=dataset_uri, + 
tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + hf_parsing_func=hf_parsing_func) effective_batchsize = batch_size elif icl_task_type == 'question_answering': - dataset = InContextLearningQATaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - cot_delimiter=cot_delimiter, - ) + dataset = InContextLearningQATaskDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + cot_delimiter=cot_delimiter, + hf_parsing_func=hf_parsing_func) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': - dataset = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - ) + dataset = InContextLearningCodeEvalDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + hf_parsing_func=hf_parsing_func) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1078,6 +1063,7 @@ def get_icl_task_dataloader( example_delimiter: str, # e.g. '\n' hf_loading_vars: dict = {}, hf_parsing_vars: dict = {}, + hf_parsing_func: Callable = None, continuation_delimiter: str = '', destination_path: str = '', prelimiter: str = '', # e.g. 
'Question: ' @@ -1151,6 +1137,7 @@ def get_icl_task_dataloader( example_delimiter=example_delimiter, hf_loading_vars=hf_loading_vars, hf_parsing_vars=hf_parsing_vars, + hf_parsing_func=hf_parsing_func, continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', prelimiter=prelimiter, @@ -1173,6 +1160,7 @@ def get_icl_task_dataloader( example_delimiter=example_delimiter, hf_loading_vars=hf_loading_vars, hf_parsing_vars=hf_parsing_vars, + hf_parsing_func=hf_parsing_func, continuation_delimiter=continuation_delimiter, destination_path=destination_path, prelimiter=prelimiter, diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 996741bce3..e123e55441 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -24,7 +24,8 @@ from composer.models import HuggingFaceModel from composer.trainer import Trainer from composer.utils import dist, reproducibility -from tests.common import device, world_size + +# from tests.common import device, world_size def test_fewshot_sample_idxs(): @@ -1073,8 +1074,7 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_model, - tiny_gpt2_tokenizer, tmp_path): +def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_model, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1250,8 +1250,7 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(num_fewshot, dataset_uri, tmp @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, - tmp_path): +def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1291,8 +1290,7 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_ @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [5]) -def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, - tmp_path): +def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1334,8 +1332,7 @@ def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokeniz @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0]) @pytest.mark.parametrize('generations_per_sample', range(1, 3)) -def test_code_eval_microbatching(monkeypatch, num_fewshot, dataset_uri, tmp_path, - generations_per_sample): +def test_code_eval_microbatching(monkeypatch, 
num_fewshot, dataset_uri, tmp_path, generations_per_sample): pytest.importorskip('datasets') monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger @@ -1382,8 +1379,8 @@ def test_code_eval_microbatching(monkeypatch, num_fewshot, dataset_uri, tmp_path @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0]) @pytest.mark.parametrize('generations_per_sample', range(1, 3)) -def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_t5_tokenizer, - tiny_t5_model, tmp_path, generations_per_sample): +def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_t5_tokenizer, tiny_t5_model, + tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') @@ -1428,8 +1425,8 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t @pytest.mark.parametrize('num_fewshot', [0, 2]) @pytest.mark.parametrize('generations_per_sample', [1]) @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') -def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, - tiny_gpt2_model, tmp_path, generations_per_sample): +def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, + tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') @@ -1509,64 +1506,111 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): assert second_batch_without_last_word.count(' UNIQUE ') == 1 -# @pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/long_context_eval']) -# @pytest.mark.parametrize('num_fewshot', [0, 1, 2]) -# @pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) -# @pytest.mark.parametrize('hf_loading_vars', [{ -# 'split': 'test', -# 'name': 'kv_pairs', -# 'context_length': 2048, -# 'section': 'middle' -# }]) -# @pytest.mark.parametrize('hf_parsing_vars', [{'inputs': ['context'], 'outputs': ['answer']}]) -# def test_hf_dataloading(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, -# hf_parsing_vars): -# pytest.importorskip('datasets') - -# # local_data = os.path.join(os.path.dirname(__file__), 'local_data') - -# tokenizer = tiny_gpt2_tokenizer -# # dataset_uri = f'{local_data}/{dataset_uri}' -# batch_size = 2 -# seqlen = 2048 -# # empirical number from the small test dataset -# maximum_answer_length = 9 -# dl = get_icl_task_dataloader('question_answering', -# dataset_uri=dataset_uri, -# tokenizer=tokenizer, -# batch_size=batch_size, -# max_seq_len=seqlen, -# pad_tok_id=tokenizer.eos_token_id, -# num_fewshot=num_fewshot, -# prompt_string=prompt_string, -# example_delimiter='\n', -# prelimiter='Q: ', -# continuation_delimiter='\nA:', -# destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), -# hf_loading_vars=hf_loading_vars, -# hf_parsing_vars=hf_parsing_vars) -# assert isinstance(dl, DataSpec) - -# assert isinstance(dl.dataloader, DataLoader) # pyright -# batch = next(dl.dataloader._get_iterator()) - -# assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) -# assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) -# assert batch['mode'] == 'generate' -# # the maximum generation length from the small test data 
-# assert batch['generation_length'] == maximum_answer_length -# assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) - -# decoded_batch = tokenizer.batch_decode(batch['input_ids']) -# import IPython -# IPython.embed() -# assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) -# assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) - -# if len(prompt_string) > 0: -# assert all(item.count('I am a prompt') == 1 for item in decoded_batch) -# assert all( -# set(found) == set(expected) -# for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']])) -# assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:') -# assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:') +@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/test_dataset']) +@pytest.mark.parametrize('num_fewshot', [0, 1]) +@pytest.mark.parametrize('prompt_string', ['Complete the voiceline: ', '']) +@pytest.mark.parametrize('hf_loading_vars', [{ + 'split': 'test', + 'name': 'juggernaut', +}]) +@pytest.mark.parametrize('hf_parsing_vars', [{'inputs': ['context'], 'outputs': ['continuation']}]) +def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, + hf_loading_vars, hf_parsing_vars): + pytest.importorskip('datasets') + + tokenizer = tiny_gpt2_tokenizer + batch_size = 2 + seqlen = 2048 + dl = get_icl_task_dataloader('language_modeling', + dataset_uri, + tokenizer, + batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=0, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=' ', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_vars=hf_parsing_vars) + assert isinstance(dl, DataSpec) + assert isinstance(dl.dataloader, DataLoader) # pyright + batch = next(dl.dataloader._get_iterator()) + + assert 'input_ids' in batch + assert tuple(batch['input_ids'].shape) == (batch_size, seqlen) + assert 'attention_mask' in batch + assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen) + assert 'continuation_indices' in batch + assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size + assert 'mode' in batch + assert batch['mode'] == 'icl_task' + min_idx = min(batch['continuation_indices'][0]).item() + max_idx = max(batch['continuation_indices'][0]).item() + assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.' + + decoded_batch = tokenizer.decode(batch['input_ids'][batch['input_ids'] != tokenizer.eos_token_id]) + # Pytorch kills our dim_size = 2 here and concatenates the two strings. + assert decoded_batch == "Looks like it's just you and me.There's a fine line between bravery and stupidity." + + +@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/test_dataset']) +@pytest.mark.parametrize('num_fewshot', [0, 1]) +@pytest.mark.parametrize('prompt_string', ['What spell does this invoke? 
', '']) +@pytest.mark.parametrize('hf_loading_vars', [{ + 'split': 'test', + 'name': 'invoker', +}]) +def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, + hf_loading_vars): + pytest.importorskip('datasets') + + tokenizer = tiny_gpt2_tokenizer + batch_size = 2 + seqlen = 2048 + + def parse_invoker(example): + context = ' '.join([example['quas'], example['wex'], example['exort']]) + label = example['spell'] + return {'context': context, 'answer': label} + + # empirical number from the small test dataset + maximum_answer_length = 4 + + dl = get_icl_task_dataloader('question_answering', + dataset_uri, + tokenizer, + batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter='\n', + hf_parsing_func=parse_invoker, + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars) + assert isinstance(dl, DataSpec) + assert isinstance(dl.dataloader, DataLoader) # pyright + batch = next(dl.dataloader._get_iterator()) + + assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) + assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) + assert batch['mode'] == 'generate' + # the maximum generation length from the small test data + assert batch['generation_length'] == maximum_answer_length + assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) + + decoded_batch = tokenizer.batch_decode(batch['input_ids']) + assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch) + # import IPython; IPython.embed() + assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch) + + if len(prompt_string) > 0: + assert all(item.count('What spell does this invoke? 
') == 1 for item in decoded_batch) + assert all( + set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']])) + assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:') + assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') From 1907cee87e1bf6c6af4695e37122d1b8e66c81ab Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 14 Nov 2023 18:12:58 +0000 Subject: [PATCH 016/116] add hf batch parsing --- .../in_context_learning_evaluation.py | 24 ++++--- .../test_in_context_learning_datasets.py | 69 ++++++++++++++++++- 2 files changed, 83 insertions(+), 10 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 007bfaa289..f033382578 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -12,7 +12,6 @@ import torch import transformers from torch.utils.data import DataLoader, Dataset -from tqdm import tqdm from composer.core import DataSpec from composer.core.data_spec import _default_split_batch, _split_list @@ -148,11 +147,10 @@ def __init__(self, if hf_parsing_func is not None: self._parse_hf_dataset = hf_parsing_func else: - self._parse_hf_dataset = lambda example: { + self._parse_hf_dataset = lambda example, **kwargs: { k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items() } - - self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars) + self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) self.strip_data = strip_dataset if self.strip_data: self.dataset = self.dataset.map(strip_data) @@ -182,6 +180,7 @@ def _read_dataset( dataset_uri: str, destination_path: str, hf_loading_vars: dict = None, + hf_parsing_vars: dict = None, ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] @@ -191,11 +190,15 @@ def _read_dataset( conda_package='datasets', conda_channel='conda-forge', ) from e - # TODO: this feels bad as well if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) - dataset = dataset.map(self._parse_hf_dataset) + batched = hf_parsing_vars.pop('batched', False) + dataset = dataset.map(self._parse_hf_dataset, + remove_columns=dataset.column_names, + batched=batched, + fn_kwargs=hf_parsing_vars) + else: with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: @@ -293,7 +296,6 @@ def _prep_example( return tokenized_example def collate_fn(self, data): - # batch = self.default_batch batch = { 'input_ids': [], 'continuation_indices': [], @@ -377,8 +379,12 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.normal_split_keys = ['input_ids', 'attention_mask'] self.list_split_keys = ['labels'] - def _read_dataset(self, dataset_uri: str, destination_path: str, hf_loading_vars: dict = None): - dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars) + def _read_dataset(self, + dataset_uri: str, + destination_path: str, + hf_loading_vars: dict = None, + hf_parsing_vars: dict = None): + dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) self.has_cot = 'chain_of_thought' in dataset.features return dataset.map( lambda examples: { diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 
6e50d25c52..8f5d7001ae 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1525,7 +1525,7 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): 'split': 'test', 'name': 'juggernaut', }]) -@pytest.mark.parametrize('hf_parsing_vars', [{'inputs': ['context'], 'outputs': ['continuation']}]) +@pytest.mark.parametrize('hf_parsing_vars', [{'context': ['context'], 'continuation': ['continuation']}]) def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, hf_parsing_vars): pytest.importorskip('datasets') @@ -1626,3 +1626,70 @@ def parse_invoker(example): set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']])) assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:') assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') + + +@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/test_dataset']) +@pytest.mark.parametrize('num_fewshot', [0, 1]) +@pytest.mark.parametrize('prompt_string', ['What spell does this invoke? ', '']) +@pytest.mark.parametrize('hf_loading_vars', [{ + 'split': 'test', + 'name': 'invoker', +}]) +def test_hf_dataloading_custom_parsing_batched(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, + hf_loading_vars): + pytest.importorskip('datasets') + + tokenizer = tiny_gpt2_tokenizer + batch_size = 2 + seqlen = 2048 + + def parse_invoker_batched(examples): + batch = {'context': [], 'answer': []} + for i, quas_text in enumerate(examples['quas']): + # import IPython; IPython.embed() + wex_text = examples['wex'][i] + exort_text = examples['exort'][i] + batch['context'].append(' '.join([quas_text, wex_text, exort_text])) + batch['answer'].append(examples['spell'][i]) + return batch + + # empirical number from the small test dataset + maximum_answer_length = 4 + + dl = get_icl_task_dataloader('question_answering', + dataset_uri, + tokenizer, + batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter='\n', + hf_parsing_func=parse_invoker_batched, + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_vars={'batched': True}) + assert isinstance(dl, DataSpec) + assert isinstance(dl.dataloader, DataLoader) # pyright + batch = next(dl.dataloader._get_iterator()) + + assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) + assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) + assert batch['mode'] == 'generate' + # the maximum generation length from the small test data + assert batch['generation_length'] == maximum_answer_length + assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) + + decoded_batch = tokenizer.batch_decode(batch['input_ids']) + assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch) + # import IPython; IPython.embed() + assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch) + + if len(prompt_string) > 0: + assert all(item.count('What spell does this invoke? 
') == 1 for item in decoded_batch) + assert all( + set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']])) + assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:') + assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') From 503914a1509213bfbae928052c467f49373be44e Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 15 Nov 2023 23:19:28 +0000 Subject: [PATCH 017/116] linting --- .../in_context_learning_evaluation.py | 196 +++++++++++------- 1 file changed, 119 insertions(+), 77 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index f033382578..a08606b10d 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -32,8 +32,6 @@ ] -# def strip_data(samples): -# return [{k: v.strip() if isinstance(v, str) else v for k, v in entry.items()} for entry in samples] def strip_data(sample): return {k: v.strip() if isinstance(v, str) else v for k, v in sample.items()} @@ -109,26 +107,60 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in class InContextLearningDataset(Dataset): + """A base dataset that construct batches for in-context learning task evaluations - def __init__(self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - fewshot_random_seed: int, - strip_dataset: bool = True, - hf_loading_vars: dict = {}, - hf_parsing_vars: dict = {}, - hf_parsing_func: Callable = None, - context_key: str = 'context', - answer_key: str = 'answer', - prelimiter: str = '', - stacked_keys: List[str] = ['input_ids', 'labels']): + The input format is expected to be a jsonl file with different fields based on the task or a link to a huggingface dataset. + + Args: + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. + Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. + A local dataset must consist of rows of JSON data points with different fields based on the task. + The default is "context" and "answer". + tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to map between strings and token ids + max_seq_len (int): The maximum sequence length supported by the model + pad_tok_id (int): The special token reserved for padding batches + num_fewshot (int): The number of complete fewshot examples to prepend before each test example + prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') + example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n') + continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: ') + destination_path (str): Temporary path to store downloaded datasets + prelimiter (str): String to put before each question (e.g. 'Q: ') + fewshot_random_seed (int): Random seed to use for fewshot sampling + strip_dataset (bool): Boolean for whether to strip whitespace from data. 
Trailing whitespace can cause degenerative outputs, + so unless otherwise required (for example in code), this should be set to True. + hf_loading_vars (dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. + hf_parsing_vars (dict): A dictionary containing keyword arguments to be passed into `_parse_hf_dataset` if the dataset is being pulled from HF, + as well as a boolean for whether or not to process the dataset in batches. + context_key (str): The key from the parsed dataset that the class will use as the "context" (i.e. the main content to be included in the prompt) + answer_key (str): The key from the parsed dataset that the class will use as the "answer" (i.e. the main content to be predicted by the model) + prelimiter (str): Text to be prepended before each example, including few shot examples #TODO: confirm this + stacked_keys (list(str)): keys in the output batch that must be converted to tensors with torch.stack() + """ + + def __init__( + self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + fewshot_random_seed: int, + strip_dataset: bool = True, + hf_loading_vars: dict = None, + hf_parsing_vars: dict = None, + hf_parsing_func: Callable = None, + # TODO: should this be used to both set and access the data / tokenized examples? + context_key: str = 'context', + answer_key: str = 'answer', + prelimiter: str = '', + stacked_keys: List[str] = None, + dont_split_keys: List[str] = None, + list_split_keys: List[str] = None, + normal_split_keys: List[str] = None): self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -143,13 +175,18 @@ def __init__(self, self.context_key = context_key self.answer_key = answer_key self.stacked_keys = stacked_keys + hf_loading_vars = hf_loading_vars or {} + hf_parsing_vars = hf_parsing_vars or {} + self.stacked_keys = stacked_keys or ['input_ids', 'labels'] + self.dont_split_keys = dont_split_keys or [] + self.list_split_keys = list_split_keys or [] + self.normal_split_keys = normal_split_keys or [] + + default_hf_parser = lambda example, **kwargs: { + k: ' '.join([str(example[col]) for col in v]) for k, v in kwargs.items() + } + self._parse_hf_dataset = hf_parsing_func or default_hf_parser - if hf_parsing_func is not None: - self._parse_hf_dataset = hf_parsing_func - else: - self._parse_hf_dataset = lambda example, **kwargs: { - k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_vars.items() - } self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) self.strip_data = strip_dataset if self.strip_data: @@ -190,7 +227,7 @@ def _read_dataset( conda_package='datasets', conda_channel='conda-forge', ) from e - if 'hf://' in dataset_uri: + if dataset_uri.startswith('hf://'): dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) batched = hf_parsing_vars.pop('batched', False) @@ -215,10 +252,8 @@ def generate_few_shot_text( ) -> str: """Formats the prompt fewshot examples for test sample `sample_idx`. - Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and format - them each as follows `{example_delimiter}{prelimiter}{context}{continuation_delimiter}{chain_of_thought}{cot_delimiter}{answer}`. 
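For orientation, a minimal sketch of how a single few-shot example comes out once those delimiters are substituted in; the sample row is invented, and the delimiter values are borrowed loosely from the QA tests in this series:

    example_delimiter = '\n'
    prelimiter = 'Q: '
    continuation_delimiter = '\nA: '
    cot_delimiter = ' #### '
    sample = {'context': 'What is 2 + 2?', 'chain_of_thought': '2 + 2 = 4', 'answer': '4'}

    # One rendered few-shot example, following the format described above.
    fewshot_example = (f"{example_delimiter}{prelimiter}{sample['context']}"
                       f"{continuation_delimiter}{sample['chain_of_thought']}{cot_delimiter}{sample['answer']}")
    # fewshot_example == '\nQ: What is 2 + 2?\nA: 2 + 2 = 4 #### 4'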
- - `chain_of_thought` will default to empty if not present in the dataset but `context` and `answer` must be present. + Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and constructs + a context with its answer appended. Returns the formatted prompt_string + concatenated list of formatted few shot examples. """ @@ -233,6 +268,13 @@ def generate_few_shot_text( return few_shot_text def construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False): + """ + Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a + prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) + + The default context is formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}' + + """ ctxt = sample[self.context_key] ctxt = f'{self.prelimiter}{ctxt}' if len(preceding_text) > 0: @@ -243,23 +285,33 @@ def construct_context(self, sample: dict, preceding_text: str = '', add_answer: return ctxt def get_answer_from_sample(self, sample: dict): + """ + Returns the answer from the sample + """ return sample[self.answer_key] def fix_eos_on_preamble(self, preamble: dict): - # If the preamble is empty then this will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) - # If there is an EOS token added, we need to remove it so it is not in the middle of the prompt + """ + If the preamble is empty then preamble['input_ids'] will be a 0-length list, + unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) + If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, + as the specific eval question's prompt will follow the preamble + """ if (self.tokenizer.eos_token_id is not None and len(preamble['input_ids']) > 1 and preamble['input_ids'][-1] == self.tokenizer.eos_token_id): preamble['input_ids'] = preamble['input_ids'][:-1] return preamble + # TODO: do we need example here? def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + """ + Runs text throught he tokenizer and handles special cases. + """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self.fix_eos_on_preamble(preamble) tokenized_example['preamble'] = preamble if self.strip_data: - # TODO: probably shouldn't use self.strip_data for this # rstrip context because a prompt ending in a space results in degenerate output ctxt = ctxt.rstrip() tokenized_example[self.context_key] = self.tokenizer(ctxt, add_special_tokens=False) @@ -267,28 +319,26 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def _prep_example( self, - example, + example: dict, example_idx: int, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random, ) -> List[Dict[str, Any]]: - """Prepares a set of language modeling tasks into tokenized format with prompt and fewshot examples. + """Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples. Each task consists of a context and a continuation as well as an optional prompt and optional list of example context/continuation pairs which precede the test context/continuation pair. 
Args: + example (dict): A dictionary from the hf dataset + example_idx (int): the index of example num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair prompt_string (str): The prompt to prepend to all inputs - example_delimiter (str): The delimiter used to separate each individual context/continuation pair - continuation_delimiter (str): The delimiter used to separate each context from its continuation fewshot_rng (random.Random): Random number generator to use for fewshot sampling - cot_delimiter (str): The delimiter used to separate the chain-of-thought (if present) from the final model response. - Returns: - dict: Contains the context, the continuation, and the preamble (prompt + fewshot examples) + dict: contains a dictionary with the tokenized data """ prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self.construct_context(example, prompt_and_fewshot, add_answer=False) @@ -296,6 +346,10 @@ def _prep_example( return tokenized_example def collate_fn(self, data): + """ + The function that the dataloader uses to accumulate data into batches + """ + batch = { 'input_ids': [], 'continuation_indices': [], @@ -309,7 +363,7 @@ def collate_fn(self, data): self.max_seq_len, self.pad_tok_id) batch['input_ids'].append(inp) - batch['continuate_indicies'].append(continuation_span) + batch['continuation_indicies'].append(continuation_span) batch['labels'].append(inp) batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} @@ -317,6 +371,9 @@ def collate_fn(self, data): return batch def split_batch(self, batch: Any, microbatch_size: int): + """ + Handling for certain specialty columns that must be split into batches in different formats + """ # Don't split kwargs that don't change # Normally split torch tensors # List split lists of strings @@ -367,17 +424,14 @@ class InContextLearningQATaskDataset(InContextLearningDataset): def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False - super().__init__(stacked_keys=['input_ids'], *args, **kwargs) + super().__init__(stacked_keys=['input_ids'], + dont_split_keys=['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'], + normal_split_keys=['input_ids', 'attention_mask'], + list_split_keys=['labels'], + *args, + **kwargs) self.max_answer_length = self.get_max_answer_length() - self.dont_split_keys = [ - 'mode', - 'generation_length', - 'generation_kwargs', - 'cot_delimiter', - ] - self.normal_split_keys = ['input_ids', 'attention_mask'] - self.list_split_keys = ['labels'] def _read_dataset(self, dataset_uri: str, @@ -529,12 +583,13 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ def __init__(self, choices_key: str = 'choices', *args, **kwargs): - super().__init__(context_key='query', *args, **kwargs) + super().__init__(context_key='query', + dont_split_keys=['mode'], + normal_split_keys=['gold_indices'], + *args, + **kwargs) self.num_choices = len(self.dataset[0][choices_key]) - - self.dont_split_keys = ['mode'] self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] - self.normal_split_keys = ['gold_indices'] def get_answer_from_sample(self, sample: dict): choices = sample['choices'] @@ -749,7 +804,6 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. 
Dataset must consist of rows of JSON data points with "task_id", "prompt", "entry_point", "canonical_solution", "test", "test_inputs", and "test_outputs". See tests/datasets/local_data/human_eval_small.jsonl. tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to map between strings and token ids - ? batch_size (int): Size of a batch used for eval max_seq_len (int): The maximum sequence length supported by the model pad_tok_id (int): The special token reserved for padding batches num_fewshot (int): The number of complete fewshot examples to prepend before each test example @@ -782,6 +836,12 @@ def __init__( answer_key='canonical_solution', strip_dataset=False, stacked_keys=['input_ids'], + dont_split_keys=['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'], + normal_split_keys=['input_ids', 'attention_mask'], + list_split_keys=[ + 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', + 'languages' + ], *args, **kwargs, ) @@ -791,24 +851,6 @@ def __init__( self.top_p = top_p self.top_k = top_k - self.dont_split_keys = [ - 'mode', - 'generation_length', - 'pass_at_k', - 'generation_kwargs', - ] - self.normal_split_keys = ['input_ids', 'attention_mask'] - self.list_split_keys = [ - 'labels', - 'tests', - 'canonical_solutions', - 'entry_points', - 'test_inputs', - 'test_outputs', - 'prompts', - 'languages', - ] - def get_max_prompt_length(self): max_prompt_length = 0 for sample in self.encoded_dataset: @@ -1067,8 +1109,8 @@ def get_icl_task_dataloader( num_fewshot: int, prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' - hf_loading_vars: dict = {}, - hf_parsing_vars: dict = {}, + hf_loading_vars: dict = None, + hf_parsing_vars: dict = None, hf_parsing_func: Callable = None, continuation_delimiter: str = '', destination_path: str = '', From db0bcb84b44907a87772b28a587bd84034d61530 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 16 Nov 2023 06:37:56 +0000 Subject: [PATCH 018/116] add doc strings, rm hf_parsing_vars --- .../in_context_learning_evaluation.py | 188 +++++++++++++----- 1 file changed, 134 insertions(+), 54 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index a08606b10d..d9ea17f498 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -7,7 +7,7 @@ import json import os import random -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import torch import transformers @@ -116,7 +116,7 @@ class InContextLearningDataset(Dataset): Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. A local dataset must consist of rows of JSON data points with different fields based on the task. The default is "context" and "answer". 
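To make the expected local file format concrete, a miniature dataset using the default "context"/"answer" keys could be generated like this (illustrative only; the file name is made up and the rows reuse trivia questions that appear in the tests):

    import json

    rows = [
        {'context': 'Who was the man behind The Chipmunks?', 'answer': 'David Seville'},
        {'context': 'What star sign is Jamie Lee Curtis?', 'answer': 'Scorpio'},
    ]
    with open('local_icl_dataset.jsonl', 'w') as f:
        for row in rows:
            f.write(json.dumps(row) + '\n')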
- tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to map between strings and token ids + tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids max_seq_len (int): The maximum sequence length supported by the model pad_tok_id (int): The special token reserved for padding batches num_fewshot (int): The number of complete fewshot examples to prepend before each test example @@ -129,11 +129,11 @@ class InContextLearningDataset(Dataset): strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs, so unless otherwise required (for example in code), this should be set to True. hf_loading_vars (dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. - hf_parsing_vars (dict): A dictionary containing keyword arguments to be passed into `_parse_hf_dataset` if the dataset is being pulled from HF, + hf_parsing_map (dict): A dictionary containing keyword arguments to be passed into `_parse_hf_dataset` if the dataset is being pulled from HF, as well as a boolean for whether or not to process the dataset in batches. context_key (str): The key from the parsed dataset that the class will use as the "context" (i.e. the main content to be included in the prompt) answer_key (str): The key from the parsed dataset that the class will use as the "answer" (i.e. the main content to be predicted by the model) - prelimiter (str): Text to be prepended before each example, including few shot examples #TODO: confirm this + prelimiter (str): Text to be prepended before each example, including few shot examples stacked_keys (list(str)): keys in the output batch that must be converted to tensors with torch.stack() """ @@ -151,8 +151,7 @@ def __init__( fewshot_random_seed: int, strip_dataset: bool = True, hf_loading_vars: dict = None, - hf_parsing_vars: dict = None, - hf_parsing_func: Callable = None, + hf_parsing_map: dict = None, # TODO: should this be used to both set and access the data / tokenized examples? 
context_key: str = 'context', answer_key: str = 'answer', @@ -174,20 +173,13 @@ def __init__( self.continuation_delimiter = continuation_delimiter self.context_key = context_key self.answer_key = answer_key - self.stacked_keys = stacked_keys - hf_loading_vars = hf_loading_vars or {} - hf_parsing_vars = hf_parsing_vars or {} self.stacked_keys = stacked_keys or ['input_ids', 'labels'] self.dont_split_keys = dont_split_keys or [] self.list_split_keys = list_split_keys or [] self.normal_split_keys = normal_split_keys or [] - default_hf_parser = lambda example, **kwargs: { - k: ' '.join([str(example[col]) for col in v]) for k, v in kwargs.items() - } - self._parse_hf_dataset = hf_parsing_func or default_hf_parser - - self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) + hf_loading_vars = hf_loading_vars or {} + self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.strip_data = strip_dataset if self.strip_data: self.dataset = self.dataset.map(strip_data) @@ -217,7 +209,7 @@ def _read_dataset( dataset_uri: str, destination_path: str, hf_loading_vars: dict = None, - hf_parsing_vars: dict = None, + hf_parsing_map: dict = None ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] @@ -230,11 +222,11 @@ def _read_dataset( if dataset_uri.startswith('hf://'): dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) - batched = hf_parsing_vars.pop('batched', False) - dataset = dataset.map(self._parse_hf_dataset, - remove_columns=dataset.column_names, - batched=batched, - fn_kwargs=hf_parsing_vars) + if hf_parsing_map: + dataset_parsing_func = lambda example: { + k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() + } + dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names) else: with dist.local_rank_zero_download_and_wait(destination_path): @@ -256,6 +248,11 @@ def generate_few_shot_text( a context with its answer appended. Returns the formatted prompt_string + concatenated list of formatted few shot examples. + Args: + num_fewshot (int): number of examples to prepend + sample_idx (int): current sample idx + preamble (str): text to occur at the beginning of the task. Generally instructions or a prompt. + fewshot_rng (random.Random): seeded sampler to chose samples with """ few_shot_text = preamble @@ -272,7 +269,12 @@ def construct_context(self, sample: dict, preceding_text: str = '', add_answer: Takes a sample and constructs a context. 
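As a rough sketch of what the `hf_parsing_map` parsing added to `_read_dataset` does to one HuggingFace row (the column names and values mirror the invoker test data used later in this series):

    hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
    example = {'quas': 'quas', 'wex': 'quas', 'exort': 'quas', 'spell': 'cold snap'}

    # Same idea as the dataset.map() lambda: concatenate the mapped HF columns with
    # spaces under each ICL key; the original columns are then dropped.
    parsed = {k: ' '.join(str(example[col]) for col in v) for k, v in hf_parsing_map.items()}
    # parsed == {'context': 'quas quas quas', 'answer': 'cold snap'}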
Optionally, appends this to preceeding text (such as a prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) - The default context is formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}' + The default output context is formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}' + + Args: + sample (dict): the sample from which to construct the context + preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning + add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) """ ctxt = sample[self.context_key] @@ -287,6 +289,8 @@ def construct_context(self, sample: dict, preceding_text: str = '', add_answer: def get_answer_from_sample(self, sample: dict): """ Returns the answer from the sample + Args: + sample (dict): the sample from which to retrieve the answer """ return sample[self.answer_key] @@ -302,10 +306,13 @@ def fix_eos_on_preamble(self, preamble: dict): preamble['input_ids'] = preamble['input_ids'][:-1] return preamble - # TODO: do we need example here? def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ - Runs text throught he tokenizer and handles special cases. + Runs text through the tokenizer and handles special cases. + Args: + prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. Used for additional processing in inherited classes. """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) @@ -349,7 +356,6 @@ def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches """ - batch = { 'input_ids': [], 'continuation_indices': [], @@ -398,6 +404,7 @@ def split_batch(self, batch: Any, microbatch_size: int): class InContextLearningQATaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning question answering evaluation + QA tasks evaluate a model's ability to answer questions using a consistent format. The input format is expected to be a jsonl file with the following fields: - context: the question @@ -405,11 +412,10 @@ class InContextLearningQATaskDataset(InContextLearningDataset): - aliases: a list of aliases for the answer Args: - dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend - supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "context", - "answer", and "aliases". See tests/datasets/local_data/triviaqa_small.jsonl. - tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to map between strings and token ids - batch_size (int): Size of a batch used for eval + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. + Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. + Dataset must consist of rows of JSON data points with "context", "answer", and "aliases". See tests/datasets/local_data/triviaqa_small.jsonl. 
+ tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids max_seq_len (int): The maximum sequence length supported by the model pad_tok_id (int): The special token reserved for padding batches num_fewshot (int): The number of complete fewshot examples to prepend before each test example @@ -419,6 +425,7 @@ class InContextLearningQATaskDataset(InContextLearningDataset): destination_path (str): Temporary path to store downloaded datasets prelimiter (str): String to put before each question (e.g. 'Q: ') fewshot_random_seed (int): Random seed to use for fewshot sampling + cot_delimiter (str): Delimiter to place between the chain of thought and continuations. """ def __init__(self, cot_delimiter: str = '', *args, **kwargs): @@ -437,8 +444,8 @@ def _read_dataset(self, dataset_uri: str, destination_path: str, hf_loading_vars: dict = None, - hf_parsing_vars: dict = None): - dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_vars) + ): + dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars) self.has_cot = 'chain_of_thought' in dataset.features return dataset.map( lambda examples: { @@ -448,18 +455,33 @@ def _read_dataset(self, 'chain_of_thought': examples.get('chain_of_thought', ''), }) - def get_answer_from_sample(self, sample): + def get_answer_from_sample(self, sample : dict): + """ + Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true + Args: + sample (dict): the sample from which to retrieve the answer + """ if self.has_cot: return f'{sample["chain_of_thought"]}{self.cot_delimiter}{sample[self.answer_key]}' else: return sample[self.answer_key] def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + """ + Runs text through the tokenizer and handles special cases. + Args: + prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. + """ tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['aliases'] = list(example.get('aliases', [])) return tokenized_example def get_max_answer_length(self): + """ + Loops over the dataset and finds the longes answer length + """ max_answer_length = 0 for sample in self.dataset: all_answers = [sample[self.answer_key]] + list(sample.get('aliases', [])) @@ -473,6 +495,9 @@ def get_max_answer_length(self): return max_answer_length def collate_fn(self, data): + """ + The function that the dataloader uses to accumulate data into batches + """ batch = { 'input_ids': [], 'mode': 'generate', @@ -504,14 +529,15 @@ def collate_fn(self, data): class InContextLearningLMTaskDataset(InContextLearningDataset): - """A dataset that construct batches for in-context learning language modeling evaluation + """A dataset that construct batches for in-context learning language modeling evaluation. + Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. + Args: dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "context", and "continuation". See tests/datasets/local_data/lambada_small.jsonl. 
tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to transform data into batches - batch_size (int): Size of a batch used for eval max_seq_len (int): The sequence length expected by the model pad_tok_id (int): The special token reserved for padding the ends of batches num_fewshot (int): The number of complete fewshot examples to prepend before each test example @@ -526,6 +552,13 @@ def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', *args, **kwargs) def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + """ + Runs text through the tokenizer and handles special cases. + Args: + prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. + """ tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) cont = example['continuation'] if self.prefix_space and not cont.startswith(' '): @@ -534,6 +567,9 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): return tokenized_example def collate_fn(self, data): + """ + The function that the dataloader uses to accumulate data into batches + """ batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} for data_pair in data: context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] @@ -571,7 +607,6 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "query", "choices", and "gold" index. See tests/datasets/local_data/piqa_small.jsonl. tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to transform data into batches - batch_size (int): Size of a batch used for eval max_seq_len (int): The sequence length expected by the model pad_tok_id (int): The special token reserved for padding the ends of batches num_fewshot (int): The number of complete fewshot examples to prepend before each test example @@ -592,11 +627,23 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] def get_answer_from_sample(self, sample: dict): + """ + Returns the correct answer from the sample's choices + Args: + sample (dict): the sample from which to retrieve the answer + """ choices = sample['choices'] gold_idx = sample['gold'] return choices[gold_idx] def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + """ + Runs text through the tokenizer and handles special cases. + Args: + prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. 
+ """ tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) choices = example['choices'] if self.prefix_space: @@ -606,6 +653,9 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): return tokenized_example def collate_fn(self, data): + """ + The function that the dataloader uses to accumulate data into batches + """ batch = { 'input_ids': [], 'continuation_indices': [], @@ -713,6 +763,17 @@ def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, *args, **kwargs) def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): + """ + Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a + prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) + + Args: + sample (dict): the sample from which to construct the context + preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning + add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) + + TODO: finish documentation after discussions + """ context_options = sample['context_options'] gold_idx = sample['gold'] continuation = sample['continuation'] @@ -736,6 +797,13 @@ def construct_context(self, sample, preceding_text: str = '', add_answer: bool = return context_options def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: dict): + """ + Runs text through the tokenizer and handles special cases. + Args: + prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. + """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self.fix_eos_on_preamble(preamble) @@ -749,6 +817,9 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], return tokenized_example def collate_fn(self, data): + """ + The function that the dataloader uses to accumulate data into batches + """ batch = { 'input_ids': [], 'continuation_indices': [], @@ -852,6 +923,9 @@ def __init__( self.top_k = top_k def get_max_prompt_length(self): + """ + Iterates through the dataset and finds the length of the longest prompt + """ max_prompt_length = 0 for sample in self.encoded_dataset: max_prompt_length = max( @@ -861,6 +935,13 @@ def get_max_prompt_length(self): return max_prompt_length def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + """ + Runs text through the tokenizer and handles special cases. + Args: + prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. + """ tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['prompt_text'] = example['prompt'] tokenized_example['task_id'] = example['task_id'] @@ -873,6 +954,9 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): return tokenized_example def collate_fn(self, data): + """ + The function that the dataloader uses to accumulate data into batches + """ batch = { 'input_ids': [], 'mode': 'generate', @@ -933,14 +1017,13 @@ def build_icl_dataloader( example_delimiter: str, # e.g. 
'\n' continuation_delimiter: str, # e.g. '' hf_loading_vars: dict, - hf_parsing_vars: dict, + hf_parsing_map: dict, destination_path: str, prelimiter: str, # e.g. 'Question: ' cot_delimiter: str, fewshot_random_seed: int, pass_at_k: int, generations_per_sample: int, - hf_parsing_func: Callable = None, ) -> DataSpec: if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri=dataset_uri, @@ -954,8 +1037,8 @@ def build_icl_dataloader( destination_path=destination_path, fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - hf_parsing_func=hf_parsing_func) + hf_parsing_map=hf_parsing_map, + ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'schema': @@ -970,8 +1053,8 @@ def build_icl_dataloader( destination_path=destination_path, fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - hf_parsing_func=hf_parsing_func) + hf_parsing_map=hf_parsing_map, + ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'language_modeling': @@ -986,8 +1069,8 @@ def build_icl_dataloader( destination_path=destination_path, fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - hf_parsing_func=hf_parsing_func) + hf_parsing_map=hf_parsing_map, + ) effective_batchsize = batch_size elif icl_task_type == 'question_answering': dataset = InContextLearningQATaskDataset(dataset_uri=dataset_uri, @@ -1002,9 +1085,9 @@ def build_icl_dataloader( prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, + hf_parsing_map=hf_parsing_map, cot_delimiter=cot_delimiter, - hf_parsing_func=hf_parsing_func) + ) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': dataset = InContextLearningCodeEvalDataset(dataset_uri=dataset_uri, @@ -1019,10 +1102,10 @@ def build_icl_dataloader( prelimiter=prelimiter, fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, + hf_parsing_map=hf_parsing_map, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, - hf_parsing_func=hf_parsing_func) + ) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1110,8 +1193,7 @@ def get_icl_task_dataloader( prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' hf_loading_vars: dict = None, - hf_parsing_vars: dict = None, - hf_parsing_func: Callable = None, + hf_parsing_map: dict = None, continuation_delimiter: str = '', destination_path: str = '', prelimiter: str = '', # e.g. 
'Question: ' @@ -1184,8 +1266,7 @@ def get_icl_task_dataloader( prompt_string=prompt_string, example_delimiter=example_delimiter, hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - hf_parsing_func=hf_parsing_func, + hf_parsing_map=hf_parsing_map, continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', prelimiter=prelimiter, @@ -1207,8 +1288,7 @@ def get_icl_task_dataloader( prompt_string=prompt_string, example_delimiter=example_delimiter, hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars, - hf_parsing_func=hf_parsing_func, + hf_parsing_map=hf_parsing_map, continuation_delimiter=continuation_delimiter, destination_path=destination_path, prelimiter=prelimiter, From c2dd31cfc8ef73422c176e716d3b241bd0d4475e Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 16 Nov 2023 19:35:03 +0000 Subject: [PATCH 019/116] revert question_prelimiter back to prelimiter --- composer/datasets/in_context_learning_evaluation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index d9ea17f498..ec60680338 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1196,7 +1196,7 @@ def get_icl_task_dataloader( hf_parsing_map: dict = None, continuation_delimiter: str = '', destination_path: str = '', - prelimiter: str = '', # e.g. 'Question: ' + question_prelimiter: str = '', # e.g. 'Question: ' fewshot_random_seed: int = 1234, pass_at_k: int = 1, generations_per_sample: int = 1, @@ -1269,7 +1269,7 @@ def get_icl_task_dataloader( hf_parsing_map=hf_parsing_map, continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', - prelimiter=prelimiter, + prelimiter=question_prelimiter, cot_delimiter=cot_delimiter, fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, @@ -1291,7 +1291,7 @@ def get_icl_task_dataloader( hf_parsing_map=hf_parsing_map, continuation_delimiter=continuation_delimiter, destination_path=destination_path, - prelimiter=prelimiter, + prelimiter=question_prelimiter, cot_delimiter=cot_delimiter, fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, From 3ad77e6eafeff5d91553f5baea79ef08e7d5baf8 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 16 Nov 2023 20:12:36 +0000 Subject: [PATCH 020/116] fix tests --- .../in_context_learning_evaluation.py | 3 +- .../test_in_context_learning_datasets.py | 99 +++---------------- 2 files changed, 16 insertions(+), 86 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index ec60680338..7db4a613f0 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -444,8 +444,9 @@ def _read_dataset(self, dataset_uri: str, destination_path: str, hf_loading_vars: dict = None, + hf_parsing_map: dict = None, ): - dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars) + dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.has_cot = 'chain_of_thought' in dataset.features return dataset.map( lambda examples: { diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 8f5d7001ae..e86771e1a9 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ 
b/tests/datasets/test_in_context_learning_datasets.py @@ -534,7 +534,7 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', - prelimiter='Q: ', + question_prelimiter='Q: ', continuation_delimiter='\nA:', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) assert isinstance(dl, DataSpec) @@ -584,7 +584,7 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot=num_fewshot, prompt_string='', example_delimiter='\n', - prelimiter='Q: ', + question_prelimiter='Q: ', continuation_delimiter="\nA: Let's think step by step. ", cot_delimiter=' #### ', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) @@ -761,7 +761,7 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', - prelimiter='Code start: \n', + question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=generations_per_sample) assert isinstance(dl, DataSpec) @@ -846,7 +846,7 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): num_fewshot=0, prompt_string='', example_delimiter='\n', - prelimiter='Code start: \n', + question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_.jsonl'), generations_per_sample=1) assert isinstance(dl, DataSpec) @@ -895,7 +895,7 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): num_fewshot=0, prompt_string='', example_delimiter='\n', - prelimiter='Code start: \n', + question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_.jsonl'), pass_at_k=10, generations_per_sample=1) @@ -924,7 +924,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', - prelimiter='Code start: \n', + question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=generations_per_sample) assert isinstance(dl, DataSpec) @@ -1525,9 +1525,9 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): 'split': 'test', 'name': 'juggernaut', }]) -@pytest.mark.parametrize('hf_parsing_vars', [{'context': ['context'], 'continuation': ['continuation']}]) +@pytest.mark.parametrize('hf_parsing_map', [None, {'context': ['context'], 'continuation': ['continuation']}]) def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, - hf_loading_vars, hf_parsing_vars): + hf_loading_vars, hf_parsing_map): pytest.importorskip('datasets') tokenizer = tiny_gpt2_tokenizer @@ -1545,7 +1545,7 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path continuation_delimiter=' ', destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, - hf_parsing_vars=hf_parsing_vars) + hf_parsing_map=hf_parsing_map) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright batch = next(dl.dataloader._get_iterator()) @@ -1574,85 +1574,15 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path 'split': 'test', 'name': 'invoker', }]) +@pytest.mark.parametrize('hf_parsing_map', [{"context":['quas','wex','exort'],"answer":['spell']}]) def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, - 
hf_loading_vars): + hf_loading_vars, hf_parsing_map): pytest.importorskip('datasets') tokenizer = tiny_gpt2_tokenizer batch_size = 2 seqlen = 2048 - def parse_invoker(example): - context = ' '.join([example['quas'], example['wex'], example['exort']]) - label = example['spell'] - return {'context': context, 'answer': label} - - # empirical number from the small test dataset - maximum_answer_length = 4 - - dl = get_icl_task_dataloader('question_answering', - dataset_uri, - tokenizer, - batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - hf_parsing_func=parse_invoker, - prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars) - assert isinstance(dl, DataSpec) - assert isinstance(dl.dataloader, DataLoader) # pyright - batch = next(dl.dataloader._get_iterator()) - - assert tuple(batch['input_ids'].shape) == (batch_size, seqlen - maximum_answer_length) - assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) - assert batch['mode'] == 'generate' - # the maximum generation length from the small test data - assert batch['generation_length'] == maximum_answer_length - assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) - - decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch) - # import IPython; IPython.embed() - assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch) - - if len(prompt_string) > 0: - assert all(item.count('What spell does this invoke? ') == 1 for item in decoded_batch) - assert all( - set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']])) - assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:') - assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') - - -@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/test_dataset']) -@pytest.mark.parametrize('num_fewshot', [0, 1]) -@pytest.mark.parametrize('prompt_string', ['What spell does this invoke? 
', '']) -@pytest.mark.parametrize('hf_loading_vars', [{ - 'split': 'test', - 'name': 'invoker', -}]) -def test_hf_dataloading_custom_parsing_batched(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, - hf_loading_vars): - pytest.importorskip('datasets') - - tokenizer = tiny_gpt2_tokenizer - batch_size = 2 - seqlen = 2048 - - def parse_invoker_batched(examples): - batch = {'context': [], 'answer': []} - for i, quas_text in enumerate(examples['quas']): - # import IPython; IPython.embed() - wex_text = examples['wex'][i] - exort_text = examples['exort'][i] - batch['context'].append(' '.join([quas_text, wex_text, exort_text])) - batch['answer'].append(examples['spell'][i]) - return batch - # empirical number from the small test dataset maximum_answer_length = 4 @@ -1665,12 +1595,11 @@ def parse_invoker_batched(examples): num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', - hf_parsing_func=parse_invoker_batched, - prelimiter='Orbs: ', + question_prelimiter='Orbs: ', continuation_delimiter='\nSpell:', destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, - hf_parsing_vars={'batched': True}) + hf_parsing_map=hf_parsing_map) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright batch = next(dl.dataloader._get_iterator()) @@ -1692,4 +1621,4 @@ def parse_invoker_batched(examples): assert all( set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']])) assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:') - assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') + assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') \ No newline at end of file From 14cf5e7c2bdce71860967e4702a41d7e5b97b4b2 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 16 Nov 2023 22:57:38 +0000 Subject: [PATCH 021/116] add more docstrings --- .../in_context_learning_evaluation.py | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 7db4a613f0..0c6e4e388a 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1137,7 +1137,7 @@ def build_icl_dataloader( ) -def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Dict[str, str]: +def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: dict, hf_parsing_map: dict) -> Dict[str, str]: """If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file. 
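A condensed sketch of that partitioning step; the 'category' field name, the category values, and the paths are illustrative rather than taken from the implementation:

    import json
    from collections import defaultdict

    rows = [
        {'context': 'Q1', 'answer': 'A1', 'category': 'world_religions'},
        {'context': 'Q2', 'answer': 'A2', 'category': 'astronomy'},
        {'context': 'Q3', 'answer': 'A3', 'category': 'world_religions'},
    ]
    by_category = defaultdict(list)
    for row in rows:
        by_category[row['category']].append(row)

    output_files = {}
    for category, cat_rows in by_category.items():
        path = f'/tmp/icl_{category}.jsonl'
        with open(path, 'w') as f:
            for r in cat_rows:
                f.write(json.dumps(r) + '\n')
        output_files[category] = path
    # output_files maps each category to a local file that a per-category dataloader could read.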
Args: @@ -1158,6 +1158,15 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Di conda_package='datasets', conda_channel='conda-forge', ) from e + if dataset_uri.startswith("hf://"): + # TODO: this will also execute in the dataset class, so ensure that the same hf_parsing_map and loading_vars can be used both times + cur_dataset_uri = dataset_uri.replace('hf://', '') + dataset = load_dataset(cur_dataset_uri, **hf_loading_vars) + if hf_parsing_map: + dataset_parsing_func = lambda example: { + k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() + } + dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names) with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) @@ -1186,18 +1195,18 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str) -> Di def get_icl_task_dataloader( icl_task_type: str, dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + tokenizer: transformers.PreTrainedTokenizerBase, batch_size: int, max_seq_len: int, pad_tok_id: int, num_fewshot: int, prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' + continuation_delimiter: str = '', + question_prelimiter: str = '', # e.g. 'Question: ' hf_loading_vars: dict = None, hf_parsing_map: dict = None, - continuation_delimiter: str = '', destination_path: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' fewshot_random_seed: int = 1234, pass_at_k: int = 1, generations_per_sample: int = 1, @@ -1232,9 +1241,10 @@ def get_icl_task_dataloader( ... ) Args: - dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend - supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. - tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to transform data into batches + icl_task_type (str): Name of icl_task type. One of ['multiple_choice', 'schema', 'language_modeling', 'question_answering', 'code_evaluation'] + dataset_uri (str): Either a local path, a remote path beginning with ``s3://``, or another backend + supported by :meth:`composer.utils.maybe_create_object_store_from_uri`, a link to a HuggingFace Dataset + tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to transform data into batches batch_size (int): Size of a batch used for eval max_seq_len (int): The sequence length expected by the model pad_tok_id (int): The special token reserved for padding the ends of batches @@ -1242,8 +1252,14 @@ def get_icl_task_dataloader( prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') example_delimiter (str): Separator that goes between individual examples (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') + question_prelimiter: (str): Text to be prepended before each context segement in each eval example. (e.g. 'Q:', 'The following is a paragraph containing...') + hf_loading_vars (dict): + hf_parsing_map (dict): destination_path: (str): This is the local file where remote datasets will be saved. - prelimiter: (str): For QA tasks, this will be prepended to each question. 
+ fewshot_random_seed (int): + pass_at_k (int): + generations_per_sample (int): + cot_delimiter (str): has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. Returns: @@ -1252,7 +1268,7 @@ def get_icl_task_dataloader( if has_categories: result_dls = {} - output_files = partition_dataset_by_category(dataset_uri, destination_path) + output_files = partition_dataset_by_category(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) categories = sorted(output_files.keys()) for category in categories: partition_uri = output_files[category] From f76856364290a300404cba28622a9f1865e3fb81 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 17 Nov 2023 01:06:08 +0000 Subject: [PATCH 022/116] add doc strings, fix hf w/ categories --- .../in_context_learning_evaluation.py | 188 +++++++++++++----- 1 file changed, 138 insertions(+), 50 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 0c6e4e388a..1befcd54df 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -115,26 +115,28 @@ class InContextLearningDataset(Dataset): dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. A local dataset must consist of rows of JSON data points with different fields based on the task. - The default is "context" and "answer". + The default keys expected are "context" and "answer". tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids max_seq_len (int): The maximum sequence length supported by the model pad_tok_id (int): The special token reserved for padding batches num_fewshot (int): The number of complete fewshot examples to prepend before each test example + fewshot_random_seed (int): Random seed to use for fewshot sampling prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: ') + prelimiter (str): Text to be prepended before each example, including few shot examples + context_key (str): The key from the parsed dataset that the class will use as the "context" (i.e. the main content to be included in the prompt) + answer_key (str): The key from the parsed dataset that the class will use as the "answer" (i.e. the main content to be predicted by the model) destination_path (str): Temporary path to store downloaded datasets - prelimiter (str): String to put before each question (e.g. 'Q: ') - fewshot_random_seed (int): Random seed to use for fewshot sampling strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs, - so unless otherwise required (for example in code), this should be set to True. + so unless whitespace should be preserved (for example in code), this should be set to True. hf_loading_vars (dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. 
-        hf_parsing_map (dict): A dictionary containing keyword arguments to be passed into `_parse_hf_dataset` if the dataset is being pulled from HF,
-            as well as a boolean for whether or not to process the dataset in batches.
-        context_key (str): The key from the parsed dataset that the class will use as the "context" (i.e. the main content to be included in the prompt)
-        answer_key (str): The key from the parsed dataset that the class will use as the "answer" (i.e. the main content to be predicted by the model)
-        prelimiter (str): Text to be prepended before each example, including few shot examples
+        hf_parsing_map (Dict[str, List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key: [hf_key1, hf_key2]}.
+            Values in the dict will be concatenated with ' ' separating them. If not included, will use the columns already present in the HF dataset.
         stacked_keys (list(str)): keys in the output batch that must be converted to tensors with torch.stack()
+        dont_split_keys (list(str)): keys in the ICL dictionary that should not be split among batches.
+        list_split_keys (list(str)): keys in the ICL dictionary that will be split as lists, resulting in microbatch_size sections of the list being inserted in every batch
+        normal_split_keys (list(str)): keys in the ICL dictionary that will be split into chunks regularly
     """

     def __init__(
@@ -144,18 +146,18 @@ def __init__(
         max_seq_len: int,
         pad_tok_id: int,
         num_fewshot: int,
+        fewshot_random_seed: int,
         prompt_string: str,
         example_delimiter: str,
         continuation_delimiter: str,
         destination_path: str,
-        fewshot_random_seed: int,
-        strip_dataset: bool = True,
-        hf_loading_vars: dict = None,
-        hf_parsing_map: dict = None,
+        prelimiter: str = '',
+        # TODO: should this be used to both set and access the data / tokenized examples?
         context_key: str = 'context',
         answer_key: str = 'answer',
-        prelimiter: str = '',
+        strip_dataset: bool = True,
+        hf_loading_vars: dict = None,
+        hf_parsing_map: dict = None,
         stacked_keys: List[str] = None,
         dont_split_keys: List[str] = None,
         list_split_keys: List[str] = None,
@@ -227,7 +229,6 @@ def _read_dataset(
                 k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items()
             }
             dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names)
-
         else:
             with dist.local_rank_zero_download_and_wait(destination_path):
                 if dist.get_local_rank() == 0:
@@ -235,7 +236,7 @@ def _read_dataset(
                 dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False)
         return dataset

-    def generate_few_shot_text(
+    def _generate_few_shot_text(
         self,
         num_fewshot: int,
         sample_idx: int,
@@ -253,29 +254,34 @@ def generate_few_shot_text(
             sample_idx (int): current sample idx
             preamble (str): text to occur at the beginning of the task. Generally instructions or a prompt.
             fewshot_rng (random.Random): seeded sampler used to choose the fewshot samples
+
+        Returns:
+            str: the original preamble with num_fewshot examples appended
         """
         few_shot_text = preamble

         if num_fewshot > 0:
             fewshot_idxs = _get_fewshot_sample_idxs(len(self.dataset), num_fewshot, sample_idx, fewshot_rng)
             for fewshot_idx in fewshot_idxs:
-                ctxt = self.construct_context(self.dataset[fewshot_idx], few_shot_text, add_answer=True)
+                ctxt = self._construct_context(self.dataset[fewshot_idx], few_shot_text, add_answer=True)
                 few_shot_text += ctxt

         return few_shot_text

-    def construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False):
+    def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False):
         """
         Takes a sample and constructs a context. Optionally, appends this to preceding text (such as a
         prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples)

-        The default output context is formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}'
-
         Args:
             sample (dict): the sample from which to construct the context
             preceding_text (str): any preceding text; determines whether self.example_delimiter is needed at the beginning
             add_answer (bool): whether or not to add the answer at the end of the context (needed for fewshot examples)

+        Returns:
+            str: The constructed context. The default output context is
+                formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}'
         """
         ctxt = sample[self.context_key]
         ctxt = f'{self.prelimiter}{ctxt}'
@@ -283,40 +289,51 @@ def construct_context(self, sample: dict, preceding_text: str = '', add_answer:
             ctxt = f'{self.example_delimiter}{ctxt}'
         ctxt = f'{ctxt}{self.continuation_delimiter}'
         if add_answer:
-            ctxt = f'{ctxt}{self.get_answer_from_sample(sample)}'
+            ctxt = f'{ctxt}{self._get_answer_from_sample(sample)}'
         return ctxt

-    def get_answer_from_sample(self, sample: dict):
+    def _get_answer_from_sample(self, sample: dict):
         """
         Returns the answer from the sample
         Args:
             sample (dict): the sample from which to retrieve the answer
+
+        Returns:
+            str: the answer from the sample
         """
         return sample[self.answer_key]

-    def fix_eos_on_preamble(self, preamble: dict):
+    def _fix_eos_on_preamble(self, preamble: dict):
         """
         If the preamble is empty then preamble['input_ids'] will be a 0-length list,
         unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer)
         If there is an EOS token added, we need to remove it so it is not in the middle of the prompt,
         as the specific eval question's prompt will follow the preamble
+        Args:
+            preamble (dict): a dictionary containing the tokenized input
+
+        Returns:
+            dict: the same dictionary with the final token conditionally removed
         """
         if (self.tokenizer.eos_token_id is not None and len(preamble['input_ids']) > 1 and
                 preamble['input_ids'][-1] == self.tokenizer.eos_token_id):
             preamble['input_ids'] = preamble['input_ids'][:-1]
         return preamble

-    def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict):
+    def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict):
         """
         Runs text through the tokenizer and handles special cases.
         Args:
             prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context
             ctx (str): the specific example's derived context
             example (dict): the example as a dictionary. Used for additional processing in inherited classes.
+ + Returns: + dict: dictionary with the tokenized data """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) - preamble = self.fix_eos_on_preamble(preamble) + preamble = self._fix_eos_on_preamble(preamble) tokenized_example['preamble'] = preamble if self.strip_data: # rstrip context because a prompt ending in a space results in degenerate output @@ -347,14 +364,19 @@ def _prep_example( Returns: dict: contains a dictionary with the tokenized data """ - prompt_and_fewshot = self.generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) - ctxt = self.construct_context(example, prompt_and_fewshot, add_answer=False) - tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example) + prompt_and_fewshot = self._generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) + ctxt = self._construct_context(example, prompt_and_fewshot, add_answer=False) + tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches + Args: + data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + + Returns: + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -379,6 +401,13 @@ def collate_fn(self, data): def split_batch(self, batch: Any, microbatch_size: int): """ Handling for certain specialty columns that must be split into batches in different formats + + Args: + batch (dict): batch of data + microbatch_size (int): size of microbatches + + Returns: + list: list of chunked batches """ # Don't split kwargs that don't change # Normally split torch tensors @@ -411,6 +440,7 @@ class InContextLearningQATaskDataset(InContextLearningDataset): - answer: the preferred answer to the question - aliases: a list of aliases for the answer + #TODO: Should I only list variables here that are different than InContextLearningDataset? Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. @@ -456,32 +486,41 @@ def _read_dataset(self, 'chain_of_thought': examples.get('chain_of_thought', ''), }) - def get_answer_from_sample(self, sample : dict): + def _get_answer_from_sample(self, sample : dict): """ Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true Args: sample (dict): the sample from which to retrieve the answer + + Returns: + str: the answer in from the sample with chain of thought and delimiter if needed """ if self.has_cot: return f'{sample["chain_of_thought"]}{self.cot_delimiter}{sample[self.answer_key]}' else: return sample[self.answer_key] - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context example (dict): the example as a dictionary. 
+ + Returns: + dict: dictionary with the tokenized data """ - tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['aliases'] = list(example.get('aliases', [])) return tokenized_example def get_max_answer_length(self): - """ + f""" Loops over the dataset and finds the longes answer length + + Returns: + int: the maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present """ max_answer_length = 0 for sample in self.dataset: @@ -498,6 +537,11 @@ def get_max_answer_length(self): def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches + Args: + data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + + Returns: + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -552,15 +596,18 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', *args, **kwargs) - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context example (dict): the example as a dictionary. + + Returns: + dict: dictionary with the tokenized data """ - tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) cont = example['continuation'] if self.prefix_space and not cont.startswith(' '): cont = f' {cont}' @@ -570,6 +617,11 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches + Args: + data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + + Returns: + dict: dictionary for a single batch """ batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} for data_pair in data: @@ -624,28 +676,35 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): normal_split_keys=['gold_indices'], *args, **kwargs) + # self.check_defaults_are_set({'num_choices': self.num_choices, 'generations_per_sample':self.generations_per_sample, "top_p": self.top_p,"top_k":self.top_k}) self.num_choices = len(self.dataset[0][choices_key]) self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] - def get_answer_from_sample(self, sample: dict): + def _get_answer_from_sample(self, sample: dict): """ Returns the correct answer from the sample's choices Args: sample (dict): the sample from which to retrieve the answer + + Returns: + str: the full string of the correct answer based on the 'gold' key """ choices = sample['choices'] gold_idx = sample['gold'] return choices[gold_idx] - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ Runs text through the tokenizer and handles special cases. 
Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context example (dict): the example as a dictionary. + + Returns: + dict: dictionary with the tokenized data """ - tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) choices = example['choices'] if self.prefix_space: choices = [(f' {choice}' if not choice.startswith(' ') else choice) for choice in choices] @@ -656,6 +715,11 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches + Args: + data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + + Returns: + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -666,7 +730,6 @@ def collate_fn(self, data): 'choice_groupings': [], } for data_pair in data: - # TODO: this line is sus idgi choice_start_idx = len(batch['continuation_indices']) for choice in data_pair['choices']: @@ -694,6 +757,7 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch + # TODO: should I type all the return values like this did? def get_num_samples_in_batch(self, batch) -> int: return batch['input_ids'].shape[0] // self.num_choices @@ -705,6 +769,12 @@ def split_batch(self, batch: Any, microbatch_size: int): and real samples, which refers to one possible continuation. As sample count and microbatch_size are tracked in logical samples, we split logical attributes by microbatch_size and real attributes by microbatch_size * num_choices. + Args: + batch (dict): batch of data + microbatch_size (int): size of microbatches + + Returns: + list: list of chunked batches """ # There are extra split options in this func for multiple choice chunked = {} @@ -758,12 +828,13 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') destination_path (str): Temporary path to store downloaded datasets fewshot_random_seed (int): Random seed used to select fewshot examples + choices_key (str) """ def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, *args, **kwargs) - def construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): + def _construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): """ Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) @@ -797,17 +868,20 @@ def construct_context(self, sample, preceding_text: str = '', add_answer: bool = context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] return context_options - def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: dict): """ Runs text through the tokenizer and handles special cases. 
Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context example (dict): the example as a dictionary. + + Returns: + dict: dictionary with the tokenized data """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) - preamble = self.fix_eos_on_preamble(preamble) + preamble = self._fix_eos_on_preamble(preamble) tokenized_example['preamble'] = preamble tokenized_example['context_options'] = [self.tokenizer(c, add_special_tokens=False) for c in context_options] continuation = example['continuation'] @@ -820,6 +894,11 @@ def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches + Args: + data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + + Returns: + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -926,6 +1005,8 @@ def __init__( def get_max_prompt_length(self): """ Iterates through the dataset and finds the length of the longest prompt + Returns: + int: maximum prompt length """ max_prompt_length = 0 for sample in self.encoded_dataset: @@ -935,15 +1016,18 @@ def get_max_prompt_length(self): ) return max_prompt_length - def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context example (dict): the example as a dictionary. + + Returns: + dict: dictionary with the tokenized data """ - tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['prompt_text'] = example['prompt'] tokenized_example['task_id'] = example['task_id'] tokenized_example['canonical_solution'] = example['canonical_solution'] @@ -957,6 +1041,11 @@ def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches + Args: + data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + + Returns: + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -1026,6 +1115,7 @@ def build_icl_dataloader( pass_at_k: int, generations_per_sample: int, ) -> DataSpec: + # TODO: Should this be some form of registry? 
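    # One possible answer to the TODO above is a simple class registry
    # (illustrative sketch only, not part of this patch; the registry name is
    # hypothetical, while the dataset classes are the ones defined earlier in
    # this module):
    #
    #     _ICL_TASK_DATASETS = {
    #         'multiple_choice': InContextLearningMultipleChoiceTaskDataset,
    #         'schema': InContextLearningSchemaTaskDataset,
    #         'language_modeling': InContextLearningLMTaskDataset,
    #         'question_answering': InContextLearningQATaskDataset,
    #         'code_evaluation': InContextLearningCodeEvalDataset,
    #     }
    #     dataset_cls = _ICL_TASK_DATASETS[icl_task_type]
    #
    # which would replace the if/elif dispatch below with a single lookup.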
if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri=dataset_uri, tokenizer=tokenizer, @@ -1159,7 +1249,6 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo conda_channel='conda-forge', ) from e if dataset_uri.startswith("hf://"): - # TODO: this will also execute in the dataset class, so ensure that the same hf_parsing_map and loading_vars can be used both times cur_dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(cur_dataset_uri, **hf_loading_vars) if hf_parsing_map: @@ -1167,10 +1256,11 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() } dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names) - with dist.local_rank_zero_download_and_wait(destination_path): - if dist.get_local_rank() == 0: - get_file(dataset_uri, destination_path, overwrite=True) - dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) + else: + with dist.local_rank_zero_download_and_wait(destination_path): + if dist.get_local_rank() == 0: + get_file(dataset_uri, destination_path, overwrite=True) + dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) if 'category' not in dataset.features.keys(): raise Exception( f"Attempted to partition dataset by `category` but it doesn't have a `category` key. Got keys: {str(list(dataset.features.keys()))}" @@ -1282,8 +1372,6 @@ def get_icl_task_dataloader( num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter=example_delimiter, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', prelimiter=question_prelimiter, From b354d43e36b72230a3b3452cfa3b5f09d26d2b5a Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 17 Nov 2023 02:46:18 +0000 Subject: [PATCH 023/116] add doc strings and default check --- .../in_context_learning_evaluation.py | 52 +++++++++++++++---- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 1befcd54df..d5cc1c607c 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -206,6 +206,11 @@ def __len__(self): def get_num_samples_in_batch(self, batch: dict) -> int: return batch['input_ids'].shape[0] + def check_defaults_are_set(self, dict_of_defaults:dict): + if all([v for v in dict_of_defaults.values()]): + return + raise ValueError(f"{type(self).__name__} missing required variable(s): {''.join([k for k, v in dict_of_defaults.items() if not v])}") + def _read_dataset( self, dataset_uri: str, @@ -213,6 +218,18 @@ def _read_dataset( hf_loading_vars: dict = None, hf_parsing_map: dict = None ): + """ + Reads a dataset and handles parsing it from HuggingFace. + Args: + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. + Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. 
+ destination_path (str): A local path where the data will be stored + hf_loading_vars (dict): If parsing from HuggingFace, keyword args that will be passed into load_dataset + hf_parsing_map (dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset cols + + Returns: + dataset: a loaded HF dataset + """ try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -412,6 +429,7 @@ def split_batch(self, batch: Any, microbatch_size: int): # Don't split kwargs that don't change # Normally split torch tensors # List split lists of strings + self.check_defaults_are_set({"dont_split_keys":self.dont_split_keys,"list_split_keys":self.list_split_keys,"normal_split_keys":self.normal_split_keys}) chunked = {} for k, v in batch.items(): if k in self.dont_split_keys: @@ -578,6 +596,7 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. + #TODO: Should I only list variables here that are different than InContextLearningDataset? Args: dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "context", @@ -655,6 +674,7 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): 'gold_indices': List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) 'choice_groupings': Indicates which indices of the batch correspond to which questions + #TODO: Should I only list variables here that are different than InContextLearningDataset? Args: dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "query", @@ -676,7 +696,6 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): normal_split_keys=['gold_indices'], *args, **kwargs) - # self.check_defaults_are_set({'num_choices': self.num_choices, 'generations_per_sample':self.generations_per_sample, "top_p": self.top_p,"top_k":self.top_k}) self.num_choices = len(self.dataset[0][choices_key]) self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] @@ -776,7 +795,7 @@ def split_batch(self, batch: Any, microbatch_size: int): Returns: list: list of chunked batches """ - # There are extra split options in this func for multiple choice + self.check_defaults_are_set({"dont_split_keys":self.dont_split_keys,"normal_split_keys":self.normal_split_keys}) chunked = {} for k, v in batch.items(): if k in self.dont_split_keys: @@ -814,6 +833,7 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData 'labels': Identical to the input, used by the model to calculate loss/metrics 'gold_indices': List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) 'choice_groupings': Indicates which indices of the batch correspond to which questions + #TODO: Should I only list variables here that are different than InContextLearningDataset? Args: dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. 
Dataset must consist of rows of JSON data points with "query", @@ -950,6 +970,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - test_inputs: list of test inputs - test_outputs: list of test outputs - language: the language of the code snippet + #TODO: Should I only list variables here that are different than InContextLearningDataset? Args: dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "task_id", @@ -963,9 +984,11 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): destination_path (str): Temporary path to store downloaded datasets code_prelimiter (str): String to put before each code prompt (e.g. 'Q: ') fewshot_random_seed (int): Random seed to use for fewshot sampling - generations_per_sample: how many outputs to generate per prompt - top_p: top_p sampling parameter for nucleus sampling - top_k: top_k sampling parameter for number of samples to consider + generations_per_sample (int): how many outputs to generate per prompt + # TODO: is this correct? + pass_at_k (int): k for how many chances the model gets to write passing code + top_p (int): top_p sampling parameter for nucleus sampling + top_k (int): top_k sampling parameter for number of samples to consider """ def __init__( @@ -977,6 +1000,7 @@ def __init__( *args, **kwargs, ): + self.check_defaults_are_set({'pass_at_k': pass_at_k, 'generations_per_sample': generations_per_sample, "top_p": top_p,"top_k": top_k}) if generations_per_sample < pass_at_k: raise ValueError( f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' @@ -996,6 +1020,7 @@ def __init__( *args, **kwargs, ) + # TODO: add temperature self.pass_at_k = pass_at_k self.generations_per_sample = generations_per_sample self.max_prompt_length = self.get_max_prompt_length() @@ -1062,6 +1087,7 @@ def collate_fn(self, data): 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, + # TODO: specify this? 'num_beams': 1, # single beam 'num_return_sequences': self.generations_per_sample, # how many gens per prompt 'do_sample': True, @@ -1343,13 +1369,15 @@ def get_icl_task_dataloader( example_delimiter (str): Separator that goes between individual examples (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') question_prelimiter: (str): Text to be prepended before each context segement in each eval example. (e.g. 'Q:', 'The following is a paragraph containing...') - hf_loading_vars (dict): - hf_parsing_map (dict): + hf_loading_vars (dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. + hf_parsing_map (Dict[str:List[str]]): A dictionary containing a from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. destination_path: (str): This is the local file where remote datasets will be saved. - fewshot_random_seed (int): - pass_at_k (int): - generations_per_sample (int): - cot_delimiter (str): + fewshot_random_seed (int): Random seed to use for fewshot sampling + # TODO: is this right? 
+ pass_at_k (int): k for how many chances the model gets to write passing code + generations_per_sample (int): how many outputs to generate per prompt + cot_delimiter (str): Delimiter to place between the chain of thought and continuations. has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. Returns: @@ -1379,6 +1407,8 @@ def get_icl_task_dataloader( fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map ) return result_dls else: From 23f8735fe4d486e03d80c6225000697d9526b3dc Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 17 Nov 2023 02:48:22 +0000 Subject: [PATCH 024/116] linting --- .../in_context_learning_evaluation.py | 307 +++++++++--------- 1 file changed, 162 insertions(+), 145 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index d5cc1c607c..ffca89f769 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -124,7 +124,7 @@ class InContextLearningDataset(Dataset): prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: ') - prelimiter (str): Text to be prepended before each example, including few shot examples + prelimiter (str): Text to be prepended before each example, including few shot examples context_key (str): The key from the parsed dataset that the class will use as the "context" (i.e. the main content to be included in the prompt) answer_key (str): The key from the parsed dataset that the class will use as the "answer" (i.e. the main content to be predicted by the model) destination_path (str): Temporary path to store downloaded datasets @@ -135,7 +135,7 @@ class InContextLearningDataset(Dataset): Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. stacked_keys (list(str)): keys in the output batch that must be converted to tensors with torch.stack() dont_split_keys (list(str)): keys in the ICL dictionary that should not be split among batches. 
- list_split_keys (list(str)): keys in the ICL dictionary that will be split as lists, resulting in microbatch_size sections of the list being inserted in every batch + list_split_keys (list(str)): keys in the ICL dictionary that will be split as lists, resulting in microbatch_size sections of the list being inserted in every batch normal_split_keys (list(str)): keys in the ICL dictionary that will be split into chunks regularly """ @@ -206,18 +206,18 @@ def __len__(self): def get_num_samples_in_batch(self, batch: dict) -> int: return batch['input_ids'].shape[0] - def check_defaults_are_set(self, dict_of_defaults:dict): - if all([v for v in dict_of_defaults.values()]): + def check_defaults_are_set(self, dict_of_defaults: dict): + if all(v for v in dict_of_defaults.values()): return - raise ValueError(f"{type(self).__name__} missing required variable(s): {''.join([k for k, v in dict_of_defaults.items() if not v])}") + raise ValueError( + f"{type(self).__name__} missing required variable(s): {''.join([k for k, v in dict_of_defaults.items() if not v])}" + ) - def _read_dataset( - self, - dataset_uri: str, - destination_path: str, - hf_loading_vars: dict = None, - hf_parsing_map: dict = None - ): + def _read_dataset(self, + dataset_uri: str, + destination_path: str, + hf_loading_vars: dict = None, + hf_parsing_map: dict = None): """ Reads a dataset and handles parsing it from HuggingFace. Args: @@ -228,7 +228,7 @@ def _read_dataset( hf_parsing_map (dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset cols Returns: - dataset: a loaded HF dataset + dataset: a loaded HF dataset """ try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] @@ -271,7 +271,7 @@ def _generate_few_shot_text( sample_idx (int): current sample idx preamble (str): text to occur at the beginning of the task. Generally instructions or a prompt. fewshot_rng (random.Random): seeded sampler to chose samples with - + Returns: str: the original preamble with num_fewshot examples appended """ @@ -297,7 +297,7 @@ def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: Returns: - str: The constructed context. The default output context is + str: The constructed context. The default output context is formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}' """ ctxt = sample[self.context_key] @@ -342,7 +342,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctx (str): the specific example's derrived context example (dict): the example as a dictionary. Used for additional processing in inherited classes. 
Returns: @@ -391,9 +391,9 @@ def collate_fn(self, data): The function that the dataloader uses to accumulate data into batches Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) - + Returns: - dict: dictionary for a single batch + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -429,7 +429,11 @@ def split_batch(self, batch: Any, microbatch_size: int): # Don't split kwargs that don't change # Normally split torch tensors # List split lists of strings - self.check_defaults_are_set({"dont_split_keys":self.dont_split_keys,"list_split_keys":self.list_split_keys,"normal_split_keys":self.normal_split_keys}) + self.check_defaults_are_set({ + 'dont_split_keys': self.dont_split_keys, + 'list_split_keys': self.list_split_keys, + 'normal_split_keys': self.normal_split_keys + }) chunked = {} for k, v in batch.items(): if k in self.dont_split_keys: @@ -451,7 +455,7 @@ def split_batch(self, batch: Any, microbatch_size: int): class InContextLearningQATaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning question answering evaluation - QA tasks evaluate a model's ability to answer questions using a consistent format. + QA tasks evaluate a model's ability to answer questions using a consistent format. The input format is expected to be a jsonl file with the following fields: - context: the question @@ -488,12 +492,13 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.max_answer_length = self.get_max_answer_length() - def _read_dataset(self, - dataset_uri: str, - destination_path: str, - hf_loading_vars: dict = None, - hf_parsing_map: dict = None, - ): + def _read_dataset( + self, + dataset_uri: str, + destination_path: str, + hf_loading_vars: dict = None, + hf_parsing_map: dict = None, + ): dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.has_cot = 'chain_of_thought' in dataset.features return dataset.map( @@ -504,7 +509,7 @@ def _read_dataset(self, 'chain_of_thought': examples.get('chain_of_thought', ''), }) - def _get_answer_from_sample(self, sample : dict): + def _get_answer_from_sample(self, sample: dict): """ Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true Args: @@ -523,8 +528,8 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. Returns: dict: dictionary with the tokenized data @@ -557,9 +562,9 @@ def collate_fn(self, data): The function that the dataloader uses to accumulate data into batches Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) - + Returns: - dict: dictionary for a single batch + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -593,7 +598,7 @@ def collate_fn(self, data): class InContextLearningLMTaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning language modeling evaluation. - Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. 
+ Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. #TODO: Should I only list variables here that are different than InContextLearningDataset? @@ -620,8 +625,8 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. Returns: dict: dictionary with the tokenized data @@ -638,9 +643,9 @@ def collate_fn(self, data): The function that the dataloader uses to accumulate data into batches Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) - + Returns: - dict: dictionary for a single batch + dict: dictionary for a single batch """ batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} for data_pair in data: @@ -706,7 +711,7 @@ def _get_answer_from_sample(self, sample: dict): sample (dict): the sample from which to retrieve the answer Returns: - str: the full string of the correct answer based on the 'gold' key + str: the full string of the correct answer based on the 'gold' key """ choices = sample['choices'] gold_idx = sample['gold'] @@ -717,8 +722,8 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. Returns: dict: dictionary with the tokenized data @@ -736,9 +741,9 @@ def collate_fn(self, data): The function that the dataloader uses to accumulate data into batches Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) - + Returns: - dict: dictionary for a single batch + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -795,7 +800,10 @@ def split_batch(self, batch: Any, microbatch_size: int): Returns: list: list of chunked batches """ - self.check_defaults_are_set({"dont_split_keys":self.dont_split_keys,"normal_split_keys":self.normal_split_keys}) + self.check_defaults_are_set({ + 'dont_split_keys': self.dont_split_keys, + 'normal_split_keys': self.normal_split_keys + }) chunked = {} for k, v in batch.items(): if k in self.dont_split_keys: @@ -893,9 +901,9 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. - + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. 
+ Returns: dict: dictionary with the tokenized data """ @@ -916,9 +924,9 @@ def collate_fn(self, data): The function that the dataloader uses to accumulate data into batches Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) - + Returns: - dict: dictionary for a single batch + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -1000,7 +1008,12 @@ def __init__( *args, **kwargs, ): - self.check_defaults_are_set({'pass_at_k': pass_at_k, 'generations_per_sample': generations_per_sample, "top_p": top_p,"top_k": top_k}) + self.check_defaults_are_set({ + 'pass_at_k': pass_at_k, + 'generations_per_sample': generations_per_sample, + 'top_p': top_p, + 'top_k': top_k + }) if generations_per_sample < pass_at_k: raise ValueError( f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' @@ -1046,9 +1059,9 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. - + ctx (str): the specific example's derrived context + example (dict): the example as a dictionary. + Returns: dict: dictionary with the tokenized data """ @@ -1068,9 +1081,9 @@ def collate_fn(self, data): The function that the dataloader uses to accumulate data into batches Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) - + Returns: - dict: dictionary for a single batch + dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -1143,86 +1156,91 @@ def build_icl_dataloader( ) -> DataSpec: # TODO: Should this be some form of registry? 
if icl_task_type == 'multiple_choice': - dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - ) + dataset = InContextLearningMultipleChoiceTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'schema': - dataset = InContextLearningSchemaTaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - ) + dataset = InContextLearningSchemaTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices elif icl_task_type == 'language_modeling': - dataset = InContextLearningLMTaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - ) + dataset = InContextLearningLMTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + ) effective_batchsize = batch_size elif icl_task_type == 'question_answering': - dataset = InContextLearningQATaskDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - 
cot_delimiter=cot_delimiter, - ) + dataset = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + cot_delimiter=cot_delimiter, + ) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': - dataset = InContextLearningCodeEvalDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - ) + dataset = InContextLearningCodeEvalDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + ) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1253,7 +1271,8 @@ def build_icl_dataloader( ) -def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: dict, hf_parsing_map: dict) -> Dict[str, str]: +def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: dict, + hf_parsing_map: dict) -> Dict[str, str]: """If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file. 
Args: @@ -1274,7 +1293,7 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo conda_package='datasets', conda_channel='conda-forge', ) from e - if dataset_uri.startswith("hf://"): + if dataset_uri.startswith('hf://'): cur_dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(cur_dataset_uri, **hf_loading_vars) if hf_parsing_map: @@ -1390,26 +1409,24 @@ def get_icl_task_dataloader( categories = sorted(output_files.keys()) for category in categories: partition_uri = output_files[category] - result_dls[category] = build_icl_dataloader( - icl_task_type=icl_task_type, - dataset_uri=partition_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=partition_uri + '_tmp', - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map - ) + result_dls[category] = build_icl_dataloader(icl_task_type=icl_task_type, + dataset_uri=partition_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=partition_uri + '_tmp', + prelimiter=question_prelimiter, + cot_delimiter=cot_delimiter, + fewshot_random_seed=fewshot_random_seed, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) return result_dls else: return build_icl_dataloader( From 887a31cf6fec562e53f2337ceb4471cad8499cc4 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 17 Nov 2023 04:00:53 +0000 Subject: [PATCH 025/116] add temperature --- .../in_context_learning_evaluation.py | 86 ++++++++++--------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index ffca89f769..0159fbe2f5 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -997,6 +997,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): pass_at_k (int): k for how many chances the model gets to write passing code top_p (int): top_p sampling parameter for nucleus sampling top_k (int): top_k sampling parameter for number of samples to consider + temperature (float): temperture to use while sampling """ def __init__( @@ -1005,6 +1006,7 @@ def __init__( pass_at_k: int = 1, top_p: Optional[float] = 0.95, top_k: Optional[int] = 40, + temperature: Optional[int] = 1.0, *args, **kwargs, ): @@ -1012,7 +1014,8 @@ def __init__( 'pass_at_k': pass_at_k, 'generations_per_sample': generations_per_sample, 'top_p': top_p, - 'top_k': top_k + 'top_k': top_k, + 'temperature': temperature }) if generations_per_sample < pass_at_k: raise ValueError( @@ -1039,6 +1042,7 @@ def __init__( self.max_prompt_length = self.get_max_prompt_length() self.top_p = top_p self.top_k = top_k + self.temperature = temperature def get_max_prompt_length(self): """ @@ -1106,7 +1110,8 @@ def collate_fn(self, data): 'do_sample': True, 'top_p': self.top_p, 'top_k': self.top_k, - 'use_cache': True, + 
'temperature': self.temperature, + 'use_cache': True }, } for sample in data: @@ -1153,6 +1158,7 @@ def build_icl_dataloader( fewshot_random_seed: int, pass_at_k: int, generations_per_sample: int, + temperature: float, ) -> DataSpec: # TODO: Should this be some form of registry? if icl_task_type == 'multiple_choice': @@ -1224,23 +1230,22 @@ def build_icl_dataloader( ) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': - dataset = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - ) + dataset = InContextLearningCodeEvalDataset(dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + temperature=temperature) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1344,6 +1349,7 @@ def get_icl_task_dataloader( destination_path: str = '', fewshot_random_seed: int = 1234, pass_at_k: int = 1, + temperature: float = 1.0, generations_per_sample: int = 1, cot_delimiter: str = '', has_categories: bool = False, @@ -1426,26 +1432,26 @@ def get_icl_task_dataloader( pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map) + hf_parsing_map=hf_parsing_map, + temperature=temperature) return result_dls else: - return build_icl_dataloader( - icl_task_type=icl_task_type, - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - ) + return build_icl_dataloader(icl_task_type=icl_task_type, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=question_prelimiter, + cot_delimiter=cot_delimiter, + fewshot_random_seed=fewshot_random_seed, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + temperature=temperature) From b5afb918d299b5c50bf20bb09f321c2fe5979e5a Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 17 Nov 2023 
08:21:44 +0000 Subject: [PATCH 026/116] remove need for hf:// on hf links --- .../in_context_learning_evaluation.py | 34 +++++++++++++------ .../test_in_context_learning_datasets.py | 4 +-- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 0159fbe2f5..bef14f40cd 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -15,7 +15,7 @@ from composer.core import DataSpec from composer.core.data_spec import _default_split_batch, _split_list -from composer.utils import MissingConditionalImportError, dist, get_file +from composer.utils import MissingConditionalImportError, dist, get_file, parse_uri if TYPE_CHECKING: import transformers @@ -31,6 +31,22 @@ 'get_icl_task_dataloader', ] +def _check_if_huggingface_uri(uri: str) -> bool: + """ + Takes a dataset uri and checks if it's a HuggingFace dataset uri + Returns False if a backend uri is present (ie 's3://', 'oci://') or if the uri is a local file + Returns True otherwise + Args: + uri (str): uri as a string + + Returns: + bool: result of parsing uri as a HF uri + """ + backend, _, _ = parse_uri(uri) + if backend == '': + return not os.path.isfile(uri) + return False + def strip_data(sample): return {k: v.strip() if isinstance(v, str) else v for k, v in sample.items()} @@ -112,7 +128,7 @@ class InContextLearningDataset(Dataset): The input format is expected to be a jsonl file with different fields based on the task or a link to a huggingface dataset. Args: - dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. A local dataset must consist of rows of JSON data points with different fields based on the task. The default keys expected are "context" and "answer". @@ -221,7 +237,7 @@ def _read_dataset(self, """ Reads a dataset and handles parsing it from HuggingFace. Args: - dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. destination_path (str): A local path where the data will be stored hf_loading_vars (dict): If parsing from HuggingFace, keyword args that will be passed into load_dataset @@ -238,8 +254,7 @@ def _read_dataset(self, conda_package='datasets', conda_channel='conda-forge', ) from e - if dataset_uri.startswith('hf://'): - dataset_uri = dataset_uri.replace('hf://', '') + if _check_if_huggingface_uri(dataset_uri): dataset = load_dataset(dataset_uri, **hf_loading_vars) if hf_parsing_map: dataset_parsing_func = lambda example: { @@ -287,7 +302,7 @@ def _generate_few_shot_text( def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False): """ - Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a + Takes a sample and constructs a context. 
Optionally, appends this to preceeding text (such as a prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) Args: @@ -464,7 +479,7 @@ class InContextLearningQATaskDataset(InContextLearningDataset): #TODO: Should I only list variables here that are different than InContextLearningDataset? Args: - dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link with ``hf://`` prepended to it. + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "context", "answer", and "aliases". See tests/datasets/local_data/triviaqa_small.jsonl. tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids @@ -1298,9 +1313,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo conda_package='datasets', conda_channel='conda-forge', ) from e - if dataset_uri.startswith('hf://'): - cur_dataset_uri = dataset_uri.replace('hf://', '') - dataset = load_dataset(cur_dataset_uri, **hf_loading_vars) + if _check_if_huggingface_uri(dataset_uri): + dataset = load_dataset(dataset_uri, **hf_loading_vars) if hf_parsing_map: dataset_parsing_func = lambda example: { k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index e86771e1a9..1ed703df12 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1518,7 +1518,7 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): assert second_batch_without_last_word.count(' UNIQUE ') == 1 -@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/test_dataset']) +@pytest.mark.parametrize('dataset_uri', ['maxisawesome/test_dataset']) @pytest.mark.parametrize('num_fewshot', [0, 1]) @pytest.mark.parametrize('prompt_string', ['Complete the voiceline: ', '']) @pytest.mark.parametrize('hf_loading_vars', [{ @@ -1567,7 +1567,7 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path assert decoded_batch == "Looks like it's just you and me.There's a fine line between bravery and stupidity." -@pytest.mark.parametrize('dataset_uri', ['hf://maxisawesome/test_dataset']) +@pytest.mark.parametrize('dataset_uri', ['maxisawesome/test_dataset']) @pytest.mark.parametrize('num_fewshot', [0, 1]) @pytest.mark.parametrize('prompt_string', ['What spell does this invoke? 
', '']) @pytest.mark.parametrize('hf_loading_vars', [{ From a033fb0ea43fe08d21c80bc08ac291060e73c1b6 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 23 Nov 2023 08:54:54 -0800 Subject: [PATCH 027/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index bef14f40cd..5905249aa1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -123,7 +123,7 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in class InContextLearningDataset(Dataset): - """A base dataset that construct batches for in-context learning task evaluations + """A base dataset that constructs batches for in-context learning task evaluations The input format is expected to be a jsonl file with different fields based on the task or a link to a huggingface dataset. From c43f65b14baf1f6f3f96c0b5808fe172f76195d7 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 23 Nov 2023 08:55:30 -0800 Subject: [PATCH 028/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 5905249aa1..aba6f2350a 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -147,7 +147,7 @@ class InContextLearningDataset(Dataset): strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs, so unless whitespace should be preserved (for example in code), this should be set to True. hf_loading_vars (dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. - hf_parsing_map (Dict[str:List[str]]): A dictionary containing a from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + hf_parsing_map (Dict[str:List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. stacked_keys (list(str)): keys in the output batch that must be converted to tensors with torch.stack() dont_split_keys (list(str)): keys in the ICL dictionary that should not be split among batches. 
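The `hf_parsing_map` documented in the patch above is applied row by row in `_read_dataset` (and in `partition_dataset_by_category`), where each ICL key is built by joining its Hugging Face columns with a single space. A minimal sketch of that behavior on one row; the column names 'passage', 'query', and 'response' are invented for illustration and are not part of the patch:

    hf_parsing_map = {'context': ['passage', 'query'], 'answer': ['response']}
    hf_row = {'passage': 'The capital of France is Paris.',
              'query': 'What is the capital of France?',
              'response': 'Paris'}
    parsed = {icl_key: ' '.join(str(hf_row[col]) for col in cols)
              for icl_key, cols in hf_parsing_map.items()}
    # parsed == {'context': 'The capital of France is Paris. What is the capital of France?',
    #            'answer': 'Paris'}
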
From 20c35425cd27822a2cf5010d3599c983a9a84029 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 23 Nov 2023 09:56:28 -0800 Subject: [PATCH 029/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index aba6f2350a..da49e46f0e 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -125,7 +125,7 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in class InContextLearningDataset(Dataset): """A base dataset that constructs batches for in-context learning task evaluations - The input format is expected to be a jsonl file with different fields based on the task or a link to a huggingface dataset. + The input format is expected to be a jsonl file with different fields based on the task or a link to a Hugging Face dataset. Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. From 16a5d956e56d0b280f60c2a8e20103f22080f8f2 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 23 Nov 2023 09:56:51 -0800 Subject: [PATCH 030/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index da49e46f0e..58a818c6c1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -241,7 +241,7 @@ def _read_dataset(self, Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. 
destination_path (str): A local path where the data will be stored hf_loading_vars (dict): If parsing from HuggingFace, keyword args that will be passed into load_dataset - hf_parsing_map (dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset cols + hf_parsing_map (dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset columns Returns: dataset: a loaded HF dataset From 5a15d68d589700d5afe3dd6f7e8ce0f17590d459 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 23 Nov 2023 09:57:30 -0800 Subject: [PATCH 031/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 58a818c6c1..b3caba325d 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -331,7 +331,7 @@ def _get_answer_from_sample(self, sample: dict): sample (dict): the sample from which to retrieve the answer Returns: - str: the answer in from the sample + str: the answer in the sample """ return sample[self.answer_key] From 0aa6f6b5c779b018ae85827d3771e7b5d9bcf59a Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 23 Nov 2023 16:32:43 -0800 Subject: [PATCH 032/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index b3caba325d..8f40884b1d 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -342,7 +342,7 @@ def _fix_eos_on_preamble(self, preamble: dict): If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, as the specific eval question's prompt will follow the preamble Args: - preamble (dict): a dictionary containing a the tokenized input + preamble (dict): a dictionary containing the tokenized input Returns: dict: the same dictionary with the final token conditionally removed From ce638454de00c79886a9b419de4f0a5ec20c1784 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 24 Nov 2023 01:41:26 +0000 Subject: [PATCH 033/116] fix comments, add test for check hf uri, still wip --- composer/datasets/__init__.py | 7 ++ .../in_context_learning_evaluation.py | 84 +++++++++++-------- .../test_in_context_learning_datasets.py | 11 ++- 3 files changed, 64 insertions(+), 38 deletions(-) diff --git a/composer/datasets/__init__.py b/composer/datasets/__init__.py index 56672e27f3..895b469583 100644 --- a/composer/datasets/__init__.py +++ b/composer/datasets/__init__.py @@ -15,6 +15,7 @@ from composer.datasets.mnist import build_mnist_dataloader, build_synthetic_mnist_dataloader from composer.datasets.synthetic import (SyntheticBatchPairDataset, SyntheticDataLabelType, SyntheticDataType, SyntheticPILDataset) +from composer.datasets.in_context_learning_evaluation import InContextLearningDataset, InContextLearningQATaskDataset, InContextLearningLMTaskDataset, InContextLearningCodeEvalDataset, 
InContextLearningMultipleChoiceTaskDataset, InContextLearningSchemaTaskDataset __all__ = [ 'ADE20k', @@ -24,6 +25,12 @@ 'SyntheticDataLabelType', 'SyntheticDataType', 'SyntheticPILDataset', + 'InContextLearningDataset', + 'InContextLearningQATaskDataset', + 'InContextLearningLMTaskDataset', + 'InContextLearningCodeEvalDataset', + 'InContextLearningMultipleChoiceTaskDataset', + 'InContextLearningSchemaTaskDataset', 'build_ade20k_dataloader', 'build_streaming_ade20k_dataloader', 'build_streaming_c4_dataloader', diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index b3caba325d..c6675ad867 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import torch -import transformers from torch.utils.data import DataLoader, Dataset from composer.core import DataSpec @@ -33,18 +32,21 @@ def _check_if_huggingface_uri(uri: str) -> bool: """ - Takes a dataset uri and checks if it's a HuggingFace dataset uri - Returns False if a backend uri is present (ie 's3://', 'oci://') or if the uri is a local file - Returns True otherwise + Takes a dataset uri and checks if it's a HuggingFace dataset uri. + Returns False if a backend uri is present (ie 's3://', 'oci://') or if the uri is a local file. + Returns True otherwise. Args: uri (str): uri as a string Returns: bool: result of parsing uri as a HF uri """ - backend, _, _ = parse_uri(uri) + backend, _, path = parse_uri(uri) if backend == '': - return not os.path.isfile(uri) + _, ext = os.path.splitext(path) + # If there's any extension, it's a link to a local file. If no extention, HF path + return ext == '' + # If there's any backend, it's a cloud OCI and not HF return False @@ -53,8 +55,10 @@ def strip_data(sample): def _tokenizer_needs_prefix_space(tokenizer) -> bool: - # Test for whether a prefix space is needed before the continuation. - # sentencepiece tokenization should not have a prefix space, but gpt2 style BPE should + """ + Test for whether a prefix space is needed before the continuation. + Sentencepiece tokenization should not have a prefix space, but gpt2 style BPE should. + """ return len(tokenizer(' a', add_special_tokens=False)['input_ids']) == 1 @@ -101,12 +105,18 @@ def _make_padded_input(context_enc, continuation_enc, max_seq_len, pad_tok_id, p def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: int, rng: random.Random): - # samples without replacement. if num_fewshot exceeds the number of unique samples, - # then we will have fewer than num_fewshot examples in context - - # Simpler implementation (but will choose different actual ids which will break some tests) - # possible_fewshot_idxs = [i for i in range(0, dataset_size) if i != sample_idx] - # fewshot_idxs = set(rng.sample(possible_fewshot_idxs, num_fewshot)) + """ + Samples without replacement. If num_fewshot exceeds the number of unique samples, + then we will have fewer than num_fewshot examples in context. 
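Concretely, the "simpler implementation" mentioned in the comment removed above amounts to sampling fewshot indices from the dataset with the current example excluded. The snippet below is only a sketch of that exclusion behavior, not the exact replacement scheme this function uses, and the seed and sizes are arbitrary:

    import random
    rng = random.Random(1234)
    dataset_size, num_fewshot, sample_idx = 5, 2, 3
    candidates = [i for i in range(dataset_size) if i != sample_idx]
    fewshot_idxs = set(rng.sample(candidates, min(num_fewshot, len(candidates))))
    # fewshot_idxs holds 2 indices drawn from {0, 1, 2, 4}; index 3 can never appear
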
+ Args: + dataset_size (int): length of the dataset + num_fewshot (int): number of examples to prepend + sample_idx (int): current sample index (excluded from fewshot choices) + rng (random.Random): rng for repeatable sample selection + + Returns: + list: indices of the examples chosen for fewshot selection + """ num_fewshot = min(dataset_size - 1, num_fewshot) fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot)) @@ -403,7 +413,7 @@ def _prep_example( def collate_fn(self, data): """ - The function that the dataloader uses to accumulate data into batches + The function that the dataloader uses to accumulate data into batches. Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -432,7 +442,7 @@ def collate_fn(self, data): def split_batch(self, batch: Any, microbatch_size: int): """ - Handling for certain specialty columns that must be split into batches in different formats + Handling for certain specialty columns that must be split into batches in different formats. Args: batch (dict): batch of data @@ -469,7 +479,7 @@ def split_batch(self, batch: Any, microbatch_size: int): class InContextLearningQATaskDataset(InContextLearningDataset): - """A dataset that construct batches for in-context learning question answering evaluation + """A dataset that construct batches for in-context learning question answering evaluation. QA tasks evaluate a model's ability to answer questions using a consistent format. The input format is expected to be a jsonl file with the following fields: @@ -526,7 +536,7 @@ def _read_dataset( def _get_answer_from_sample(self, sample: dict): """ - Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true + Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true. Args: sample (dict): the sample from which to retrieve the answer @@ -555,7 +565,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def get_max_answer_length(self): f""" - Loops over the dataset and finds the longes answer length + Loops over the dataset and finds the longes answer length. Returns: int: the maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present @@ -574,7 +584,7 @@ def get_max_answer_length(self): def collate_fn(self, data): """ - The function that the dataloader uses to accumulate data into batches + The function that the dataloader uses to accumulate data into batches. Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -655,7 +665,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def collate_fn(self, data): """ - The function that the dataloader uses to accumulate data into batches + The function that the dataloader uses to accumulate data into batches. Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -679,13 +689,13 @@ def collate_fn(self, data): class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): - """A dataset that construct batches for in-context learning multiple choice evaluation + """A dataset that construct batches for in-context learning multiple choice evaluation. If each question has N answer choices, we construct N distinct inputs per question. In order to ensure consistency across multi-GPU, we set the batch size to be `min(N, batch_size)` so that all N inputs per question can stored in the same batch. 
- Each batch then consists of batch_size // N distinct questions and has the following the structure + Each batch then consists of batch_size // N distinct questions and has the following the structure. 'input_ids': Input tensor batch x seqlen x # tokens 'continuation_indices': List of |batch| consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) @@ -721,7 +731,7 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): def _get_answer_from_sample(self, sample: dict): """ - Returns the correct answer from the sample's choices + Returns the correct answer from the sample's choices. Args: sample (dict): the sample from which to retrieve the answer @@ -753,7 +763,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def collate_fn(self, data): """ - The function that the dataloader uses to accumulate data into batches + The function that the dataloader uses to accumulate data into batches. Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -845,7 +855,7 @@ def split_batch(self, batch: Any, microbatch_size: int): class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset): - """A dataset that constructs batches for in-context learning schema evaluation + """A dataset that constructs batches for in-context learning schema evaluation. A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847 to determine the model's choice of fill-in word. @@ -880,7 +890,7 @@ def __init__(self, choices_key='context_options', *args, **kwargs): def _construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): """ Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a - prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) + prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples). Args: sample (dict): the sample from which to construct the context @@ -892,7 +902,6 @@ def _construct_context(self, sample, preceding_text: str = '', add_answer: bool context_options = sample['context_options'] gold_idx = sample['gold'] continuation = sample['continuation'] - assert isinstance(gold_idx, int) if add_answer: context = context_options[gold_idx] if len(preceding_text) > 0: @@ -936,7 +945,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], def collate_fn(self, data): """ - The function that the dataloader uses to accumulate data into batches + The function that the dataloader uses to accumulate data into batches. Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -982,7 +991,7 @@ def collate_fn(self, data): class InContextLearningCodeEvalDataset(InContextLearningDataset): - """A dataset that constructs batches for in-context learning code evaluation + """A dataset that constructs batches for in-context learning code evaluation. The input format is expected to be a jsonl file with the following fields: - task_id: label of given task @@ -1061,7 +1070,7 @@ def __init__( def get_max_prompt_length(self): """ - Iterates through the dataset and finds the length of the longest prompt + Iterates through the dataset and finds the length of the longest prompt. 
Returns: int: maximum prompt length """ @@ -1097,7 +1106,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): def collate_fn(self, data): """ - The function that the dataloader uses to accumulate data into batches + The function that the dataloader uses to accumulate data into batches. Args: data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -1175,7 +1184,6 @@ def build_icl_dataloader( generations_per_sample: int, temperature: float, ) -> DataSpec: - # TODO: Should this be some form of registry? if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset( dataset_uri=dataset_uri, @@ -1260,7 +1268,8 @@ def build_icl_dataloader( hf_parsing_map=hf_parsing_map, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, - temperature=temperature) + temperature=temperature, + ) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1345,7 +1354,6 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo return output_files -# TODO: Where do we want to set our defaults? def get_icl_task_dataloader( icl_task_type: str, dataset_uri: str, @@ -1447,7 +1455,8 @@ def get_icl_task_dataloader( generations_per_sample=generations_per_sample, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - temperature=temperature) + temperature=temperature, + ) return result_dls else: return build_icl_dataloader(icl_task_type=icl_task_type, @@ -1468,4 +1477,5 @@ def get_icl_task_dataloader( fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, - temperature=temperature) + temperature=temperature, + ) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 1ed703df12..dc3058285d 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -17,7 +17,7 @@ from composer.core import DataSpec from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, _get_fewshot_sample_idxs, _make_padded_input, - get_icl_task_dataloader) + get_icl_task_dataloader, _check_if_huggingface_uri) from composer.loggers import InMemoryLogger from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) @@ -75,6 +75,15 @@ def test_batch_padding_logic(tiny_gpt2_tokenizer): assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047 +@pytest.mark.parametrize('uri', ['tests/datasets/local_data/hellaswag_small.jsonl', 's3://oci/url/link.json', 'gcs://blah/blah.json']) +def test_check_if_huggingface_uri_when_not_hf_uri(uri): + assert not _check_if_huggingface_uri(uri) + + +@pytest.mark.parametrize('uri', ['L4NLP/LEval', 'mosaicml/instruct-v3']) +def test_check_if_huggingface_uri_when_hf_uri(uri): + assert _check_if_huggingface_uri(uri) + @pytest.mark.parametrize('padding_side', ['left', 'right', 'middle']) def test_make_padding(tiny_gpt2_tokenizer, padding_side): context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] From 8fa8da9eb0b55820e88a334a0ebd67042dd1f6a4 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 24 Nov 2023 02:39:44 +0000 Subject: [PATCH 034/116] add gpu tests back --- .../test_in_context_learning_datasets.py | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git 
a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index dc3058285d..7696a404fc 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -24,8 +24,7 @@ from composer.models import HuggingFaceModel from composer.trainer import Trainer from composer.utils import dist, reproducibility - -# from tests.common import device, world_size +from tests.common import device, world_size def test_fewshot_sample_idxs(): @@ -1000,7 +999,8 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_lm_task_evaluation(dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path): +@device('gpu') +def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1083,7 +1083,9 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') -def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_model, tiny_gpt2_tokenizer, tmp_path): +@device('gpu') +@world_size(1, 2) +def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1133,7 +1135,9 @@ def test_mc_task_evaluation_subcategories(dataset_uri, num_fewshot, tiny_gpt2_mo @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model): +@device('gpu') +@world_size(1, 2) +def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1178,7 +1182,9 @@ def test_mc_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_p @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_qa_task_evaluation_opt_tokenizer(num_fewshot, dataset_uri, tmp_path): +@device('gpu') +@world_size(1, 2) +def test_qa_task_evaluation_opt_tokenizer(device, world_size, num_fewshot, dataset_uri, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1218,7 +1224,9 @@ def test_qa_task_evaluation_opt_tokenizer(num_fewshot, dataset_uri, tmp_path): @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [5]) -def test_qa_task_evaluation_with_cot_opt_tokenizer(num_fewshot, dataset_uri, tmp_path): +@device('gpu') +@world_size(1, 2) +def 
test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, num_fewshot, dataset_uri, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1259,7 +1267,9 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(num_fewshot, dataset_uri, tmp @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): +@device('gpu') +@world_size(1, 2) +def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1299,7 +1309,9 @@ def test_qa_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_ @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [5]) -def test_qa_task_with_cot_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): +@device('gpu') +@world_size(1, 2) +def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1353,7 +1365,9 @@ def test_code_eval_requires_valid_envvar(monkeypatch): @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0]) @pytest.mark.parametrize('generations_per_sample', range(1, 3)) -def test_code_eval_microbatching(monkeypatch, num_fewshot, dataset_uri, tmp_path, generations_per_sample): +@device('gpu') +@world_size(1, 2) +def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, dataset_uri, tmp_path, generations_per_sample): pytest.importorskip('datasets') monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger @@ -1400,7 +1414,9 @@ def test_code_eval_microbatching(monkeypatch, num_fewshot, dataset_uri, tmp_path @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0]) @pytest.mark.parametrize('generations_per_sample', range(1, 3)) -def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_t5_tokenizer, tiny_t5_model, +@device('gpu') +@world_size(1, 2) +def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer, tiny_t5_model, tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() @@ -1446,7 +1462,9 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, num_fewshot, dataset_uri, t @pytest.mark.parametrize('num_fewshot', [0, 2]) @pytest.mark.parametrize('generations_per_sample', [1]) @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') -def test_code_eval_task_evaluation(monkeypatch, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, +@device('gpu') +@world_size(1, 2) +def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path, 
generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() From 49c8b7db743940b15cc6ef73a4b717fb8ef7b914 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 24 Nov 2023 03:29:41 +0000 Subject: [PATCH 035/116] update fix_eos_on_preamble --- .../in_context_learning_evaluation.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index c527a725f1..4953eb36bc 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -317,8 +317,8 @@ def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: Args: sample (dict): the sample from which to construct the context - preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning - add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) + preceding_text (str): any preceding text, used as a check for prepending self.example_delimiter + add_answer (bool): bool for whether or not to add the answer on the end of the context (e.g. for fewshot examples) Returns: @@ -345,22 +345,22 @@ def _get_answer_from_sample(self, sample: dict): """ return sample[self.answer_key] - def _fix_eos_on_preamble(self, preamble: dict): + def _fix_eos_on_preamble(self, input_ids: str): """ - If the preamble is empty then preamble['input_ids'] will be a 0-length list, + If the input_ids is empty then input_ids['input_ids'] will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, - as the specific eval question's prompt will follow the preamble + as the specific eval question's prompt will follow theinput_ids Args: - preamble (dict): a dictionary containing the tokenized input + input_ids (list): the tokenized input Returns: - dict: the same dictionary with the final token conditionally removed + input_ids: the tokenized input conditionally edited """ - if (self.tokenizer.eos_token_id is not None and len(preamble['input_ids']) > 1 and - preamble['input_ids'][-1] == self.tokenizer.eos_token_id): - preamble['input_ids'] = preamble['input_ids'][:-1] - return preamble + if (self.tokenizer.eos_token_id is not None and len(input_ids) > 1 and + input_ids[-1] == self.tokenizer.eos_token_id): + input_ids = input_ids[:-1] + return input_ids def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ @@ -375,7 +375,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) - preamble = self._fix_eos_on_preamble(preamble) + preamble['input_ids'] = self._fix_eos_on_preamble(preamble['input_ids']) tokenized_example['preamble'] = preamble if self.strip_data: # rstrip context because a prompt ending in a space results in degenerate output @@ -933,7 +933,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) - preamble = self._fix_eos_on_preamble(preamble) + preamble['input_ids'] = self._fix_eos_on_preamble(preamble['input_ids']) tokenized_example['preamble'] = preamble tokenized_example['context_options'] = [self.tokenizer(c, add_special_tokens=False) for c in context_options] 
continuation = example['continuation'] From e62e12be7ce9c322c6d8b6b32a2c5061846ce830 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sat, 25 Nov 2023 02:32:04 +0000 Subject: [PATCH 036/116] update comments --- .../in_context_learning_evaluation.py | 287 ++++++++---------- 1 file changed, 131 insertions(+), 156 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 4953eb36bc..2687b9e9cc 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -30,6 +30,7 @@ 'get_icl_task_dataloader', ] + def _check_if_huggingface_uri(uri: str) -> bool: """ Takes a dataset uri and checks if it's a HuggingFace dataset uri. @@ -39,7 +40,7 @@ def _check_if_huggingface_uri(uri: str) -> bool: uri (str): uri as a string Returns: - bool: result of parsing uri as a HF uri + bool: result of parsing uri as a HF uri """ backend, _, path = parse_uri(uri) if backend == '': @@ -113,7 +114,7 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in num_fewshot (int): number of examples to prepend sample_idx (int): current sample index (excluded from fewshot choices) rng (random.Random): rng for repeatable sample selection - + Returns: list: indices of the examples chosen for fewshot selection """ @@ -350,17 +351,17 @@ def _fix_eos_on_preamble(self, input_ids: str): If the input_ids is empty then input_ids['input_ids'] will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, - as the specific eval question's prompt will follow theinput_ids + as the specific eval question's prompt will follow theinput_ids Args: input_ids (list): the tokenized input Returns: - input_ids: the tokenized input conditionally edited + input_ids: the tokenized input conditionally edited """ if (self.tokenizer.eos_token_id is not None and len(input_ids) > 1 and input_ids[-1] == self.tokenizer.eos_token_id): input_ids = input_ids[:-1] - return input_ids + return input_ids def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ @@ -487,21 +488,9 @@ class InContextLearningQATaskDataset(InContextLearningDataset): - answer: the preferred answer to the question - aliases: a list of aliases for the answer - #TODO: Should I only list variables here that are different than InContextLearningDataset? - Args: - dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset link. - Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. - Dataset must consist of rows of JSON data points with "context", "answer", and "aliases". See tests/datasets/local_data/triviaqa_small.jsonl. - tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids - max_seq_len (int): The maximum sequence length supported by the model - pad_tok_id (int): The special token reserved for padding batches - num_fewshot (int): The number of complete fewshot examples to prepend before each test example - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. 
'\n') - continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: ') - destination_path (str): Temporary path to store downloaded datasets - prelimiter (str): String to put before each question (e.g. 'Q: ') - fewshot_random_seed (int): Random seed to use for fewshot sampling + See InContextLearningDataset for more details. + + Additional Args: cot_delimiter (str): Delimiter to place between the chain of thought and continuations. """ @@ -625,21 +614,11 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning language modeling evaluation. Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. + The input format is expected to be a jsonl file with the following fields: + - context: preceding text + - continuation: the expected continuation - #TODO: Should I only list variables here that are different than InContextLearningDataset? - Args: - dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend - supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "context", - and "continuation". See tests/datasets/local_data/lambada_small.jsonl. - tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to transform data into batches - max_seq_len (int): The sequence length expected by the model - pad_tok_id (int): The special token reserved for padding the ends of batches - num_fewshot (int): The number of complete fewshot examples to prepend before each test example - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str): Separator that goes between individual (context, continuation) pairs (e.g. '\n') - continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') - destination_path (str): Temporary path to store downloaded datasets - fewshot_random_seed (int): Random seed used to select fewshot examples + See InContextLearningDataset for more details. """ def __init__(self, *args, **kwargs): @@ -695,29 +674,21 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): consistency across multi-GPU, we set the batch size to be `min(N, batch_size)` so that all N inputs per question can stored in the same batch. - Each batch then consists of batch_size // N distinct questions and has the following the structure. 
- - 'input_ids': Input tensor batch x seqlen x # tokens - 'continuation_indices': List of |batch| consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) - 'mode': Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - 'labels': Identical to the input, used by the model to calculate loss/metrics - 'gold_indices': List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) - 'choice_groupings': Indicates which indices of the batch correspond to which questions + The default input format is a jsonl file with the following fields: + - query: the preceding text, question, or document relevant to the choices + - gold: index of the correct choice under 'choices' + - choices: a list of strings, each being one of the potential choices - #TODO: Should I only list variables here that are different than InContextLearningDataset? - Args: - dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend - supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "query", - "choices", and "gold" index. See tests/datasets/local_data/piqa_small.jsonl. - tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to transform data into batches - max_seq_len (int): The sequence length expected by the model - pad_tok_id (int): The special token reserved for padding the ends of batches - num_fewshot (int): The number of complete fewshot examples to prepend before each test example - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str): Separator that goes between individual (context, continuation) pairs (e.g. '\n') - continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') - destination_path (str): Temporary path to store downloaded datasets - fewshot_random_seed (int): Random seed used to select fewshot examples + Each batch then consists of batch_size // N distinct questions and has the following the structure. + - input_ids: Input tensor batch x seqlen x # tokens + - continuation_indices: List of |batch| consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) + - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics + - labels: Identical to the input, used by the model to calculate loss/metrics + - gold_indices: List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) + - choice_groupings: Indicates which indices of the batch correspond to which questions + + Additional Args: + choices_key (str): the key under which the choices are stored in the saved dataset. Defaults to 'choices'. """ def __init__(self, choices_key: str = 'choices', *args, **kwargs): @@ -859,29 +830,19 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847 to determine the model's choice of fill-in word. 
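A minimal sketch of what that partial-evaluation rule amounts to: the continuation is shared across all context options, so the chosen option is the one under which the model assigns the continuation tokens the highest total log-likelihood. The per-token log-probabilities below are invented for illustration; in practice they would be gathered from the model's logits over the continuation span of each option:

    import torch

    # made-up log-probs of the shared continuation tokens under two context options
    continuation_logprobs = [
        torch.tensor([-0.7, -1.1, -0.4]),  # option 0
        torch.tensor([-2.3, -1.9, -2.8]),  # option 1
    ]
    scores = torch.stack([lp.sum() for lp in continuation_logprobs])
    predicted_gold = int(scores.argmax())  # 0: option 0 makes the continuation more likely
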
+ + The default input format is a jsonl file with the following fields: + - context_options: list of strings corresponding to possible preceding context options for the continuation + - gold: index of the correct context from 'context_options' + - continuation: the finishing continuation + Each batch then consists of batch_size // N distinct tasks and has the following the structure - 'input_ids': Input tensor batch x seqlen x # tokens - 'continuation_indices': List of |batch| consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) - 'mode': Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - 'labels': Identical to the input, used by the model to calculate loss/metrics - 'gold_indices': List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) - 'choice_groupings': Indicates which indices of the batch correspond to which questions - #TODO: Should I only list variables here that are different than InContextLearningDataset? - Args: - dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend - supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "query", - "choices", and "gold" index. See tests/datasets/local_data/piqa_small.jsonl. - tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to transform data into batches - batch_size (int): Size of a batch used for eval - max_seq_len (int): The sequence length expected by the model - pad_tok_id (int): The special token reserved for padding the ends of batches - num_fewshot (int): The number of complete fewshot examples to prepend before each test example - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str): Separator that goes between individual (context, continuation) pairs (e.g. '\n') - continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') - destination_path (str): Temporary path to store downloaded datasets - fewshot_random_seed (int): Random seed used to select fewshot examples - choices_key (str) + - input_ids: Input tensor batch x seqlen x # tokens + - continuation_indices: List of |batch| consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) + - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics + - labels: Identical to the input, used by the model to calculate loss/metrics + - gold_indices: List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) + - choice_groupings: Indicates which indices of the batch correspond to which questions """ def __init__(self, choices_key='context_options', *args, **kwargs): @@ -993,7 +954,7 @@ def collate_fn(self, data): class InContextLearningCodeEvalDataset(InContextLearningDataset): """A dataset that constructs batches for in-context learning code evaluation. 
- The input format is expected to be a jsonl file with the following fields: + The default input format is expected to be a jsonl file with the following fields: - task_id: label of given task - prompt: the code snippet that must be completed - entry_point: the entry to the function/code snippet to generate @@ -1002,26 +963,37 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - test_inputs: list of test inputs - test_outputs: list of test outputs - language: the language of the code snippet - #TODO: Should I only list variables here that are different than InContextLearningDataset? - Args: - dataset_uri (str): Either a local path, or a remote path beginning with ``s3://``, or another backend - supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. Dataset must consist of rows of JSON data points with "task_id", - "prompt", "entry_point", "canonical_solution", "test", "test_inputs", and "test_outputs". See tests/datasets/local_data/human_eval_small.jsonl. - tokenizer (Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]): The tokenizer used to map between strings and token ids - max_seq_len (int): The maximum sequence length supported by the model - pad_tok_id (int): The special token reserved for padding batches - num_fewshot (int): The number of complete fewshot examples to prepend before each test example - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n') - destination_path (str): Temporary path to store downloaded datasets - code_prelimiter (str): String to put before each code prompt (e.g. 'Q: ') - fewshot_random_seed (int): Random seed to use for fewshot sampling - generations_per_sample (int): how many outputs to generate per prompt - # TODO: is this correct? - pass_at_k (int): k for how many chances the model gets to write passing code - top_p (int): top_p sampling parameter for nucleus sampling - top_k (int): top_k sampling parameter for number of samples to consider - temperature (float): temperture to use while sampling + + Each batch then consists of the following the structure + - input_ids: Input tensor batch x seqlen x # tokens + - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics + - mode: always set to 'generate' + - labels: + - prompts: + - cannonical_solutions + - entry_points: list of entry points + - test_inputs: list of test inputs + - test_outputs: list of test outputs + - languages: list of languages + - pass_at_k: passed value for pass_at_k + - generation_length: derrived maximum generation length + - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following: + - pad_token_id: ID for padding token, derived automatically + - num_beams: how many beams to search for generations, always set to 1 + - num_return_sequences: value passed for 'generations_per_sample', how many generations per prompt + - do_sample: set to True, whether or not the model is sampling (#TODO: explain this better) + - top_p: passed top_p + - top_k: passed top_k + - temperature: passed temperature + - use_cache: True (#TODO explain this) + + Additional Args: + # TODO: are these correct? 
+ generations_per_sample (int) (defaults to 1): how many outputs to generate per prompt + pass_at_k (int) (defaults to 1): k for how many chances the model gets to write passing code + top_p (int) (defaults to 0.95): top_p sampling parameter for nucleus sampling + top_k (int) (defaults to 40): top_k sampling parameter for number of samples to consider + temperature (float) (defaults to 1.0): temperature to use while sampling """ def __init__( @@ -1253,23 +1225,24 @@ def build_icl_dataloader( ) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': - dataset = InContextLearningCodeEvalDataset(dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - temperature=temperature, - ) + dataset = InContextLearningCodeEvalDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + temperature=temperature, + ) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1437,45 +1410,47 @@ def get_icl_task_dataloader( categories = sorted(output_files.keys()) for category in categories: partition_uri = output_files[category] - result_dls[category] = build_icl_dataloader(icl_task_type=icl_task_type, - dataset_uri=partition_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=partition_uri + '_tmp', - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - temperature=temperature, - ) + result_dls[category] = build_icl_dataloader( + icl_task_type=icl_task_type, + dataset_uri=partition_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=partition_uri + '_tmp', + prelimiter=question_prelimiter, + cot_delimiter=cot_delimiter, + fewshot_random_seed=fewshot_random_seed, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + temperature=temperature, + ) return result_dls else: - return build_icl_dataloader(icl_task_type=icl_task_type, - dataset_uri=dataset_uri, - tokenizer=tokenizer, - batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - 
num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - temperature=temperature, - ) + return build_icl_dataloader( + icl_task_type=icl_task_type, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=question_prelimiter, + cot_delimiter=cot_delimiter, + fewshot_random_seed=fewshot_random_seed, + pass_at_k=pass_at_k, + generations_per_sample=generations_per_sample, + temperature=temperature, + ) From c98ca7d57c3523bc8d194b4b7d1abd19126490ae Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sat, 25 Nov 2023 03:00:27 +0000 Subject: [PATCH 037/116] add return types --- .../in_context_learning_evaluation.py | 63 ++++++++++--------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 2687b9e9cc..d231fc29bd 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -7,7 +7,7 @@ import json import os import random -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Optional, Union import torch from torch.utils.data import DataLoader, Dataset @@ -51,7 +51,7 @@ def _check_if_huggingface_uri(uri: str) -> bool: return False -def strip_data(sample): +def strip_data(sample: dict) -> dict: return {k: v.strip() if isinstance(v, str) else v for k, v in sample.items()} @@ -63,7 +63,8 @@ def _tokenizer_needs_prefix_space(tokenizer) -> bool: return len(tokenizer(' a', add_special_tokens=False)['input_ids']) == 1 -def _make_padded_input(context_enc, continuation_enc, max_seq_len, pad_tok_id, padding_side='right'): +def _make_padded_input(context_enc: List, continuation_enc: List, max_seq_len: int, pad_tok_id: int, padding_side: str = 'right') -> Tuple[torch.tensor, torch.tensor]: + # TODO: docstring if len(continuation_enc) + len(context_enc) > max_seq_len: # clip from the end context_max_subseq_len = max_seq_len - len(continuation_enc) @@ -105,7 +106,7 @@ def _make_padded_input(context_enc, continuation_enc, max_seq_len, pad_tok_id, p return inp, continuation_span -def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: int, rng: random.Random): +def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: int, rng: random.Random) -> List[int]: """ Samples without replacement. If num_fewshot exceeds the number of unique samples, then we will have fewer than num_fewshot examples in context. 
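# Illustrative sketch (not part of the patch): roughly how the few-shot index sampling
# annotated above behaves, written as a standalone function. It assumes only what the
# docstring states: draw `num_fewshot` distinct indices, avoid the index of the sample
# being evaluated, and fall back to fewer examples when the dataset is too small.
import random

def sample_fewshot_idxs(dataset_size: int, num_fewshot: int, sample_idx: int, rng: random.Random) -> set:
    num_fewshot = min(dataset_size - 1, num_fewshot)  # leave room for the eval sample itself
    idxs = set(rng.sample(range(dataset_size), num_fewshot))
    if sample_idx in idxs:
        # swap the eval sample out for an index that is not yet used
        replacement = rng.choice([i for i in range(dataset_size) if i not in idxs])
        idxs.discard(sample_idx)
        idxs.add(replacement)
    return idxs

print(sample_fewshot_idxs(dataset_size=10, num_fewshot=3, sample_idx=2, rng=random.Random(1234)))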
@@ -224,22 +225,24 @@ def __init__( }, ) - def __getitem__(self, index: int): + def __getitem__(self, index: int) -> Dict: return self.encoded_dataset[index] - def __len__(self): + def __len__(self) -> int: return len(self.encoded_dataset) def get_num_samples_in_batch(self, batch: dict) -> int: return batch['input_ids'].shape[0] - def check_defaults_are_set(self, dict_of_defaults: dict): + def check_defaults_are_set(self, dict_of_defaults: dict) -> None: if all(v for v in dict_of_defaults.values()): return raise ValueError( f"{type(self).__name__} missing required variable(s): {''.join([k for k, v in dict_of_defaults.items() if not v])}" ) + # TODO conditionally return dataset type? + # TODO make all type checking Dict and List or dict and list def _read_dataset(self, dataset_uri: str, destination_path: str, @@ -311,7 +314,7 @@ def _generate_few_shot_text( return few_shot_text - def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False): + def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False) -> str: """ Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) @@ -335,7 +338,7 @@ def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: ctxt = f'{ctxt}{self._get_answer_from_sample(sample)}' return ctxt - def _get_answer_from_sample(self, sample: dict): + def _get_answer_from_sample(self, sample: Dict[str, Any]) -> str: """ Returns the answer from the sample Args: @@ -346,7 +349,7 @@ def _get_answer_from_sample(self, sample: dict): """ return sample[self.answer_key] - def _fix_eos_on_preamble(self, input_ids: str): + def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: """ If the input_ids is empty then input_ids['input_ids'] will be a 0-length list, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) @@ -363,7 +366,7 @@ def _fix_eos_on_preamble(self, input_ids: str): input_ids = input_ids[:-1] return input_ids - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: @@ -412,7 +415,8 @@ def _prep_example( tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example - def collate_fn(self, data): + # TODO: confirm this typing? + def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. Args: @@ -441,7 +445,7 @@ def collate_fn(self, data): batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def split_batch(self, batch: Any, microbatch_size: int): + def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: """ Handling for certain specialty columns that must be split into batches in different formats. @@ -523,7 +527,7 @@ def _read_dataset( 'chain_of_thought': examples.get('chain_of_thought', ''), }) - def _get_answer_from_sample(self, sample: dict): + def _get_answer_from_sample(self, sample: dict) -> str: """ Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true. 
Args: @@ -537,7 +541,7 @@ def _get_answer_from_sample(self, sample: dict): else: return sample[self.answer_key] - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: @@ -552,7 +556,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): tokenized_example['aliases'] = list(example.get('aliases', [])) return tokenized_example - def get_max_answer_length(self): + def get_max_answer_length(self) -> int: f""" Loops over the dataset and finds the longes answer length. @@ -571,7 +575,7 @@ def get_max_answer_length(self): max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length - def collate_fn(self, data): + def collate_fn(self, data: dict) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. Args: @@ -624,7 +628,7 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', *args, **kwargs) - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: @@ -642,7 +646,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): tokenized_example['continuation'] = self.tokenizer(cont, add_special_tokens=False) return tokenized_example - def collate_fn(self, data): + def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. Args: @@ -700,7 +704,7 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): self.num_choices = len(self.dataset[0][choices_key]) self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] - def _get_answer_from_sample(self, sample: dict): + def _get_answer_from_sample(self, sample: dict) -> str: """ Returns the correct answer from the sample's choices. Args: @@ -713,7 +717,7 @@ def _get_answer_from_sample(self, sample: dict): gold_idx = sample['gold'] return choices[gold_idx] - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: @@ -732,7 +736,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): tokenized_example['gold'] = example['gold'] return tokenized_example - def collate_fn(self, data): + def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. Args: @@ -781,7 +785,7 @@ def collate_fn(self, data): def get_num_samples_in_batch(self, batch) -> int: return batch['input_ids'].shape[0] // self.num_choices - def split_batch(self, batch: Any, microbatch_size: int): + def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: """Split batch while ensuring all continuations are in the same microbatch. In ICL Multiple Choice, we duplicate each data point for each possible continuation. 
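# Illustrative sketch (not part of the patch): the logical-vs-real sample arithmetic that the
# multiple-choice split_batch docstring above describes. Each question is duplicated once per
# answer choice, so a microbatch of `microbatch_size` logical samples holds
# `microbatch_size * num_choices` real rows. The tensor shapes here are made up for the example.
import torch

num_choices, microbatch_size = 4, 2
input_ids = torch.zeros(8 * num_choices, 16, dtype=torch.long)  # 8 logical questions, 4 rows each

real_rows = microbatch_size * num_choices
microbatches = input_ids.split(real_rows)
print(len(microbatches), microbatches[0].shape)  # 4 microbatches, each torch.Size([8, 16])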
@@ -848,7 +852,7 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, *args, **kwargs) - def _construct_context(self, sample, preceding_text: str = '', add_answer: bool = False): + def _construct_context(self, sample, preceding_text: str = '', add_answer: bool = False) -> str: """ Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples). @@ -881,7 +885,7 @@ def _construct_context(self, sample, preceding_text: str = '', add_answer: bool context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] return context_options - def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: @@ -904,7 +908,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], tokenized_example['gold'] = example['gold'] return tokenized_example - def collate_fn(self, data): + def collate_fn(self, data) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. Args: @@ -1032,7 +1036,6 @@ def __init__( *args, **kwargs, ) - # TODO: add temperature self.pass_at_k = pass_at_k self.generations_per_sample = generations_per_sample self.max_prompt_length = self.get_max_prompt_length() @@ -1040,7 +1043,7 @@ def __init__( self.top_k = top_k self.temperature = temperature - def get_max_prompt_length(self): + def get_max_prompt_length(self) -> int: """ Iterates through the dataset and finds the length of the longest prompt. Returns: @@ -1054,7 +1057,7 @@ def get_max_prompt_length(self): ) return max_prompt_length - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: @@ -1076,7 +1079,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): tokenized_example['language'] = example['language'] return tokenized_example - def collate_fn(self, data): + def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
Args: From b3635ab30da63f47439aaad4c31bd5a4fe635af7 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sun, 26 Nov 2023 14:57:44 +0000 Subject: [PATCH 038/116] typing, comments --- .../in_context_learning_evaluation.py | 149 +++++++++--------- 1 file changed, 74 insertions(+), 75 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index d231fc29bd..c8ab08f9ca 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -51,11 +51,11 @@ def _check_if_huggingface_uri(uri: str) -> bool: return False -def strip_data(sample: dict) -> dict: +def strip_data(sample: Dict) -> Dict: return {k: v.strip() if isinstance(v, str) else v for k, v in sample.items()} -def _tokenizer_needs_prefix_space(tokenizer) -> bool: +def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBase) -> bool: """ Test for whether a prefix space is needed before the continuation. Sentencepiece tokenization should not have a prefix space, but gpt2 style BPE should. @@ -158,13 +158,13 @@ class InContextLearningDataset(Dataset): destination_path (str): Temporary path to store downloaded datasets strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs, so unless whitespace should be preserved (for example in code), this should be set to True. - hf_loading_vars (dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. + hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict[str:List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. - stacked_keys (list(str)): keys in the output batch that must be converted to tensors with torch.stack() - dont_split_keys (list(str)): keys in the ICL dictionary that should not be split among batches. - list_split_keys (list(str)): keys in the ICL dictionary that will be split as lists, resulting in microbatch_size sections of the list being inserted in every batch - normal_split_keys (list(str)): keys in the ICL dictionary that will be split into chunks regularly + stacked_keys (List(str)): keys in the output batch that must be converted to tensors with torch.stack() + dont_split_keys (List(str)): keys in the ICL dictionary that should not be split among batches. 
+ list_split_keys (List(str)): keys in the ICL dictionary that will be split as lists, resulting in microbatch_size sections of the list being inserted in every batch + normal_split_keys (List(str)): keys in the ICL dictionary that will be split into chunks regularly """ def __init__( @@ -184,8 +184,8 @@ def __init__( context_key: str = 'context', answer_key: str = 'answer', strip_dataset: bool = True, - hf_loading_vars: dict = None, - hf_parsing_map: dict = None, + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None, stacked_keys: List[str] = None, dont_split_keys: List[str] = None, list_split_keys: List[str] = None, @@ -196,6 +196,7 @@ def __init__( self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot + # TODO: check this is correct for all dataset types self.padding_side = 'left' self.prelimiter = prelimiter @@ -231,7 +232,7 @@ def __getitem__(self, index: int) -> Dict: def __len__(self) -> int: return len(self.encoded_dataset) - def get_num_samples_in_batch(self, batch: dict) -> int: + def get_num_samples_in_batch(self, batch: Dict) -> int: return batch['input_ids'].shape[0] def check_defaults_are_set(self, dict_of_defaults: dict) -> None: @@ -242,20 +243,19 @@ def check_defaults_are_set(self, dict_of_defaults: dict) -> None: ) # TODO conditionally return dataset type? - # TODO make all type checking Dict and List or dict and list def _read_dataset(self, dataset_uri: str, destination_path: str, - hf_loading_vars: dict = None, - hf_parsing_map: dict = None): + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None) -> transformers.Dataset: """ Reads a dataset and handles parsing it from HuggingFace. Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. destination_path (str): A local path where the data will be stored - hf_loading_vars (dict): If parsing from HuggingFace, keyword args that will be passed into load_dataset - hf_parsing_map (dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset columns + hf_loading_vars (Dict): If parsing from HuggingFace, keyword args that will be passed into load_dataset + hf_parsing_map (Dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset columns Returns: dataset: a loaded HF dataset @@ -314,13 +314,13 @@ def _generate_few_shot_text( return few_shot_text - def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False) -> str: + def _construct_context(self, sample: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) Args: - sample (dict): the sample from which to construct the context + sample (Dict): the sample from which to construct the context preceding_text (str): any preceding text, used as a check for prepending self.example_delimiter add_answer (bool): bool for whether or not to add the answer on the end of the context (e.g. 
for fewshot examples) @@ -342,7 +342,7 @@ def _get_answer_from_sample(self, sample: Dict[str, Any]) -> str: """ Returns the answer from the sample Args: - sample (dict): the sample from which to retrieve the answer + sample (Dict): the sample from which to retrieve the answer Returns: str: the answer in the sample @@ -351,12 +351,12 @@ def _get_answer_from_sample(self, sample: Dict[str, Any]) -> str: def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: """ - If the input_ids is empty then input_ids['input_ids'] will be a 0-length list, + If the input_ids is empty then input_ids['input_ids'] will be a 0-length List, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, as the specific eval question's prompt will follow theinput_ids Args: - input_ids (list): the tokenized input + input_ids (List): the tokenized input Returns: input_ids: the tokenized input conditionally edited @@ -366,16 +366,16 @@ def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: input_ids = input_ids[:-1] return input_ids - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. Used for additional processing in inherited classes. + example (Dict): the example as a dictionary. Used for additional processing in inherited classes. Returns: - dict: dictionary with the tokenized data + Dict: dictionary with the tokenized data """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) @@ -389,7 +389,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) - def _prep_example( self, - example: dict, + example: Dict, example_idx: int, num_fewshot: int, prompt_string: str, @@ -401,14 +401,14 @@ def _prep_example( example context/continuation pairs which precede the test context/continuation pair. Args: - example (dict): A dictionary from the hf dataset + example (Dict): A Dictionary from the hf dataset example_idx (int): the index of example num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair prompt_string (str): The prompt to prepend to all inputs fewshot_rng (random.Random): Random number generator to use for fewshot sampling Returns: - dict: contains a dictionary with the tokenized data + Dict: contains a dictionary with the tokenized data """ prompt_and_fewshot = self._generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self._construct_context(example, prompt_and_fewshot, add_answer=False) @@ -420,10 +420,10 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
Args: - data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - dict: dictionary for a single batch + Dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -450,11 +450,11 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: Handling for certain specialty columns that must be split into batches in different formats. Args: - batch (dict): batch of data + batch (Dict): batch of data microbatch_size (int): size of microbatches Returns: - list: list of chunked batches + List: list of chunked batches """ # Don't split kwargs that don't change # Normally split torch tensors @@ -477,7 +477,7 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: raise ValueError(f'Unexpected key {k}') num_chunks = len(chunked['input_ids']) for k, v in batch.items(): - if isinstance(v, (int, float, str, bool, dict)): + if isinstance(v, (int, float, str, bool, Dict)): chunked[k] = [v] * num_chunks return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] @@ -514,8 +514,8 @@ def _read_dataset( self, dataset_uri: str, destination_path: str, - hf_loading_vars: dict = None, - hf_parsing_map: dict = None, + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None, ): dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.has_cot = 'chain_of_thought' in dataset.features @@ -527,11 +527,11 @@ def _read_dataset( 'chain_of_thought': examples.get('chain_of_thought', ''), }) - def _get_answer_from_sample(self, sample: dict) -> str: + def _get_answer_from_sample(self, sample: Dict) -> str: """ Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true. Args: - sample (dict): the sample from which to retrieve the answer + sample (Dict): the sample from which to retrieve the answer Returns: str: the answer in from the sample with chain of thought and delimiter if needed @@ -541,16 +541,16 @@ def _get_answer_from_sample(self, sample: dict) -> str: else: return sample[self.answer_key] - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + example (Dict): the example as a dictionary. Returns: - dict: dictionary with the tokenized data + Dict: dictionary with the tokenized data """ tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['aliases'] = list(example.get('aliases', [])) @@ -575,14 +575,14 @@ def get_max_answer_length(self) -> int: max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length - def collate_fn(self, data: dict) -> Dict[str, Any]: + def collate_fn(self, data: Dict) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
Args: - data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - dict: dictionary for a single batch + Dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -628,16 +628,16 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', *args, **kwargs) - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + example (Dict): the example as a dictionary. Returns: - dict: dictionary with the tokenized data + Dict: dictionary with the tokenized data """ tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) cont = example['continuation'] @@ -650,10 +650,10 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. Args: - data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - dict: dictionary for a single batch + Dict: dictionary for a single batch """ batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} for data_pair in data: @@ -704,11 +704,11 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): self.num_choices = len(self.dataset[0][choices_key]) self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] - def _get_answer_from_sample(self, sample: dict) -> str: + def _get_answer_from_sample(self, sample: Dict) -> str: """ Returns the correct answer from the sample's choices. Args: - sample (dict): the sample from which to retrieve the answer + sample (Dict): the sample from which to retrieve the answer Returns: str: the full string of the correct answer based on the 'gold' key @@ -717,16 +717,16 @@ def _get_answer_from_sample(self, sample: dict) -> str: gold_idx = sample['gold'] return choices[gold_idx] - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + example (Dict): the example as a dictionary. Returns: - dict: dictionary with the tokenized data + Dict: dictionary with the tokenized data """ tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) choices = example['choices'] @@ -740,10 +740,10 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
Args: - data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - dict: dictionary for a single batch + Dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -781,7 +781,6 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - # TODO: should I type all the return values like this did? def get_num_samples_in_batch(self, batch) -> int: return batch['input_ids'].shape[0] // self.num_choices @@ -794,7 +793,7 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: microbatch_size are tracked in logical samples, we split logical attributes by microbatch_size and real attributes by microbatch_size * num_choices. Args: - batch (dict): batch of data + batch (Dict): batch of data microbatch_size (int): size of microbatches Returns: @@ -810,10 +809,10 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: # Defer broadcasting primitives until we know num_chunks pass elif k == 'continuation_indices': - # List of list, so we have to directly call _split_list + # List of lists, so we have to directly call _split_list chunked[k] = _split_list(v, microbatch_size * self.num_choices) elif k == 'choice_groupings': - # List of list, so we have to directly call _split_list + # List of lists, so we have to directly call _split_list chunked[k] = _split_list(v, microbatch_size) elif k in self.real_split_keys: chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices) @@ -858,7 +857,7 @@ def _construct_context(self, sample, preceding_text: str = '', add_answer: bool prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples). Args: - sample (dict): the sample from which to construct the context + sample (Dict): the sample from which to construct the context preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) @@ -885,16 +884,16 @@ def _construct_context(self, sample, preceding_text: str = '', add_answer: bool context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] return context_options - def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: dict) -> Dict[str, Any]: + def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + example (Dict): the example as a dictionary. Returns: - dict: dictionary with the tokenized data + Dict: dictionary with the tokenized data """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) @@ -912,10 +911,10 @@ def collate_fn(self, data) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
Args: - data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - dict: dictionary for a single batch + Dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -1057,16 +1056,16 @@ def get_max_prompt_length(self) -> int: ) return max_prompt_length - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict) -> Dict[str, Any]: + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context - example (dict): the example as a dictionary. + example (Dict): the example as a dictionary. Returns: - dict: dictionary with the tokenized data + Dict: dictionary with the tokenized data """ tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['prompt_text'] = example['prompt'] @@ -1083,10 +1082,10 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. Args: - data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - dict: dictionary for a single batch + Dict: dictionary for a single batch """ batch = { 'input_ids': [], @@ -1149,8 +1148,8 @@ def build_icl_dataloader( prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' continuation_delimiter: str, # e.g. '' - hf_loading_vars: dict, - hf_parsing_map: dict, + hf_loading_vars: Dict, + hf_parsing_map: Dict, destination_path: str, prelimiter: str, # e.g. 'Question: ' cot_delimiter: str, @@ -1276,8 +1275,8 @@ def build_icl_dataloader( ) -def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: dict, - hf_parsing_map: dict) -> Dict[str, str]: +def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: Dict, + hf_parsing_map: Dict) -> Dict[str, str]: """If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file. Args: @@ -1342,8 +1341,8 @@ def get_icl_task_dataloader( example_delimiter: str, # e.g. '\n' continuation_delimiter: str = '', question_prelimiter: str = '', # e.g. 'Question: ' - hf_loading_vars: dict = None, - hf_parsing_map: dict = None, + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None, destination_path: str = '', fewshot_random_seed: int = 1234, pass_at_k: int = 1, @@ -1392,7 +1391,7 @@ def get_icl_task_dataloader( example_delimiter (str): Separator that goes between individual examples (e.g. '\n') continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') question_prelimiter: (str): Text to be prepended before each context segement in each eval example. (e.g. 'Q:', 'The following is a paragraph containing...') - hf_loading_vars (dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. + hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. 
hf_parsing_map (Dict[str:List[str]]): A dictionary containing a from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. destination_path: (str): This is the local file where remote datasets will be saved. From 77e8b1e31a44cc4cbd426653e801cfb902342758 Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Nov 2023 21:28:34 +0000 Subject: [PATCH 039/116] init RAG Generation task --- composer/datasets/in_context_learning_evaluation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index bef14f40cd..b9de7fe558 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -468,6 +468,15 @@ def split_batch(self, batch: Any, microbatch_size: int): return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] +class InContextLearningRAGGenerationTaskDataset(InContextLearningDataset): + """A dataset that construct batches for in-context learning RAG generation evaluation + Rag generation tasks evaluate a model's ability to answer questions based on passages. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + class InContextLearningQATaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning question answering evaluation QA tasks evaluate a model's ability to answer questions using a consistent format. From 75bf465e7c845afb3ebb461c734b9aaa1a0c6c5b Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Nov 2023 23:46:17 +0000 Subject: [PATCH 040/116] init _construct_context for RAG eval --- .../in_context_learning_evaluation.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index b9de7fe558..346fbd0b9e 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -471,10 +471,43 @@ def split_batch(self, batch: Any, microbatch_size: int): class InContextLearningRAGGenerationTaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning RAG generation evaluation Rag generation tasks evaluate a model's ability to answer questions based on passages. + + Args: + passage_delimiter (str): Delimiter to place between each passage. + passage_query_delimiter (str): Delimiter to place between the last passage and the query. """ - def __init__(self, *args, **kwargs): + def __init__( + self, + passage_delimiter: str = '\nPassage: ', + passage_query_delimiter: str = '\nQuery: ', + *args, + **kwargs + ): super().__init__(*args, **kwargs) + self.passage_delimiter = passage_delimiter + self.passage_query_delimiter = passage_query_delimiter + + def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False): + """ + Takes a sample and constructs a context. 
Optionally, appends this to preceeding text (such as a + prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) + + Args: + sample (dict): the sample from which to construct the context + preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning + add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) + + Returns: + + str: The constructed context. The default output context is + formatted as follows: f'{self.prelimiter}{sample['self.passages_key']}{sample[self.context_key]}{self.continuation_delimiter}' + """ + passages = passage_delimiter.lstrip('\n ') + passages += f'{passage_delimiter}'.join(sample['passages']) + query = sample['query'] + context = f'{self.prelimiter}{pssgs}{self.passage_query_delimiter}{query}' + return context class InContextLearningQATaskDataset(InContextLearningDataset): From 288e8a861b251f1ad94cf224188a5e6dfbcac765 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 27 Nov 2023 17:42:04 +0000 Subject: [PATCH 041/116] fix context key, move hf test dataset, few docstrings --- .../in_context_learning_evaluation.py | 20 +++++++++---------- .../test_in_context_learning_datasets.py | 11 +++++----- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index c8ab08f9ca..fb189c1613 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -180,7 +180,6 @@ def __init__( continuation_delimiter: str, destination_path: str, prelimiter: str = '', - # TODO: should this be used to both set and access the data / tokenized examples? context_key: str = 'context', answer_key: str = 'answer', strip_dataset: bool = True, @@ -242,7 +241,6 @@ def check_defaults_are_set(self, dict_of_defaults: dict) -> None: f"{type(self).__name__} missing required variable(s): {''.join([k for k, v in dict_of_defaults.items() if not v])}" ) - # TODO conditionally return dataset type? def _read_dataset(self, dataset_uri: str, destination_path: str, @@ -371,7 +369,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctxt (str): the specific example's derrived context example (Dict): the example as a dictionary. Used for additional processing in inherited classes. Returns: @@ -384,7 +382,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - if self.strip_data: # rstrip context because a prompt ending in a space results in degenerate output ctxt = ctxt.rstrip() - tokenized_example[self.context_key] = self.tokenizer(ctxt, add_special_tokens=False) + tokenized_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) return tokenized_example def _prep_example( @@ -415,7 +413,6 @@ def _prep_example( tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example - # TODO: confirm this typing? def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
@@ -432,7 +429,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: 'labels': [], } for data_pair in data: - context_enc = data_pair['preamble']['input_ids'] + data_pair[self.context_key]['input_ids'] + context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] inp, continuation_span = _make_padded_input(context_enc, data_pair['continuation']['input_ids'], self.max_seq_len, self.pad_tok_id) @@ -558,7 +555,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - def get_max_answer_length(self) -> int: f""" - Loops over the dataset and finds the longes answer length. + Loops over the dataset and finds the longest answer length. Returns: int: the maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present @@ -597,7 +594,7 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: } for sample in data: aliases = sample['aliases'] - context_enc = sample['preamble']['input_ids'] + sample[self.context_key]['input_ids'] + context_enc = sample['preamble']['input_ids'] + sample['context']['input_ids'] inp, _ = _make_padded_input( context_enc, [], @@ -757,7 +754,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: choice_start_idx = len(batch['continuation_indices']) for choice in data_pair['choices']: - context_enc = data_pair['preamble']['input_ids'] + data_pair[self.context_key]['input_ids'] + context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] continuation_enc = choice['input_ids'] inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, self.pad_tok_id) @@ -1052,7 +1049,7 @@ def get_max_prompt_length(self) -> int: for sample in self.encoded_dataset: max_prompt_length = max( max_prompt_length, - len(sample['preamble']['input_ids'] + sample['prompt']['input_ids']), + len(sample['preamble']['input_ids'] + sample['context']['input_ids']), ) return max_prompt_length @@ -1113,7 +1110,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: }, } for sample in data: - context_enc = sample['preamble']['input_ids'] + sample['prompt']['input_ids'] + context_enc = sample['preamble']['input_ids'] + sample['context']['input_ids'] inp, _ = _make_padded_input( context_enc, [], @@ -1329,6 +1326,7 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo return output_files + def get_icl_task_dataloader( icl_task_type: str, dataset_uri: str, diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 7696a404fc..b7f5db0b92 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1545,7 +1545,7 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): assert second_batch_without_last_word.count(' UNIQUE ') == 1 -@pytest.mark.parametrize('dataset_uri', ['maxisawesome/test_dataset']) +@pytest.mark.parametrize('dataset_uri', ['mosaicml/test_dataset']) @pytest.mark.parametrize('num_fewshot', [0, 1]) @pytest.mark.parametrize('prompt_string', ['Complete the voiceline: ', '']) @pytest.mark.parametrize('hf_loading_vars', [{ @@ -1589,12 +1589,13 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path max_idx = max(batch['continuation_indices'][0]).item() assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.' 
- decoded_batch = tokenizer.decode(batch['input_ids'][batch['input_ids'] != tokenizer.eos_token_id]) - # Pytorch kills our dim_size = 2 here and concatenates the two strings. - assert decoded_batch == "Looks like it's just you and me.There's a fine line between bravery and stupidity." + + decoded_batch = [tokenizer.decode(row[row != tokenizer.eos_token_id]) for row in batch['input_ids']] + assert decoded_batch[0] == "Looks like it's just you and me." + assert decoded_batch[1] == "There's a fine line between bravery and stupidity." -@pytest.mark.parametrize('dataset_uri', ['maxisawesome/test_dataset']) +@pytest.mark.parametrize('dataset_uri', ['mosaicml/test_dataset']) @pytest.mark.parametrize('num_fewshot', [0, 1]) @pytest.mark.parametrize('prompt_string', ['What spell does this invoke? ', '']) @pytest.mark.parametrize('hf_loading_vars', [{ From e8362a8ac9595262853b36a1d5e107eef1ad5e5b Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 27 Nov 2023 21:55:05 +0000 Subject: [PATCH 042/116] fix docstrings, add second path for schema --- .../in_context_learning_evaluation.py | 194 +++++++++++------- 1 file changed, 123 insertions(+), 71 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index fb189c1613..eb2c3586e2 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -7,7 +7,7 @@ import json import os import random -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from torch.utils.data import DataLoader, Dataset @@ -63,7 +63,11 @@ def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBas return len(tokenizer(' a', add_special_tokens=False)['input_ids']) == 1 -def _make_padded_input(context_enc: List, continuation_enc: List, max_seq_len: int, pad_tok_id: int, padding_side: str = 'right') -> Tuple[torch.tensor, torch.tensor]: +def _make_padded_input(context_enc: List, + continuation_enc: List, + max_seq_len: int, + pad_tok_id: int, + padding_side: str = 'right') -> Tuple[torch.tensor, torch.tensor]: # TODO: docstring if len(continuation_enc) + len(context_enc) > max_seq_len: # clip from the end @@ -135,7 +139,8 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in class InContextLearningDataset(Dataset): - """A base dataset that constructs batches for in-context learning task evaluations + """ + A base dataset that constructs batches for in-context learning task evaluations The input format is expected to be a jsonl file with different fields based on the task or a link to a Hugging Face dataset. 
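# Illustrative sketch (not part of the patch): how a caller might point these datasets at a
# Hugging Face hub source instead of a local jsonl file, as exercised by the tests above. The
# dataset uri and column names below are hypothetical; `hf_loading_vars` is forwarded to
# datasets.load_dataset and `hf_parsing_map` maps HF columns onto ICL keys such as 'context'/'answer'.
hf_loading_vars = {'name': 'some_subset', 'split': 'test'}        # kwargs passed to load_dataset
hf_parsing_map = {'context': ['question'], 'answer': ['answer']}  # {icl_key: [hf_col, ...]}

# dl = get_icl_task_dataloader(
#     'question_answering',
#     dataset_uri='hf://some-org/some-dataset',   # hypothetical HF uri
#     tokenizer=tokenizer,
#     batch_size=8,
#     max_seq_len=1024,
#     pad_tok_id=tokenizer.eos_token_id,
#     num_fewshot=3,
#     prompt_string='',
#     example_delimiter='\n',
#     continuation_delimiter=' ',
#     hf_loading_vars=hf_loading_vars,
#     hf_parsing_map=hf_parsing_map,
#     destination_path='local_copy.jsonl',
# )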
@@ -167,35 +172,34 @@ class InContextLearningDataset(Dataset): normal_split_keys (List(str)): keys in the ICL dictionary that will be split into chunks regularly """ - def __init__( - self, - dataset_uri: str, - tokenizer: transformers.PreTrainedTokenizerBase, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - fewshot_random_seed: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - prelimiter: str = '', - context_key: str = 'context', - answer_key: str = 'answer', - strip_dataset: bool = True, - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None, - stacked_keys: List[str] = None, - dont_split_keys: List[str] = None, - list_split_keys: List[str] = None, - normal_split_keys: List[str] = None): + def __init__(self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + fewshot_random_seed: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + prelimiter: str = '', + context_key: str = 'context', + answer_key: str = 'answer', + strip_dataset: bool = True, + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None, + stacked_keys: List[str] = None, + dont_split_keys: List[str] = None, + list_split_keys: List[str] = None, + normal_split_keys: List[str] = None): self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot - # TODO: check this is correct for all dataset types + # TODO: check this is correct for all dataset types self.padding_side = 'left' self.prelimiter = prelimiter @@ -287,7 +291,8 @@ def _generate_few_shot_text( preamble: str, fewshot_rng: random.Random, ) -> str: - """Formats the prompt fewshot examples for test sample `sample_idx`. + """ + Formats the prompt fewshot examples for test sample `sample_idx`. Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and constructs a context with its answer appended. @@ -314,8 +319,8 @@ def _generate_few_shot_text( def _construct_context(self, sample: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a - prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) + Takes a sample and constructs a context. Optionally adds the correct answer (for fewshot examples) + and handle exmple delemiters Args: sample (Dict): the sample from which to construct the context @@ -323,7 +328,6 @@ def _construct_context(self, sample: Dict, preceding_text: str = '', add_answer: add_answer (bool): bool for whether or not to add the answer on the end of the context (e.g. for fewshot examples) Returns: - str: The constructed context. The default output context is formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}' """ @@ -393,7 +397,8 @@ def _prep_example( prompt_string: str, fewshot_rng: random.Random, ) -> List[Dict[str, Any]]: - """Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples. + """ + Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples. 
Each task consists of a context and a continuation as well as an optional prompt and optional list of example context/continuation pairs which precede the test context/continuation pair. @@ -481,7 +486,8 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: class InContextLearningQATaskDataset(InContextLearningDataset): - """A dataset that construct batches for in-context learning question answering evaluation. + """ + A dataset that construct batches for in-context learning question answering evaluation. QA tasks evaluate a model's ability to answer questions using a consistent format. The input format is expected to be a jsonl file with the following fields: @@ -612,7 +618,8 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: class InContextLearningLMTaskDataset(InContextLearningDataset): - """A dataset that construct batches for in-context learning language modeling evaluation. + """ + A dataset that construct batches for in-context learning language modeling evaluation. Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. The input format is expected to be a jsonl file with the following fields: @@ -669,7 +676,8 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): - """A dataset that construct batches for in-context learning multiple choice evaluation. + """ + A dataset that construct batches for in-context learning multiple choice evaluation. If each question has N answer choices, we construct N distinct inputs per question. In order to ensure consistency across multi-GPU, we set the batch size to be `min(N, batch_size)` so that all N @@ -782,7 +790,8 @@ def get_num_samples_in_batch(self, batch) -> int: return batch['input_ids'].shape[0] // self.num_choices def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: - """Split batch while ensuring all continuations are in the same microbatch. + """ + Split batch while ensuring all continuations are in the same microbatch. In ICL Multiple Choice, we duplicate each data point for each possible continuation. When splitting a batch, we have logical samples, which refer to one possible question, @@ -826,7 +835,8 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset): - """A dataset that constructs batches for in-context learning schema evaluation. + """ + A dataset that constructs batches for in-context learning schema evaluation. A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847 to determine the model's choice of fill-in word. @@ -850,36 +860,75 @@ def __init__(self, choices_key='context_options', *args, **kwargs): def _construct_context(self, sample, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a - prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples). + Takes a sample and constructs a context with the correct context for the sample's continuation. 
Args: sample (Dict): the sample from which to construct the context preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning - add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) + add_answer (bool): this will always be true when calling this function for SchemaTaskDataset + + Returns: + str: the single correct context for a given continuation - TODO: finish documentation after discussions """ context_options = sample['context_options'] gold_idx = sample['gold'] continuation = sample['continuation'] - if add_answer: - context = context_options[gold_idx] - if len(preceding_text) > 0: - context = f'{self.example_delimiter}{context}' - context = f'{context}{self.continuation_delimiter}{continuation}' - return context - else: - # TODO: This is a kinda code-smelly bcus we return two different types - # depending on the situation (a string if we hav add_answer=True or a - # list of strings if add_answer=False) - if len(preceding_text) > 0: - if self.strip_data: - cont_del = self.continuation_delimiter.rstrip() - else: - cont_del = self.continuation_delimiter - context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] - return context_options + context = context_options[gold_idx] + if len(preceding_text) > 0: + context = f'{self.example_delimiter}{context}' + context = f'{context}{self.continuation_delimiter}{continuation}' + return context + + def _construct_multiple_contexts(self, sample: Dict, preceding_text: str = '') -> str: + """ + Takes a sample and constructs all contexts. Optionally, appends this to preceeding text (such as a + prompt or fewshot examples). + + Args: + sample (Dict): the sample from which to construct the context + preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning + + Returns: + list: all context options for the selected sample with formatting + """ + context_options = sample['context_options'] + if len(preceding_text) > 0: + if self.strip_data: + cont_del = self.continuation_delimiter.rstrip() + else: + cont_del = self.continuation_delimiter + context_options = [f'{self.example_delimiter}{c}{cont_del}' for c in context_options] + return context_options + + def _prep_example( + self, + example: Dict, + example_idx: int, + num_fewshot: int, + prompt_string: str, + fewshot_rng: random.Random, + ) -> List[Dict[str, Any]]: + """ + Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples. + + Each task consists of multiple contexts and a single, correct continuation. Will preprend fewshot examples and + prompt if present. 
+ + Args: + example (Dict): A dictionary from the hf dataset + example_idx (int): the index of example + num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair + prompt_string (str): The prompt to prepend to all inputs + fewshot_rng (random.Random): Random number generator to use for fewshot sampling + + Returns: + Dict: contains a dictionary with the tokenized data + """ + prompt_and_fewshot = self._generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) + ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot) + tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) + return tokenized_example def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]: """ @@ -952,7 +1001,8 @@ def collate_fn(self, data) -> Dict[str, Any]: class InContextLearningCodeEvalDataset(InContextLearningDataset): - """A dataset that constructs batches for in-context learning code evaluation. + """ + A dataset that constructs batches for in-context learning code evaluation. The default input format is expected to be a jsonl file with the following fields: - task_id: label of given task @@ -965,12 +1015,12 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - language: the language of the code snippet Each batch then consists of the following the structure - - input_ids: Input tensor batch x seqlen x # tokens + - input_ids: Input tensor batch x seqlen x num tokens - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - mode: always set to 'generate' - - labels: - - prompts: - - cannonical_solutions + - labels: exact solution for the coding problem + - prompts: prompt for the task + - cannonical_solutions: exact solutions - entry_points: list of entry points - test_inputs: list of test inputs - test_outputs: list of test outputs @@ -980,12 +1030,12 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following: - pad_token_id: ID for padding token, derived automatically - num_beams: how many beams to search for generations, always set to 1 - - num_return_sequences: value passed for 'generations_per_sample', how many generations per prompt - - do_sample: set to True, whether or not the model is sampling (#TODO: explain this better) - - top_p: passed top_p - - top_k: passed top_k - - temperature: passed temperature - - use_cache: True (#TODO explain this) + - num_return_sequences: value passed for 'generations_per_sample', how many generations per prompt + - do_sample: determines whether model is sampling or greedily decoding. Always set to True + - top_p: the cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1 + - top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. + - temperature: randomness used during prediction. 1.0 is deterministic. defaults to 1.0 + - use_cache: Whether or not to use past key values to speed up sampling. Always set to True Additional Args: # TODO: are these correct? @@ -1123,6 +1173,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch['canonical_solutions'].append(sample['canonical_solution']) batch['prompts'].append(sample['prompt_text']) batch['tests'].append(sample['test']) + # TODO: why use this twice? 
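            # Note: at this point the canonical solution is stored twice on purpose: once under
            # 'canonical_solutions' (used by the test harness to execute the reference solution)
            # and once under 'labels' (the generic key downstream metrics read). A later commit
            # in this series removes the duplicated 'canonical_solutions' key.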
batch['labels'].append(sample['canonical_solution']) batch['entry_points'].append(sample['entry_point']) batch['test_inputs'].append(sample['test_inputs']) @@ -1274,7 +1325,8 @@ def build_icl_dataloader( def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_loading_vars: Dict, hf_parsing_map: Dict) -> Dict[str, str]: - """If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file. + """ + If has_categories is enabled, we partition the dataset into a separate dataset for each category value in the data and write each partition to a local file. Args: dataset_uri (str): Location of dataset. @@ -1326,7 +1378,6 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo return output_files - def get_icl_task_dataloader( icl_task_type: str, dataset_uri: str, @@ -1349,7 +1400,8 @@ def get_icl_task_dataloader( cot_delimiter: str = '', has_categories: bool = False, ) -> Union[DataSpec, Dict[str, DataSpec]]: - """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: + """ + This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: >>> dl = get_icl_task_dataloader( ... 'language_modeling', From 9a1a071f2200c6679e2ef4c0e85afaed91961730 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 27 Nov 2023 22:07:53 +0000 Subject: [PATCH 043/116] init collate_fn, _tokenize_example functions (bug exists) --- .../in_context_learning_evaluation.py | 89 ++++++++++++++++--- 1 file changed, 78 insertions(+), 11 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 346fbd0b9e..3c5fb1fdf0 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -357,7 +357,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctx (str): the specific example's derived context example (dict): the example as a dictionary. Used for additional processing in inherited classes. Returns: @@ -484,9 +484,11 @@ def __init__( *args, **kwargs ): - super().__init__(*args, **kwargs) + kwargs.pop('passage_delimiter', None) + kwargs.pop('passage_query_delimiter', None) self.passage_delimiter = passage_delimiter self.passage_query_delimiter = passage_query_delimiter + super().__init__(*args, **kwargs) def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False): """ @@ -499,16 +501,62 @@ def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) Returns: - str: The constructed context. 
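(A standalone illustration of this construction, using made-up passages and the default delimiters shown above:)

passage_delimiter = '\nPassage: '
passage_query_delimiter = '\nQuery: '
prelimiter = ''
example = {
    'passages': ['The Nile is about 6,650 km long.', 'The Amazon carries the largest discharge.'],
    'query': 'Which river is usually called the longest?',
}
passages = passage_delimiter.lstrip('\n ')                     # first passage label, without the leading newline
passages += f'{passage_delimiter}'.join(example['passages'])
context = f'{prelimiter}{passages}{passage_query_delimiter}{example["query"]}'
print(context)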
The default output context is formatted as follows: f'{self.prelimiter}{sample['self.passages_key']}{sample[self.context_key]}{self.continuation_delimiter}' """ - passages = passage_delimiter.lstrip('\n ') - passages += f'{passage_delimiter}'.join(sample['passages']) + passages = self.passage_delimiter.lstrip('\n ') + passages += f'{self.passage_delimiter}'.join(sample['passages']) query = sample['query'] - context = f'{self.prelimiter}{pssgs}{self.passage_query_delimiter}{query}' + context = f'{self.prelimiter}{passages}{self.passage_query_delimiter}{query}' return context + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): + """ + Runs text through the tokenizer and handles special cases. + Args: + prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): the specific example's derived context + example (dict): the example as a dictionary. + + Returns: + dict: dictionary with the tokenized data + """ + tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) + answer = example['answers'][0] + tokenized_example['answer'] = self.tokenizer(answer, add_special_tokens=False) + return tokenized_example + + + def collate_fn(self, data): + """ + The function that the dataloader uses to accumulate data into batches + Args: + data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) + + Returns: + dict: dictionary for a single batch + """ + batch = { + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [], + 'answer_indices': [] + } + for data_pair in data: + context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] + answer_enc = data_pair['answer']['input_ids'] + + inp, answer_span = _make_padded_input(context_enc, answer_enc, self.max_seq_len, + self.pad_tok_id) + batch['input_ids'].append(inp) + batch['answer_indices'].append(answer_span) + batch['labels'].append(inp) + + batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + return batch + class InContextLearningQATaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning question answering evaluation @@ -585,7 +633,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctx (str): the specific example's derived context example (dict): the example as a dictionary. Returns: @@ -682,7 +730,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctx (str): the specific example's derived context example (dict): the example as a dictionary. Returns: @@ -779,7 +827,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. 
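The left-padding convention that the collate functions above rely on can be sketched in a few lines, assuming torch is installed; the token ids and lengths here are made up.

import torch

def left_pad(context_enc, max_seq_len, pad_tok_id):
    """Keep the last max_seq_len tokens and left-pad, mirroring padding_side='left' for generation tasks."""
    context_enc = context_enc[-max_seq_len:]
    num_pad = max_seq_len - len(context_enc)
    return torch.tensor([pad_tok_id] * num_pad + context_enc, dtype=torch.long)

inp = left_pad([11, 12, 13], max_seq_len=8, pad_tok_id=0)
attention_mask = ~(inp == 0)      # same trick the collate functions use to build the attention mask
print(inp, attention_mask)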
Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctx (str): the specific example's derived context example (dict): the example as a dictionary. Returns: @@ -958,7 +1006,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctx (str): the specific example's derived context example (dict): the example as a dictionary. Returns: @@ -1120,7 +1168,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): Runs text through the tokenizer and handles special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context + ctx (str): the specific example's derived context example (dict): the example as a dictionary. Returns: @@ -1304,6 +1352,24 @@ def build_icl_dataloader( generations_per_sample=generations_per_sample, temperature=temperature) effective_batchsize = batch_size + elif icl_task_type == 'rag': + dataset = InContextLearningRAGGenerationTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + passage_delimiter='\nPassage: ', + passage_query_delimiter='\nQuery: ', + destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + ) + effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1316,6 +1382,7 @@ def build_icl_dataloader( InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset, InContextLearningCodeEvalDataset, + InContextLearningRAGGenerationTaskDataset, ), ): split_batch = dataset.split_batch From c8d45383e6ceefd7f25bfbc25c5376fc190eb6b8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 27 Nov 2023 22:10:39 +0000 Subject: [PATCH 044/116] fix typo in warning error --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 3c5fb1fdf0..14a170de3c 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -226,7 +226,7 @@ def check_defaults_are_set(self, dict_of_defaults: dict): if all(v for v in dict_of_defaults.values()): return raise ValueError( - f"{type(self).__name__} missing required variable(s): {''.join([k for k, v in dict_of_defaults.items() if not v])}" + f"{type(self).__name__} missing required variable(s): {', '.join([k for k, v in dict_of_defaults.items() if not v])}" ) def _read_dataset(self, From ea8c0ebd062ac437d498e5ce5fbaf164cc3f89cc Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 16:09:44 +0000 Subject: [PATCH 045/116] remove canonical_solution from batch --- composer/datasets/in_context_learning_evaluation.py | 6 +----- tests/datasets/test_in_context_learning_datasets.py | 8 +++----- 2 files changed, 4 
insertions(+), 10 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index eb2c3586e2..1762b5ab9c 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1020,7 +1020,6 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - mode: always set to 'generate' - labels: exact solution for the coding problem - prompts: prompt for the task - - cannonical_solutions: exact solutions - entry_points: list of entry points - test_inputs: list of test inputs - test_outputs: list of test outputs @@ -1076,7 +1075,7 @@ def __init__( dont_split_keys=['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'], normal_split_keys=['input_ids', 'attention_mask'], list_split_keys=[ - 'labels', 'tests', 'canonical_solutions', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', + 'labels', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', 'languages' ], *args, @@ -1140,7 +1139,6 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: 'labels': [], 'prompts': [], # list of prompts 'tests': [], # list of tests - 'canonical_solutions': [], # list of solutions 'entry_points': [], # list of entry points 'test_inputs': [], # list of test inputs 'test_outputs': [], # list of test outputs @@ -1170,10 +1168,8 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: ) batch['input_ids'].append(inp) - batch['canonical_solutions'].append(sample['canonical_solution']) batch['prompts'].append(sample['prompt_text']) batch['tests'].append(sample['test']) - # TODO: why use this twice? batch['labels'].append(sample['canonical_solution']) batch['entry_points'].append(sample['entry_point']) batch['test_inputs'].append(sample['test_inputs']) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index b7f5db0b92..85c1782878 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -725,7 +725,7 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): 'labels': str, 'prompts': str, 'tests': str, - 'canonical_solutions': str, + # 'canonical_solutions': str, 'entry_points': str, 'test_inputs': list, 'test_outputs': list, @@ -873,9 +873,7 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left mod = types.ModuleType('test_module') - for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['canonical_solutions'], - batch['test_inputs'], batch['test_outputs'], - batch['entry_points']): + for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'], batch['test_outputs'], batch['entry_points']): exec(prompt + solution, mod.__dict__) for test_input, test_output in zip(inputs, outputs): result = mod.__dict__[entry_point](*eval(test_input)) @@ -1649,4 +1647,4 @@ def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_pat assert all( set(found) == set(expected) for found, expected in zip(batch['labels'], [['defeaning blast'], ['cold snap']])) assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:') - assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') \ No newline at end of file + assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:') From 
17198ff68bfa9da607b206f8cd3bb653c8aabff8 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 16:12:08 +0000 Subject: [PATCH 046/116] missed one canonical_sllution --- tests/datasets/test_in_context_learning_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 85c1782878..ae4b477445 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -725,7 +725,6 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): 'labels': str, 'prompts': str, 'tests': str, - # 'canonical_solutions': str, 'entry_points': str, 'test_inputs': list, 'test_outputs': list, From c07a4d9d266e50394b109aad54659ef5a184aeb9 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 16:15:30 +0000 Subject: [PATCH 047/116] remove encoded dataset to have just one dataset var --- composer/datasets/in_context_learning_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 1762b5ab9c..2f43837f78 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -219,7 +219,7 @@ def __init__(self, self.dataset = self.dataset.map(strip_data) fewshot_rng = random.Random(fewshot_random_seed) - self.encoded_dataset = self.dataset.map( + self.dataset = self.dataset.map( self._prep_example, with_indices=True, fn_kwargs={ @@ -230,10 +230,10 @@ def __init__(self, ) def __getitem__(self, index: int) -> Dict: - return self.encoded_dataset[index] + return self.dataset[index] def __len__(self) -> int: - return len(self.encoded_dataset) + return len(self.dataset) def get_num_samples_in_batch(self, batch: Dict) -> int: return batch['input_ids'].shape[0] @@ -1095,7 +1095,7 @@ def get_max_prompt_length(self) -> int: int: maximum prompt length """ max_prompt_length = 0 - for sample in self.encoded_dataset: + for sample in self.dataset: max_prompt_length = max( max_prompt_length, len(sample['preamble']['input_ids'] + sample['context']['input_ids']), From 8c5df99a2eab5da15221f55e72a89681f96cc82c Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 16:33:09 +0000 Subject: [PATCH 048/116] rename sample to example --- .../in_context_learning_evaluation.py | 138 +++++++++--------- .../test_in_context_learning_datasets.py | 8 +- 2 files changed, 73 insertions(+), 73 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 2f43837f78..6785bc504a 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -51,8 +51,8 @@ def _check_if_huggingface_uri(uri: str) -> bool: return False -def strip_data(sample: Dict) -> Dict: - return {k: v.strip() if isinstance(v, str) else v for k, v in sample.items()} +def strip_data(example: Dict) -> Dict: + return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()} def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBase) -> bool: @@ -110,14 +110,14 @@ def _make_padded_input(context_enc: List, return inp, continuation_span -def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: int, rng: random.Random) -> List[int]: +def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: 
int, rng: random.Random) -> List[int]: """ - Samples without replacement. If num_fewshot exceeds the number of unique samples, + Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset, then we will have fewer than num_fewshot examples in context. Args: dataset_size (int): length of the dataset num_fewshot (int): number of examples to prepend - sample_idx (int): current sample index (excluded from fewshot choices) + example_idx (int): current example's index (excluded from fewshot choices) rng (random.Random): rng for repeatable sample selection Returns: @@ -126,13 +126,13 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, sample_idx: in num_fewshot = min(dataset_size - 1, num_fewshot) fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot)) - if sample_idx in fewshot_idxs: - fewshot_idxs.remove(sample_idx) + if example_idx in fewshot_idxs: + fewshot_idxs.remove(example_idx) if len(fewshot_idxs) >= dataset_size - 1: return fewshot_idxs replacement_sample = rng.choice(range(0, dataset_size)) - while replacement_sample in fewshot_idxs or replacement_sample == sample_idx: + while replacement_sample in fewshot_idxs or replacement_sample == example_idx: replacement_sample = rng.choice(range(0, dataset_size)) fewshot_idxs.add(replacement_sample) return fewshot_idxs @@ -287,20 +287,20 @@ def _read_dataset(self, def _generate_few_shot_text( self, num_fewshot: int, - sample_idx: int, + example_idx: int, preamble: str, fewshot_rng: random.Random, ) -> str: """ - Formats the prompt fewshot examples for test sample `sample_idx`. + Formats the fewshot prompt for test example `example_idx`. - Randomly select `num_fewshot` samples from the dataset (not including the sample at `sample_idx`) and constructs - a context with its answer appended. + Randomly select `num_fewshot` samples from the dataset (excluding the example at `example_idx`) and constructs + contextes with answers appended. - Returns the formatted prompt_string + concatenated list of formatted few shot examples. + Returns the formatted prompt_string + concatenated list of formatted few shot examples as a string. Args: num_fewshot (int): number of examples to prepend - sample_idx (int): current sample idx + example_idx (int): current example idx preamble (str): text to occur at the beginning of the task. Generally instructions or a prompt. fewshot_rng (random.Random): seeded sampler to chose samples with @@ -310,46 +310,46 @@ def _generate_few_shot_text( few_shot_text = preamble if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.dataset), num_fewshot, sample_idx, fewshot_rng) + fewshot_idxs = _get_fewshot_sample_idxs(len(self.dataset), num_fewshot, example_idx, fewshot_rng) for fewshot_idx in fewshot_idxs: ctxt = self._construct_context(self.dataset[fewshot_idx], few_shot_text, add_answer=True) few_shot_text += ctxt return few_shot_text - def _construct_context(self, sample: Dict, preceding_text: str = '', add_answer: bool = False) -> str: + def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes a sample and constructs a context. Optionally adds the correct answer (for fewshot examples) - and handle exmple delemiters + Takes an example and constructs a context. 
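A self-contained sketch of the index-sampling helper renamed above; the behaviour mirrors the function body, while the sizes and seed are illustrative.

import random

def sample_fewshot_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> set:
    """Sample distinct fewshot indices, never letting the evaluated example appear in its own prompt."""
    num_fewshot = min(dataset_size - 1, num_fewshot)
    fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot))
    if example_idx in fewshot_idxs:
        fewshot_idxs.remove(example_idx)
        if len(fewshot_idxs) < dataset_size - 1:
            # draw a replacement so we still return num_fewshot indices when possible
            replacement = rng.choice(range(0, dataset_size))
            while replacement in fewshot_idxs or replacement == example_idx:
                replacement = rng.choice(range(0, dataset_size))
            fewshot_idxs.add(replacement)
    return fewshot_idxs

rng = random.Random(1234)
assert 4 not in sample_fewshot_idxs(dataset_size=10, num_fewshot=7, example_idx=4, rng=rng)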
Optionally adds the correct answer (for fewshot examples) + and handles example delemiters Args: - sample (Dict): the sample from which to construct the context + example (Dict): the example from which to construct the context preceding_text (str): any preceding text, used as a check for prepending self.example_delimiter add_answer (bool): bool for whether or not to add the answer on the end of the context (e.g. for fewshot examples) Returns: str: The constructed context. The default output context is - formatted as follows: f'{self.prelimiter}{sample[self.context_key]}{self.continuation_delimiter}' + formatted as follows: f'{self.prelimiter}{example[self.context_key]}{self.continuation_delimiter}' """ - ctxt = sample[self.context_key] + ctxt = example[self.context_key] ctxt = f'{self.prelimiter}{ctxt}' if len(preceding_text) > 0: ctxt = f'{self.example_delimiter}{ctxt}' ctxt = f'{ctxt}{self.continuation_delimiter}' if add_answer: - ctxt = f'{ctxt}{self._get_answer_from_sample(sample)}' + ctxt = f'{ctxt}{self._get_answer_from_example(example)}' return ctxt - def _get_answer_from_sample(self, sample: Dict[str, Any]) -> str: + def _get_answer_from_example(self, example: Dict[str, Any]) -> str: """ - Returns the answer from the sample + Returns the answer from the example Args: - sample (Dict): the sample from which to retrieve the answer + example (Dict): the example from which to retrieve the answer Returns: - str: the answer in the sample + str: the answer in the example """ - return sample[self.answer_key] + return example[self.answer_key] def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: """ @@ -530,19 +530,19 @@ def _read_dataset( 'chain_of_thought': examples.get('chain_of_thought', ''), }) - def _get_answer_from_sample(self, sample: Dict) -> str: + def _get_answer_from_example(self, example: Dict) -> str: """ - Returns the answer from the sample. Applies chain of thought if self.has_cot is marked as true. + Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true. 
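Sketched standalone, with an illustrative chain-of-thought delimiter (the real value comes from cot_delimiter in the task config):

def answer_with_optional_cot(example, has_cot, cot_delimiter=' ### ', answer_key='answer'):
    """Prepend the chain of thought when present, so fewshot answers show the reasoning before the final answer."""
    if has_cot:
        return f"{example['chain_of_thought']}{cot_delimiter}{example[answer_key]}"
    return example[answer_key]

example = {'chain_of_thought': '2 dozen eggs is 24; 3 are broken, so 21 remain', 'answer': '21'}
print(answer_with_optional_cot(example, has_cot=True))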
Args: - sample (Dict): the sample from which to retrieve the answer + example (Dict): the example from which to retrieve the answer Returns: - str: the answer in from the sample with chain of thought and delimiter if needed + str: the answer in from the example with chain of thought and delimiter if needed """ if self.has_cot: - return f'{sample["chain_of_thought"]}{self.cot_delimiter}{sample[self.answer_key]}' + return f'{example["chain_of_thought"]}{self.cot_delimiter}{example[self.answer_key]}' else: - return sample[self.answer_key] + return example[self.answer_key] def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ @@ -567,11 +567,11 @@ def get_max_answer_length(self) -> int: int: the maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present """ max_answer_length = 0 - for sample in self.dataset: - all_answers = [sample[self.answer_key]] + list(sample.get('aliases', [])) + for example in self.dataset: + all_answers = [example[self.answer_key]] + list(example.get('aliases', [])) for answer in all_answers: if self.has_cot: - response = (f'{sample["chain_of_thought"]}{self.cot_delimiter}{answer}') + response = (f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}') else: response = answer max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) @@ -598,9 +598,9 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: 'use_cache': True }, } - for sample in data: - aliases = sample['aliases'] - context_enc = sample['preamble']['input_ids'] + sample['context']['input_ids'] + for example in data: + aliases = example['aliases'] + context_enc = example['preamble']['input_ids'] + example['context']['input_ids'] inp, _ = _make_padded_input( context_enc, [], @@ -709,17 +709,17 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): self.num_choices = len(self.dataset[0][choices_key]) self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] - def _get_answer_from_sample(self, sample: Dict) -> str: + def _get_answer_from_example(self, example: Dict) -> str: """ - Returns the correct answer from the sample's choices. + Returns the correct answer from the example's choices. Args: - sample (Dict): the sample from which to retrieve the answer + example (Dict): the example from which to retrieve the answer Returns: str: the full string of the correct answer based on the 'gold' key """ - choices = sample['choices'] - gold_idx = sample['gold'] + choices = example['choices'] + gold_idx = example['gold'] return choices[gold_idx] def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: @@ -794,9 +794,9 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: Split batch while ensuring all continuations are in the same microbatch. In ICL Multiple Choice, we duplicate each data point for each possible continuation. - When splitting a batch, we have logical samples, which refer to one possible question, - and real samples, which refers to one possible continuation. As sample count and - microbatch_size are tracked in logical samples, we split logical attributes by + When splitting a batch, we have logical example, which refer to one possible question, + and real example, which refers to one possible continuation. As example count and + microbatch_size are tracked in logical example, we split logical attributes by microbatch_size and real attributes by microbatch_size * num_choices. 
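A toy sketch of that logical-versus-real split, using plain lists instead of tensors; the counts are illustrative.

def split_mc_rows(rows, microbatch_size, num_choices):
    """Chunk per-choice rows so every microbatch keeps all choices of a question together."""
    real_chunk = microbatch_size * num_choices
    return [rows[i:i + real_chunk] for i in range(0, len(rows), real_chunk)]

rows = [f'q{q}_choice{c}' for q in range(3) for c in range(4)]    # 3 logical questions x 4 choices
for chunk in split_mc_rows(rows, microbatch_size=2, num_choices=4):
    print(len(chunk), chunk[0], '...', chunk[-1])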
Args: batch (Dict): batch of data @@ -858,12 +858,12 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, *args, **kwargs) - def _construct_context(self, sample, preceding_text: str = '', add_answer: bool = False) -> str: + def _construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes a sample and constructs a context with the correct context for the sample's continuation. + Takes a example and constructs a context with the correct context for the example's continuation. Args: - sample (Dict): the sample from which to construct the context + example (Dict): the example from which to construct the context preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning add_answer (bool): this will always be true when calling this function for SchemaTaskDataset @@ -871,28 +871,28 @@ def _construct_context(self, sample, preceding_text: str = '', add_answer: bool str: the single correct context for a given continuation """ - context_options = sample['context_options'] - gold_idx = sample['gold'] - continuation = sample['continuation'] + context_options = example['context_options'] + gold_idx = example['gold'] + continuation = example['continuation'] context = context_options[gold_idx] if len(preceding_text) > 0: context = f'{self.example_delimiter}{context}' context = f'{context}{self.continuation_delimiter}{continuation}' return context - def _construct_multiple_contexts(self, sample: Dict, preceding_text: str = '') -> str: + def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') -> str: """ - Takes a sample and constructs all contexts. Optionally, appends this to preceeding text (such as a + Takes a example and constructs all contexts. Optionally, appends this to preceeding text (such as a prompt or fewshot examples). 
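Once per-option likelihoods are in hand, the partial-evaluation scoring mentioned in the schema docstring reduces to an argmax; the numbers below are invented.

def pick_schema_option(per_option_logprobs):
    """Choose the context option under which the shared continuation is most likely."""
    return max(range(len(per_option_logprobs)), key=lambda i: per_option_logprobs[i])

print(pick_schema_option([-12.7, -9.3, -15.1]))   # -> 1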
Args: - sample (Dict): the sample from which to construct the context + example (Dict): the example from which to construct the context preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning Returns: - list: all context options for the selected sample with formatting + list: all context options for the selected example with formatting """ - context_options = sample['context_options'] + context_options = example['context_options'] if len(preceding_text) > 0: if self.strip_data: cont_del = self.continuation_delimiter.rstrip() @@ -1095,10 +1095,10 @@ def get_max_prompt_length(self) -> int: int: maximum prompt length """ max_prompt_length = 0 - for sample in self.dataset: + for example in self.dataset: max_prompt_length = max( max_prompt_length, - len(sample['preamble']['input_ids'] + sample['context']['input_ids']), + len(example['preamble']['input_ids'] + example['context']['input_ids']), ) return max_prompt_length @@ -1157,8 +1157,8 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: 'use_cache': True }, } - for sample in data: - context_enc = sample['preamble']['input_ids'] + sample['context']['input_ids'] + for example in data: + context_enc = example['preamble']['input_ids'] + example['context']['input_ids'] inp, _ = _make_padded_input( context_enc, [], @@ -1168,13 +1168,13 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: ) batch['input_ids'].append(inp) - batch['prompts'].append(sample['prompt_text']) - batch['tests'].append(sample['test']) - batch['labels'].append(sample['canonical_solution']) - batch['entry_points'].append(sample['entry_point']) - batch['test_inputs'].append(sample['test_inputs']) - batch['test_outputs'].append(sample['test_outputs']) - batch['languages'].append(sample['language']) + batch['prompts'].append(example['prompt_text']) + batch['tests'].append(example['test']) + batch['labels'].append(example['canonical_solution']) + batch['entry_points'].append(example['entry_point']) + batch['test_inputs'].append(example['test_inputs']) + batch['test_outputs'].append(example['test_outputs']) + batch['languages'].append(example['language']) batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index ae4b477445..5d54dce362 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -30,16 +30,16 @@ def test_fewshot_sample_idxs(): rng = random.Random(1234) - fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=4, sample_idx=4, rng=rng) + fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=4, example_idx=4, rng=rng) assert fewshot_idxs == {0, 1, 2, 3} - fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=5, sample_idx=4, rng=rng) + fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=5, example_idx=4, rng=rng) assert fewshot_idxs == {0, 1, 2, 3} - fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=500, sample_idx=4, rng=rng) + fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=5, num_fewshot=500, example_idx=4, rng=rng) assert fewshot_idxs == {0, 1, 2, 3} - fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=10, num_fewshot=7, sample_idx=4, rng=rng) + fewshot_idxs = _get_fewshot_sample_idxs(dataset_size=10, num_fewshot=7, example_idx=4, 
rng=rng) assert len(fewshot_idxs) == 7 and 4 not in fewshot_idxs From bad4b30b9e938d7a135ebbb26d9bf8929c762e53 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 16:35:05 +0000 Subject: [PATCH 049/116] improve comment --- composer/datasets/in_context_learning_evaluation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 6785bc504a..03a96e0b4a 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -35,7 +35,6 @@ def _check_if_huggingface_uri(uri: str) -> bool: """ Takes a dataset uri and checks if it's a HuggingFace dataset uri. Returns False if a backend uri is present (ie 's3://', 'oci://') or if the uri is a local file. - Returns True otherwise. Args: uri (str): uri as a string @@ -43,11 +42,11 @@ def _check_if_huggingface_uri(uri: str) -> bool: bool: result of parsing uri as a HF uri """ backend, _, path = parse_uri(uri) + # If there's any backend, it's a cloud OCI and not HF if backend == '': _, ext = os.path.splitext(path) # If there's any extension, it's a link to a local file. If no extention, HF path return ext == '' - # If there's any backend, it's a cloud OCI and not HF return False From bbd00b50682086ccddd0b1a2d4a9c7622c1c27b1 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 16:58:06 +0000 Subject: [PATCH 050/116] edit RAGtask --- .../in_context_learning_evaluation.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index e917edaa7e..7d5917499c 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -191,7 +191,9 @@ def __init__(self, stacked_keys: List[str] = None, dont_split_keys: List[str] = None, list_split_keys: List[str] = None, - normal_split_keys: List[str] = None): + normal_split_keys: List[str] = None, + ): + self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -199,8 +201,8 @@ def __init__(self, self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot # TODO: check this is correct for all dataset types + # TODO: change how this is set, using default is unintuitive rn self.padding_side = 'left' - self.prelimiter = prelimiter self.example_delimiter = example_delimiter self.continuation_delimiter = continuation_delimiter @@ -484,6 +486,7 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] +# TODO: write tests for this class class InContextLearningRAGGenerationTaskDataset(InContextLearningDataset): """A dataset that construct batches for in-context learning RAG generation evaluation Rag generation tasks evaluate a model's ability to answer questions based on passages. @@ -506,23 +509,24 @@ def __init__( self.passage_query_delimiter = passage_query_delimiter super().__init__(*args, **kwargs) - def _construct_context(self, sample: dict, preceding_text: str = '', add_answer: bool = False): + def _construct_context(self, example: dict, preceding_text: str = '', add_answer: bool = False): """ - Takes a sample and constructs a context. Optionally, appends this to preceeding text (such as a + Takes a example and constructs a context. 
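The URI heuristic reworked in the 'improve comment' commit above (and removed two commits later in favor of an explicit prefix) amounts to the following; it is sketched here with urllib rather than composer's parse_uri so it runs standalone.

import os
from urllib.parse import urlparse

def looks_like_hf_dataset(uri: str) -> bool:
    """True only when there is no storage backend (s3://, oci://, gcs://) and no file extension."""
    if urlparse(uri).scheme:
        return False                      # cloud object store or other remote backend
    _, ext = os.path.splitext(uri)
    return ext == ''                      # an extension means a local file; bare paths look like HF repo ids

for uri in ['mosaicml/instruct-v3', 's3://bucket/data.jsonl', 'tests/local_data/hellaswag_small.jsonl']:
    print(uri, looks_like_hf_dataset(uri))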
Optionally, appends this to preceeding text (such as a prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) Args: - sample (dict): the sample from which to construct the context + example (dict): the example from which to construct the context preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) Returns: str: The constructed context. The default output context is - formatted as follows: f'{self.prelimiter}{sample['self.passages_key']}{sample[self.context_key]}{self.continuation_delimiter}' + formatted as follows: f'{self.prelimiter}{example['self.passages_key']}{example[self.context_key]}{self.continuation_delimiter}' """ passages = self.passage_delimiter.lstrip('\n ') - passages += f'{self.passage_delimiter}'.join(sample['passages']) - query = sample['query'] + passages += f'{self.passage_delimiter}'.join(example['passages']) + query = example['query'] + # TODO: add few_shot capabilities context = f'{self.prelimiter}{passages}{self.passage_query_delimiter}{query}' return context From a4b63b7bbf2b569576c24991708e2b5280604588 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 00:04:26 +0000 Subject: [PATCH 051/116] rm hf parsing func --- .../in_context_learning_evaluation.py | 109 +++++++----------- .../test_in_context_learning_datasets.py | 45 ++++---- 2 files changed, 60 insertions(+), 94 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 7d5917499c..7b8c69c282 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -14,7 +14,7 @@ from composer.core import DataSpec from composer.core.data_spec import _default_split_batch, _split_list -from composer.utils import MissingConditionalImportError, dist, get_file, parse_uri +from composer.utils import MissingConditionalImportError, dist, get_file if TYPE_CHECKING: import transformers @@ -31,25 +31,6 @@ ] -def _check_if_huggingface_uri(uri: str) -> bool: - """ - Takes a dataset uri and checks if it's a HuggingFace dataset uri. - Returns False if a backend uri is present (ie 's3://', 'oci://') or if the uri is a local file. - Args: - uri (str): uri as a string - - Returns: - bool: result of parsing uri as a HF uri - """ - backend, _, path = parse_uri(uri) - # If there's any backend, it's a cloud OCI and not HF - if backend == '': - _, ext = os.path.splitext(path) - # If there's any extension, it's a link to a local file. 
If no extention, HF path - return ext == '' - return False - - def strip_data(example: Dict) -> Dict: return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()} @@ -171,29 +152,30 @@ class InContextLearningDataset(Dataset): normal_split_keys (List(str)): keys in the ICL dictionary that will be split into chunks regularly """ - def __init__(self, - dataset_uri: str, - tokenizer: transformers.PreTrainedTokenizerBase, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - fewshot_random_seed: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - prelimiter: str = '', - context_key: str = 'context', - answer_key: str = 'answer', - strip_dataset: bool = True, - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None, - stacked_keys: List[str] = None, - dont_split_keys: List[str] = None, - list_split_keys: List[str] = None, - normal_split_keys: List[str] = None, - ): - + def __init__( + self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + fewshot_random_seed: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + prelimiter: str = '', + context_key: str = 'context', + answer_key: str = 'answer', + strip_dataset: bool = True, + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None, + stacked_keys: List[str] = None, + dont_split_keys: List[str] = None, + list_split_keys: List[str] = None, + normal_split_keys: List[str] = None, + ): + self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -271,7 +253,8 @@ def _read_dataset(self, conda_package='datasets', conda_channel='conda-forge', ) from e - if _check_if_huggingface_uri(dataset_uri): + if 'hf://' in dataset_uri: + dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) if hf_parsing_map: dataset_parsing_func = lambda example: { @@ -343,12 +326,12 @@ def _construct_context(self, example: Dict, preceding_text: str = '', add_answer def _get_answer_from_example(self, example: Dict[str, Any]) -> str: """ - Returns the answer from the example + Returns the answer from the example Args: example (Dict): the example from which to retrieve the answer Returns: - str: the answer in the example + str: the answer in the example """ return example[self.answer_key] @@ -496,13 +479,11 @@ class InContextLearningRAGGenerationTaskDataset(InContextLearningDataset): passage_query_delimiter (str): Delimiter to place between the last passage and the query. 
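The explicit convention this commit switches to can be sketched as follows; the dataset names are illustrative.

def resolve_dataset_uri(dataset_uri: str):
    """An explicit 'hf://' prefix marks a HuggingFace dataset; anything else is treated as a file to download."""
    if 'hf://' in dataset_uri:
        return 'huggingface', dataset_uri.replace('hf://', '')
    return 'file', dataset_uri

print(resolve_dataset_uri('hf://mosaicml/test_dataset'))     # ('huggingface', 'mosaicml/test_dataset')
print(resolve_dataset_uri('s3://bucket/lambada.jsonl'))      # ('file', 's3://bucket/lambada.jsonl')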
""" - def __init__( - self, - passage_delimiter: str = '\nPassage: ', - passage_query_delimiter: str = '\nQuery: ', - *args, - **kwargs - ): + def __init__(self, + passage_delimiter: str = '\nPassage: ', + passage_query_delimiter: str = '\nQuery: ', + *args, + **kwargs): kwargs.pop('passage_delimiter', None) kwargs.pop('passage_query_delimiter', None) self.passage_delimiter = passage_delimiter @@ -546,7 +527,6 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): tokenized_example['answer'] = self.tokenizer(answer, add_special_tokens=False) return tokenized_example - def collate_fn(self, data): """ The function that the dataloader uses to accumulate data into batches @@ -556,19 +536,12 @@ def collate_fn(self, data): Returns: dict: dictionary for a single batch """ - batch = { - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [], - 'answer_indices': [] - } + batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': [], 'answer_indices': []} for data_pair in data: context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] answer_enc = data_pair['answer']['input_ids'] - inp, answer_span = _make_padded_input(context_enc, answer_enc, self.max_seq_len, - self.pad_tok_id) + inp, answer_span = _make_padded_input(context_enc, answer_enc, self.max_seq_len, self.pad_tok_id) batch['input_ids'].append(inp) batch['answer_indices'].append(answer_span) batch['labels'].append(inp) @@ -1167,10 +1140,7 @@ def __init__( stacked_keys=['input_ids'], dont_split_keys=['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'], normal_split_keys=['input_ids', 'attention_mask'], - list_split_keys=[ - 'labels', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', - 'languages' - ], + list_split_keys=['labels', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', 'languages'], *args, **kwargs, ) @@ -1398,7 +1368,7 @@ def build_icl_dataloader( destination_path=destination_path, fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, + hf_parsing_map=hf_parsing_map, ) effective_batchsize = batch_size else: @@ -1454,7 +1424,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo conda_package='datasets', conda_channel='conda-forge', ) from e - if _check_if_huggingface_uri(dataset_uri): + if 'hf://' in dataset_uri: + dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) if hf_parsing_map: dataset_parsing_func = lambda example: { diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 5d54dce362..f4834ee871 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -17,7 +17,7 @@ from composer.core import DataSpec from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, _get_fewshot_sample_idxs, _make_padded_input, - get_icl_task_dataloader, _check_if_huggingface_uri) + get_icl_task_dataloader) from composer.loggers import InMemoryLogger from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) @@ -74,15 +74,6 @@ def test_batch_padding_logic(tiny_gpt2_tokenizer): assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047 -@pytest.mark.parametrize('uri', 
['tests/datasets/local_data/hellaswag_small.jsonl', 's3://oci/url/link.json', 'gcs://blah/blah.json']) -def test_check_if_huggingface_uri_when_not_hf_uri(uri): - assert not _check_if_huggingface_uri(uri) - - -@pytest.mark.parametrize('uri', ['L4NLP/LEval', 'mosaicml/instruct-v3']) -def test_check_if_huggingface_uri_when_hf_uri(uri): - assert _check_if_huggingface_uri(uri) - @pytest.mark.parametrize('padding_side', ['left', 'right', 'middle']) def test_make_padding(tiny_gpt2_tokenizer, padding_side): context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] @@ -872,7 +863,8 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left mod = types.ModuleType('test_module') - for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'], batch['test_outputs'], batch['entry_points']): + for prompt, solution, inputs, outputs, entry_point in zip(batch['prompts'], batch['labels'], batch['test_inputs'], + batch['test_outputs'], batch['entry_points']): exec(prompt + solution, mod.__dict__) for test_input, test_output in zip(inputs, outputs): result = mod.__dict__[entry_point](*eval(test_input)) @@ -1082,7 +1074,8 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t @pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') @device('gpu') @world_size(1, 2) -def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model, tiny_gpt2_tokenizer, tmp_path): +def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model, + tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1134,7 +1127,8 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f @pytest.mark.parametrize('num_fewshot', [0, 5]) @device('gpu') @world_size(1, 2) -def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, tiny_gpt2_model): +def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, + tiny_gpt2_model): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1266,7 +1260,8 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, num_fewsh @pytest.mark.parametrize('num_fewshot', [0, 5]) @device('gpu') @world_size(1, 2) -def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): +def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, + tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1308,7 +1303,8 @@ def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g @pytest.mark.parametrize('num_fewshot', [5]) @device('gpu') @world_size(1, 2) -def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): +def test_qa_task_with_cot_evaluation(device, world_size, 
num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, + tmp_path): pytest.importorskip('datasets') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -1364,7 +1360,8 @@ def test_code_eval_requires_valid_envvar(monkeypatch): @pytest.mark.parametrize('generations_per_sample', range(1, 3)) @device('gpu') @world_size(1, 2) -def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, dataset_uri, tmp_path, generations_per_sample): +def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, dataset_uri, tmp_path, + generations_per_sample): pytest.importorskip('datasets') monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger @@ -1413,8 +1410,8 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, d @pytest.mark.parametrize('generations_per_sample', range(1, 3)) @device('gpu') @world_size(1, 2) -def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer, tiny_t5_model, - tmp_path, generations_per_sample): +def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_t5_tokenizer, + tiny_t5_model, tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') @@ -1461,8 +1458,8 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_few @pytest.mark.filterwarnings(r'ignore: Input length of input_ids is') @device('gpu') @world_size(1, 2) -def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, - tmp_path, generations_per_sample): +def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, + tiny_gpt2_model, tmp_path, generations_per_sample): pytest.importorskip('datasets') torch.cuda.empty_cache() monkeypatch.setenv('CODE_EVAL_DEVICE', 'LOCAL') @@ -1542,7 +1539,7 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): assert second_batch_without_last_word.count(' UNIQUE ') == 1 -@pytest.mark.parametrize('dataset_uri', ['mosaicml/test_dataset']) +@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset']) @pytest.mark.parametrize('num_fewshot', [0, 1]) @pytest.mark.parametrize('prompt_string', ['Complete the voiceline: ', '']) @pytest.mark.parametrize('hf_loading_vars', [{ @@ -1586,20 +1583,19 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path max_idx = max(batch['continuation_indices'][0]).item() assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' and me.' - decoded_batch = [tokenizer.decode(row[row != tokenizer.eos_token_id]) for row in batch['input_ids']] assert decoded_batch[0] == "Looks like it's just you and me." assert decoded_batch[1] == "There's a fine line between bravery and stupidity." -@pytest.mark.parametrize('dataset_uri', ['mosaicml/test_dataset']) +@pytest.mark.parametrize('dataset_uri', ['hf://mosaicml/test_dataset']) @pytest.mark.parametrize('num_fewshot', [0, 1]) @pytest.mark.parametrize('prompt_string', ['What spell does this invoke? 
', '']) @pytest.mark.parametrize('hf_loading_vars', [{ 'split': 'test', 'name': 'invoker', }]) -@pytest.mark.parametrize('hf_parsing_map', [{"context":['quas','wex','exort'],"answer":['spell']}]) +@pytest.mark.parametrize('hf_parsing_map', [{'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}]) def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, hf_parsing_map): pytest.importorskip('datasets') @@ -1638,7 +1634,6 @@ def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_pat decoded_batch = tokenizer.batch_decode(batch['input_ids']) assert all(item.count('Orbs: ') == num_fewshot + 1 for item in decoded_batch) - # import IPython; IPython.embed() assert all(item.count('\nSpell:') == num_fewshot + 1 for item in decoded_batch) if len(prompt_string) > 0: From ab87b708202e3e8a0483479103342f70003d1c8b Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 00:21:45 +0000 Subject: [PATCH 052/116] fix docstring, rename fewshot fun --- composer/datasets/in_context_learning_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 7b8c69c282..573a3a6b6f 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -144,7 +144,7 @@ class InContextLearningDataset(Dataset): strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs, so unless whitespace should be preserved (for example in code), this should be set to True. hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. - hf_parsing_map (Dict[str:List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + hf_parsing_map (Dict[str, List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. stacked_keys (List(str)): keys in the output batch that must be converted to tensors with torch.stack() dont_split_keys (List(str)): keys in the ICL dictionary that should not be split among batches. 
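For reference, a minimal sketch of how an hf_parsing_map like the one in this test can be applied to a single HuggingFace row; the column names and values mirror the test fixture, and the helper name is made up.

def apply_parsing_map(hf_row, hf_parsing_map):
    """Build ICL fields by concatenating the mapped HF columns with single spaces."""
    return {icl_key: ' '.join(str(hf_row[col]) for col in cols) for icl_key, cols in hf_parsing_map.items()}

row = {'quas': 'quas', 'wex': 'wex', 'exort': 'exort', 'spell': 'defeaning blast'}
parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}
print(apply_parsing_map(row, parsing_map))   # {'context': 'quas wex exort', 'answer': 'defeaning blast'}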
@@ -268,7 +268,7 @@ def _read_dataset(self, dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) return dataset - def _generate_few_shot_text( + def _generate_few_shot_prompt( self, num_fewshot: int, example_idx: int, @@ -397,7 +397,7 @@ def _prep_example( Returns: Dict: contains a dictionary with the tokenized data """ - prompt_and_fewshot = self._generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) + prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self._construct_context(example, prompt_and_fewshot, add_answer=False) tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example @@ -991,7 +991,7 @@ def _prep_example( Returns: Dict: contains a dictionary with the tokenized data """ - prompt_and_fewshot = self._generate_few_shot_text(num_fewshot, example_idx, prompt_string, fewshot_rng) + prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot) tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example From 44694b009e2df9b88484549fa5e423c9cc51a962 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 01:49:42 +0000 Subject: [PATCH 053/116] docstring --- .../in_context_learning_evaluation.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 573a3a6b6f..60b82b670f 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -48,7 +48,25 @@ def _make_padded_input(context_enc: List, max_seq_len: int, pad_tok_id: int, padding_side: str = 'right') -> Tuple[torch.tensor, torch.tensor]: - # TODO: docstring + """ + Takes an encoded context and continuation and clips the beginning of the context if they're too long. + Adds the padding token to the specified side. + + Args: + context_enc (List): the encoded input to the model + continuation_enc (List): the encoded desired output for the example + max_seq_list (int): maximum length sequences can be + pad_tok_id (int): the token id we pad with + padding_side (str): which side to pad the context on. Can be 'right' or 'left + + Returns: + input (torch.tensor): the padded and encoded context + continuation_span (torch.tensor): the _inclusive_ range of indices corresponding to the continuation + + + """ + + # TODO: Not obvious this happens here, should probably be it's own funciton if len(continuation_enc) + len(context_enc) > max_seq_len: # clip from the end context_max_subseq_len = max_seq_len - len(continuation_enc) @@ -59,7 +77,6 @@ def _make_padded_input(context_enc: List, context_enc = context_enc[-(context_max_subseq_len):] - # continuation span is the _inclusive_ range of indices corresponding to the continuation continuation_span = torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc))) inp = torch.tensor( (context_enc + continuation_enc), @@ -718,7 +735,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: """ - The function that the dataloader uses to accumulate data into batches. 
+ Accumulate examples into batches Args: data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) From a12da95115023bc1d29264b5dd0e3e5fe53dd058 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 09:26:28 +0000 Subject: [PATCH 054/116] change default split_batch to check types --- composer/datasets/in_context_learning_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 60b82b670f..0d1d4115df 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -469,15 +469,15 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: }) chunked = {} for k, v in batch.items(): - if k in self.dont_split_keys: + if type(v) in [str, int, dict]: # Defer broadcasting until we know num_chunks pass - elif k in self.list_split_keys: + elif type(v) == list: chunked[k] = _split_list(v, microbatch_size) - elif k in self.normal_split_keys: + elif type(v) == torch.Tensor: chunked[k] = _default_split_batch(v, microbatch_size) else: - raise ValueError(f'Unexpected key {k}') + raise ValueError(f'Unexpected key {k}, value , type {type(v)}') num_chunks = len(chunked['input_ids']) for k, v in batch.items(): if isinstance(v, (int, float, str, bool, Dict)): From 7ad31991793726a47f05219c7524a7d18345cff5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 09:38:42 +0000 Subject: [PATCH 055/116] remove need to set split_keys --- .../in_context_learning_evaluation.py | 56 ++++++------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 0d1d4115df..fe1b745cd2 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -188,9 +188,6 @@ def __init__( hf_loading_vars: Dict = None, hf_parsing_map: Dict = None, stacked_keys: List[str] = None, - dont_split_keys: List[str] = None, - list_split_keys: List[str] = None, - normal_split_keys: List[str] = None, ): self.tokenizer = tokenizer @@ -208,9 +205,6 @@ def __init__( self.context_key = context_key self.answer_key = answer_key self.stacked_keys = stacked_keys or ['input_ids', 'labels'] - self.dont_split_keys = dont_split_keys or [] - self.list_split_keys = list_split_keys or [] - self.normal_split_keys = normal_split_keys or [] hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -462,14 +456,9 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: # Don't split kwargs that don't change # Normally split torch tensors # List split lists of strings - self.check_defaults_are_set({ - 'dont_split_keys': self.dont_split_keys, - 'list_split_keys': self.list_split_keys, - 'normal_split_keys': self.normal_split_keys - }) chunked = {} for k, v in batch.items(): - if type(v) in [str, int, dict]: + if type(v) in [str, float, int, dict, bool]: # Defer broadcasting until we know num_chunks pass elif type(v) == list: @@ -477,10 +466,10 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: elif type(v) == torch.Tensor: chunked[k] = _default_split_batch(v, microbatch_size) else: - raise ValueError(f'Unexpected key {k}, value , type {type(v)}') + raise ValueError(f'Unexpected 
value type {type(v)} with key {k}') num_chunks = len(chunked['input_ids']) for k, v in batch.items(): - if isinstance(v, (int, float, str, bool, Dict)): + if isinstance(v, (int, float, str, bool, dict)): chunked[k] = [v] * num_chunks return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] @@ -588,9 +577,6 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False super().__init__(stacked_keys=['input_ids'], - dont_split_keys=['mode', 'generation_length', 'generation_kwargs', 'cot_delimiter'], - normal_split_keys=['input_ids', 'attention_mask'], - list_split_keys=['labels'], *args, **kwargs) @@ -785,12 +771,9 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): def __init__(self, choices_key: str = 'choices', *args, **kwargs): super().__init__(context_key='query', - dont_split_keys=['mode'], - normal_split_keys=['gold_indices'], *args, **kwargs) self.num_choices = len(self.dataset[0][choices_key]) - self.real_split_keys = ['input_ids', 'labels', 'attention_mask'] def _get_answer_from_example(self, example: Dict) -> str: """ @@ -888,32 +871,32 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: Returns: list: list of chunked batches """ - self.check_defaults_are_set({ - 'dont_split_keys': self.dont_split_keys, - 'normal_split_keys': self.normal_split_keys - }) chunked = {} for k, v in batch.items(): - if k in self.dont_split_keys: + if type(v) in [str, int, dict, bool]: # Defer broadcasting primitives until we know num_chunks pass - elif k == 'continuation_indices': - # List of lists, so we have to directly call _split_list - chunked[k] = _split_list(v, microbatch_size * self.num_choices) - elif k == 'choice_groupings': - # List of lists, so we have to directly call _split_list - chunked[k] = _split_list(v, microbatch_size) - elif k in self.real_split_keys: + elif type(v) == list: + element_type = type(v[0]) + # list of tensors - 'continuation_indices' + if element_type == torch.Tensor: + chunked[k] = _split_list(v, microbatch_size * self.num_choices) + # list of tuples - 'choice_groupings' + elif element_type == tuple: + chunked[k] = _split_list(v, microbatch_size) + # list - 'gold_indices' + else: + chunked[k] = _default_split_batch(v, microbatch_size) + elif type(v) == torch.Tensor: chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices) - elif k in self.normal_split_keys: - chunked[k] = _default_split_batch(v, microbatch_size) else: - raise ValueError(f'Unexpected key {k}') + raise ValueError(f'Unexpected value type {type(v)} with key {k}') num_chunks = len(chunked['input_ids']) # Broadcast primitives to all chunks for k, v in batch.items(): if isinstance(v, (int, float, str, bool)): chunked[k] = [v] * num_chunks + return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] @@ -1155,9 +1138,6 @@ def __init__( answer_key='canonical_solution', strip_dataset=False, stacked_keys=['input_ids'], - dont_split_keys=['mode', 'generation_length', 'pass_at_k', 'generation_kwargs'], - normal_split_keys=['input_ids', 'attention_mask'], - list_split_keys=['labels', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'prompts', 'languages'], *args, **kwargs, ) From e5de590cdc42f61d2d98c861c25f2c722ad0bceb Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 09:51:45 +0000 Subject: [PATCH 056/116] doc string update --- composer/datasets/in_context_learning_evaluation.py | 4 +--- 1 file changed, 1 insertion(+), 3 
deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index fe1b745cd2..a967b88f9b 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -164,9 +164,6 @@ class InContextLearningDataset(Dataset): hf_parsing_map (Dict[str, List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. stacked_keys (List(str)): keys in the output batch that must be converted to tensors with torch.stack() - dont_split_keys (List(str)): keys in the ICL dictionary that should not be split among batches. - list_split_keys (List(str)): keys in the ICL dictionary that will be split as lists, resulting in microbatch_size sections of the list being inserted in every batch - normal_split_keys (List(str)): keys in the ICL dictionary that will be split into chunks regularly """ def __init__( @@ -670,6 +667,7 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: for example in data: aliases = example['aliases'] context_enc = example['preamble']['input_ids'] + example['context']['input_ids'] + # TODO: if no cont_span, then don't need to stack labels inp, _ = _make_padded_input( context_enc, [], From 24c50df1254149105209bdd4afd5ca9d3a8e93e5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 10:16:56 +0000 Subject: [PATCH 057/116] improve comments --- .../in_context_learning_evaluation.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index a967b88f9b..2788d45c25 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -286,7 +286,7 @@ def _generate_few_shot_prompt( """ Formats the fewshot prompt for test example `example_idx`. - Randomly select `num_fewshot` samples from the dataset (excluding the example at `example_idx`) and constructs + Randomly selects `num_fewshot` samples from the dataset (excluding the example at `example_idx`) and constructs contextes with answers appended. Returns the formatted prompt_string + concatenated list of formatted few shot examples as a string. @@ -311,8 +311,8 @@ def _generate_few_shot_prompt( def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes an example and constructs a context. Optionally adds the correct answer (for fewshot examples) - and handles example delemiters + Takes an example and constructs a context, ie the input the model reads for this example. + Optionally adds the correct answer (for fewshot examples) and handles example delemiters Args: example (Dict): the example from which to construct the context @@ -348,7 +348,7 @@ def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: If the input_ids is empty then input_ids['input_ids'] will be a 0-length List, unless the tokenizer adds special tokens to empty strings (e.g. 
OPT tokenizer) If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, - as the specific eval question's prompt will follow theinput_ids + as the specific eval question's prompt will follow the input_ids Args: input_ids (List): the tokenized input @@ -1195,19 +1195,19 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: 'input_ids': [], 'mode': 'generate', 'labels': [], - 'prompts': [], # list of prompts - 'tests': [], # list of tests - 'entry_points': [], # list of entry points - 'test_inputs': [], # list of test inputs - 'test_outputs': [], # list of test outputs - 'languages': [], # list of languages + 'prompts': [], + 'tests': [], + 'entry_points': [], + 'test_inputs': [], + 'test_outputs': [], + 'languages': [], 'pass_at_k': self.pass_at_k, 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, # TODO: specify this? 'num_beams': 1, # single beam - 'num_return_sequences': self.generations_per_sample, # how many gens per prompt + 'num_return_sequences': self.generations_per_sample, 'do_sample': True, 'top_p': self.top_p, 'top_k': self.top_k, From 6e412250190457a700e816915267fc0204364d6a Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 10:31:11 +0000 Subject: [PATCH 058/116] rm stacked_keys for tokenize_labels bool --- .../in_context_learning_evaluation.py | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 2788d45c25..0108c09bff 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -163,7 +163,7 @@ class InContextLearningDataset(Dataset): hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict[str, List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. - stacked_keys (List(str)): keys in the output batch that must be converted to tensors with torch.stack() + tokenize_labels (bool): Whether or not the labels should be tokenized. 
Used in metric calculation and for direct comparison """ def __init__( @@ -184,7 +184,7 @@ def __init__( strip_dataset: bool = True, hf_loading_vars: Dict = None, hf_parsing_map: Dict = None, - stacked_keys: List[str] = None, + tokenize_labels: bool = True, ): self.tokenizer = tokenizer @@ -201,7 +201,7 @@ def __init__( self.continuation_delimiter = continuation_delimiter self.context_key = context_key self.answer_key = answer_key - self.stacked_keys = stacked_keys or ['input_ids', 'labels'] + self.tokenize_labels = tokenize_labels hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -426,8 +426,11 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: 'labels': [], } for data_pair in data: + # TODO: move this to tokenize_example context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] + # TODO: use self.answer_key + # TODO: write a boolean tokenize_labels inp, continuation_span = _make_padded_input(context_enc, data_pair['continuation']['input_ids'], self.max_seq_len, self.pad_tok_id) @@ -435,7 +438,10 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch['continuation_indicies'].append(continuation_span) batch['labels'].append(inp) - batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -549,7 +555,9 @@ def collate_fn(self, data): batch['answer_indices'].append(answer_span) batch['labels'].append(inp) - batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -573,9 +581,7 @@ class InContextLearningQATaskDataset(InContextLearningDataset): def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False - super().__init__(stacked_keys=['input_ids'], - *args, - **kwargs) + super().__init__(tokenize_labels=False, *args, **kwargs) self.max_answer_length = self.get_max_answer_length() @@ -667,7 +673,6 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: for example in data: aliases = example['aliases'] context_enc = example['preamble']['input_ids'] + example['context']['input_ids'] - # TODO: if no cont_span, then don't need to stack labels inp, _ = _make_padded_input( context_enc, [], @@ -679,7 +684,9 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: batch['input_ids'].append(inp) batch['labels'].append(aliases) - batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -737,7 +744,9 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: batch['continuation_indices'].append(continuation_span) batch['labels'].append(inp) - batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) batch['attention_mask'] = 
~(batch['input_ids'] == self.pad_tok_id) return batch @@ -846,7 +855,9 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. - batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -1059,7 +1070,9 @@ def collate_fn(self, data) -> Dict[str, Any]: # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. - batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -1135,7 +1148,7 @@ def __init__( context_key='prompt', answer_key='canonical_solution', strip_dataset=False, - stacked_keys=['input_ids'], + tokenize_labels=False, *args, **kwargs, ) @@ -1234,7 +1247,9 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch['test_outputs'].append(example['test_outputs']) batch['languages'].append(example['language']) - batch = {k: torch.stack(v) if k in self.stacked_keys else v for k, v in batch.items()} + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch From 7bad80c738b55cc85f8ccfdaf0c4f1563796075d Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 10:49:05 +0000 Subject: [PATCH 059/116] initial wip in comments --- .../in_context_learning_evaluation.py | 56 +++++++++++++------ 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 0108c09bff..f5ad474228 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -60,9 +60,9 @@ def _make_padded_input(context_enc: List, padding_side (str): which side to pad the context on. Can be 'right' or 'left Returns: - input (torch.tensor): the padded and encoded context + input (torch.tensor): the padded and encoded context continuation_span (torch.tensor): the _inclusive_ range of indices corresponding to the continuation - + """ @@ -201,7 +201,7 @@ def __init__( self.continuation_delimiter = continuation_delimiter self.context_key = context_key self.answer_key = answer_key - self.tokenize_labels = tokenize_labels + self.tokenize_labels = tokenize_labels hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -311,7 +311,7 @@ def _generate_few_shot_prompt( def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes an example and constructs a context, ie the input the model reads for this example. 
+ Takes an example and constructs a context, ie the input the model reads for this example. Optionally adds the correct answer (for fewshot examples) and handles example delemiters Args: @@ -429,8 +429,9 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: # TODO: move this to tokenize_example context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] - # TODO: use self.answer_key - # TODO: write a boolean tokenize_labels + # TODO: use self.answer_key in other classes + # TODO: use tokenize_labels in other classes + # TODO: extract input_ids in tokenize_example inp, continuation_span = _make_padded_input(context_enc, data_pair['continuation']['input_ids'], self.max_seq_len, self.pad_tok_id) @@ -441,7 +442,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch['input_ids'] = torch.stack(batch['input_ids']) if self.tokenize_labels: batch['labels'] = torch.stack(batch['labels']) - + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -734,6 +735,29 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: Dict: dictionary for a single batch """ batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} + # self.default_batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} + # batch = self.defatul_batch + # batch_mapping = { + # "input_ids": 'context', + # "continuation_indices": 'continuation', + # "labels": 'context' + # } + # for data_pair in data: + # for batch_key, data_key in batch_mapping: + # if batch_key == 'input_ids': + # if self.tokenize_labels: + # inp, cont_span = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) + # batch['input_ids'].append(inp) + # batch['continuation_indices'].append(cont_span) + # else: + # # TODO: just make this cont_span return as empty list if dat_pari[ans_key] is none? + # # answer = data_pair[self.answer_key] if self.tokenize_label else [] + # inp, cont_span = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) + # # TODO: if label is also inp, what then? maybe check is batch_key == self.context_key + # batch['input_ids'].append(inp) + # else: + # batch[batch_key].append(data_pair[data_key]) + for data_pair in data: context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] continuation_enc = data_pair['continuation']['input_ids'] @@ -777,9 +801,7 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ def __init__(self, choices_key: str = 'choices', *args, **kwargs): - super().__init__(context_key='query', - *args, - **kwargs) + super().__init__(context_key='query', *args, **kwargs) self.num_choices = len(self.dataset[0][choices_key]) def _get_answer_from_example(self, example: Dict) -> str: @@ -1208,19 +1230,19 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: 'input_ids': [], 'mode': 'generate', 'labels': [], - 'prompts': [], - 'tests': [], - 'entry_points': [], - 'test_inputs': [], - 'test_outputs': [], - 'languages': [], + 'prompts': [], + 'tests': [], + 'entry_points': [], + 'test_inputs': [], + 'test_outputs': [], + 'languages': [], 'pass_at_k': self.pass_at_k, 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, # TODO: specify this? 
'num_beams': 1, # single beam - 'num_return_sequences': self.generations_per_sample, + 'num_return_sequences': self.generations_per_sample, 'do_sample': True, 'top_p': self.top_p, 'top_k': self.top_k, From 393bc4bfc6f50b2534526f8fe63bfdcc1590f9e0 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 10:54:22 +0000 Subject: [PATCH 060/116] make _conv_tokens_to_tensors func --- .../in_context_learning_evaluation.py | 73 ++++++++----------- 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index f5ad474228..64ca511bfe 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -60,9 +60,9 @@ def _make_padded_input(context_enc: List, padding_side (str): which side to pad the context on. Can be 'right' or 'left Returns: - input (torch.tensor): the padded and encoded context + input (torch.tensor): the padded and encoded context continuation_span (torch.tensor): the _inclusive_ range of indices corresponding to the continuation - + """ @@ -201,7 +201,7 @@ def __init__( self.continuation_delimiter = continuation_delimiter self.context_key = context_key self.answer_key = answer_key - self.tokenize_labels = tokenize_labels + self.tokenize_labels = tokenize_labels hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -311,7 +311,7 @@ def _generate_few_shot_prompt( def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes an example and constructs a context, ie the input the model reads for this example. + Takes an example and constructs a context, ie the input the model reads for this example. Optionally adds the correct answer (for fewshot examples) and handles example delemiters Args: @@ -410,6 +410,13 @@ def _prep_example( tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example + def _convert_tokens_to_tensors(self, batch: Dict) -> Dict[str, Any]: + batch['input_ids'] = torch.stack(batch['input_ids']) + if self.tokenize_labels: + batch['labels'] = torch.stack(batch['labels']) + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + return batch + def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
@@ -439,11 +446,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch['continuation_indicies'].append(continuation_span) batch['labels'].append(inp) - batch['input_ids'] = torch.stack(batch['input_ids']) - if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch = self._convert_tokens_to_tensors(batch) return batch def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: @@ -556,10 +559,7 @@ def collate_fn(self, data): batch['answer_indices'].append(answer_span) batch['labels'].append(inp) - batch['input_ids'] = torch.stack(batch['input_ids']) - if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch = self._convert_tokens_to_tensors(batch) return batch @@ -685,10 +685,7 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: batch['input_ids'].append(inp) batch['labels'].append(aliases) - batch['input_ids'] = torch.stack(batch['input_ids']) - if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch = self._convert_tokens_to_tensors(batch) return batch @@ -736,7 +733,7 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: """ batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} # self.default_batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} - # batch = self.defatul_batch + # batch = self.defatul_batch # batch_mapping = { # "input_ids": 'context', # "continuation_indices": 'continuation', @@ -758,6 +755,8 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: # else: # batch[batch_key].append(data_pair[data_key]) + + for data_pair in data: context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] continuation_enc = data_pair['continuation']['input_ids'] @@ -768,10 +767,7 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: batch['continuation_indices'].append(continuation_span) batch['labels'].append(inp) - batch['input_ids'] = torch.stack(batch['input_ids']) - if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch = self._convert_tokens_to_tensors(batch) return batch @@ -801,7 +797,9 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ def __init__(self, choices_key: str = 'choices', *args, **kwargs): - super().__init__(context_key='query', *args, **kwargs) + super().__init__(context_key='query', + *args, + **kwargs) self.num_choices = len(self.dataset[0][choices_key]) def _get_answer_from_example(self, example: Dict) -> str: @@ -877,10 +875,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. 
- batch['input_ids'] = torch.stack(batch['input_ids']) - if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch = self._convert_tokens_to_tensors(batch) return batch def get_num_samples_in_batch(self, batch) -> int: @@ -1092,10 +1087,7 @@ def collate_fn(self, data) -> Dict[str, Any]: # since the batch may consist of multiple questions, the choice_groupings indicates # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. - batch['input_ids'] = torch.stack(batch['input_ids']) - if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch = self._convert_tokens_to_tensors(batch) return batch @@ -1230,19 +1222,19 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: 'input_ids': [], 'mode': 'generate', 'labels': [], - 'prompts': [], - 'tests': [], - 'entry_points': [], - 'test_inputs': [], - 'test_outputs': [], - 'languages': [], + 'prompts': [], + 'tests': [], + 'entry_points': [], + 'test_inputs': [], + 'test_outputs': [], + 'languages': [], 'pass_at_k': self.pass_at_k, 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, # TODO: specify this? 'num_beams': 1, # single beam - 'num_return_sequences': self.generations_per_sample, + 'num_return_sequences': self.generations_per_sample, 'do_sample': True, 'top_p': self.top_p, 'top_k': self.top_k, @@ -1269,10 +1261,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch['test_outputs'].append(example['test_outputs']) batch['languages'].append(example['language']) - batch['input_ids'] = torch.stack(batch['input_ids']) - if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch = self._convert_tokens_to_tensors(batch) return batch From 5a0448973253e1d58ba09a9842d5656153087439 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 30 Nov 2023 11:13:54 +0000 Subject: [PATCH 061/116] wip - sketch out batch_mappings --- .../in_context_learning_evaluation.py | 140 ++++++++++-------- 1 file changed, 76 insertions(+), 64 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 64ca511bfe..31918d1a48 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -60,9 +60,9 @@ def _make_padded_input(context_enc: List, padding_side (str): which side to pad the context on. 
Can be 'right' or 'left Returns: - input (torch.tensor): the padded and encoded context + input (torch.tensor): the padded and encoded context continuation_span (torch.tensor): the _inclusive_ range of indices corresponding to the continuation - + """ @@ -201,7 +201,7 @@ def __init__( self.continuation_delimiter = continuation_delimiter self.context_key = context_key self.answer_key = answer_key - self.tokenize_labels = tokenize_labels + self.tokenize_labels = tokenize_labels hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -311,7 +311,7 @@ def _generate_few_shot_prompt( def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes an example and constructs a context, ie the input the model reads for this example. + Takes an example and constructs a context, ie the input the model reads for this example. Optionally adds the correct answer (for fewshot examples) and handles example delemiters Args: @@ -426,29 +426,34 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ - batch = { - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [], + # TODO: move preamble + context to tokenize_example + # TODO: use self.answer_key in other classes + # TODO: use tokenize_labels in other classes + # TODO: extract input_ids in tokenize_example + batch = self.default_batch + batch_mapping = { + "input_ids": 'context', + "continuation_indices": 'continuation', + "labels": 'context' } for data_pair in data: - # TODO: move this to tokenize_example - context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] - - # TODO: use self.answer_key in other classes - # TODO: use tokenize_labels in other classes - # TODO: extract input_ids in tokenize_example - inp, continuation_span = _make_padded_input(context_enc, data_pair['continuation']['input_ids'], - self.max_seq_len, self.pad_tok_id) - - batch['input_ids'].append(inp) - batch['continuation_indicies'].append(continuation_span) - batch['labels'].append(inp) + for batch_key, data_key in batch_mapping: + if data_key == self.context_key: + if self.tokenize_labels: + # NOTE: this will be run twice if more than one batch_key uses the input (like in LM task) + inp, cont_span = _make_padded_input(data_pair[self.context_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) + batch[batch_key].append(inp) + batch['continuation_indices'].append(cont_span) + else: + inp, _ = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) + batch[batch_key].append(inp) + else: + batch[batch_key].append(data_pair[data_key]) batch = self._convert_tokens_to_tensors(batch) return batch + def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: """ Handling for certain specialty columns that must be split into batches in different formats. 
@@ -660,6 +665,10 @@ def collate_fn(self, data: Dict) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ + # batch_mapping = { + # 'input_ids': self.context_key, + # 'labels': 'aliases', + # } batch = { 'input_ids': [], 'mode': 'generate', @@ -703,6 +712,7 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', *args, **kwargs) + self.default_batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ @@ -731,41 +741,36 @@ def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ - batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} - # self.default_batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} - # batch = self.defatul_batch - # batch_mapping = { - # "input_ids": 'context', - # "continuation_indices": 'continuation', - # "labels": 'context' - # } - # for data_pair in data: - # for batch_key, data_key in batch_mapping: - # if batch_key == 'input_ids': - # if self.tokenize_labels: - # inp, cont_span = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) - # batch['input_ids'].append(inp) - # batch['continuation_indices'].append(cont_span) - # else: - # # TODO: just make this cont_span return as empty list if dat_pari[ans_key] is none? - # # answer = data_pair[self.answer_key] if self.tokenize_label else [] - # inp, cont_span = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) - # # TODO: if label is also inp, what then? 
maybe check is batch_key == self.context_key - # batch['input_ids'].append(inp) - # else: - # batch[batch_key].append(data_pair[data_key]) - - - + # batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} + batch = self.defatul_batch + batch_mapping = { + "input_ids": 'context', + "continuation_indices": 'continuation', + "labels": 'context' + } for data_pair in data: - context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] - continuation_enc = data_pair['continuation']['input_ids'] + for batch_key, data_key in batch_mapping: + if data_key == self.context_key: + if self.tokenize_labels: + # NOTE: this will be run twice if more than one batch_key uses the input (like in LM task) + inp, cont_span = _make_padded_input(data_pair[self.context_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) + batch[batch_key].append(inp) + batch['continuation_indices'].append(cont_span) + else: + inp, _ = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) + batch[batch_key].append(inp) + else: + batch[batch_key].append(data_pair[data_key]) - inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, - self.pad_tok_id) - batch['input_ids'].append(inp) - batch['continuation_indices'].append(continuation_span) - batch['labels'].append(inp) + # for data_pair in data: + # context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] + # continuation_enc = data_pair['continuation']['input_ids'] + + # inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, + # self.pad_tok_id) + # batch['input_ids'].append(inp) + # batch['continuation_indices'].append(continuation_span) + # batch['labels'].append(inp) batch = self._convert_tokens_to_tensors(batch) return batch @@ -797,9 +802,7 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ def __init__(self, choices_key: str = 'choices', *args, **kwargs): - super().__init__(context_key='query', - *args, - **kwargs) + super().__init__(context_key='query', *args, **kwargs) self.num_choices = len(self.dataset[0][choices_key]) def _get_answer_from_example(self, example: Dict) -> str: @@ -1218,23 +1221,32 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ + # batch_mapping = { + # 'input_ids': self.context_key, + # 'labels': self.answer_key, + # 'tests': 'test', + # 'entry_points': 'entry_point', + # 'test_inputs': 'test_input', + # 'test_outputs': 'test_outputs', + # 'languages': 'language' + # } batch = { 'input_ids': [], 'mode': 'generate', 'labels': [], - 'prompts': [], - 'tests': [], - 'entry_points': [], - 'test_inputs': [], - 'test_outputs': [], - 'languages': [], + 'prompts': [], + 'tests': [], + 'entry_points': [], + 'test_inputs': [], + 'test_outputs': [], + 'languages': [], 'pass_at_k': self.pass_at_k, 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, # TODO: specify this? 
'num_beams': 1, # single beam - 'num_return_sequences': self.generations_per_sample, + 'num_return_sequences': self.generations_per_sample, 'do_sample': True, 'top_p': self.top_p, 'top_k': self.top_k, From 1f393c0678f32b229c1d86b5403a1ccab6a943f8 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 4 Dec 2023 19:08:46 +0000 Subject: [PATCH 062/116] linting and debugging statements to help me remember where I'm doing wip --- .../in_context_learning_evaluation.py | 525 ++++++++++-------- .../test_in_context_learning_datasets.py | 11 +- 2 files changed, 293 insertions(+), 243 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 31918d1a48..d53d6359fc 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -43,6 +43,25 @@ def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBas return len(tokenizer(' a', add_special_tokens=False)['input_ids']) == 1 +def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -> List: + if len(continuation_enc) + len(context_enc) > max_seq_len: + context_max_subseq_len = max_seq_len - len(continuation_enc) + + if context_max_subseq_len < 0: + # can't support continuations which are longer than the max seq len + raise Exception(f'Dataset included continuation longer than the max seq len') + + # TODO: is this true? + # clip from the end + context_enc = context_enc[-(context_max_subseq_len):] + return context_enc + + +def _get_continuation_span(context_enc: List, continuation_enc: List) -> list: + return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc))) + # return list(range(len(context_enc), len(context_enc) + len(continuation_enc))) + + def _make_padded_input(context_enc: List, continuation_enc: List, max_seq_len: int, @@ -66,24 +85,13 @@ def _make_padded_input(context_enc: List, """ - # TODO: Not obvious this happens here, should probably be it's own funciton - if len(continuation_enc) + len(context_enc) > max_seq_len: - # clip from the end - context_max_subseq_len = max_seq_len - len(continuation_enc) - - if context_max_subseq_len < 0: - raise Exception(f'Dataset included continuation longer than the max seq len') - # can't support continuations which are longer than the max seq len - - context_enc = context_enc[-(context_max_subseq_len):] - - continuation_span = torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc))) inp = torch.tensor( (context_enc + continuation_enc), dtype=torch.long, ) (inp_len,) = inp.shape + print(padding_side) # pad length from seq to padding_length if padding_side == 'right': inp = torch.cat( @@ -104,7 +112,7 @@ def _make_padded_input(context_enc: List, else: raise ValueError(f"Unknown padding_side {padding_side}. 
padding_side must be either 'left' or 'right'") - return inp, continuation_span + return inp def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> List[int]: @@ -182,6 +190,9 @@ def __init__( context_key: str = 'context', answer_key: str = 'answer', strip_dataset: bool = True, + padding_side: str = 'right', + default_batch: Dict = None, + batch_mapping: Dict = None, hf_loading_vars: Dict = None, hf_parsing_map: Dict = None, tokenize_labels: bool = True, @@ -195,13 +206,15 @@ def __init__( self.num_fewshot = num_fewshot # TODO: check this is correct for all dataset types # TODO: change how this is set, using default is unintuitive rn - self.padding_side = 'left' + self.padding_side = padding_side self.prelimiter = prelimiter self.example_delimiter = example_delimiter self.continuation_delimiter = continuation_delimiter self.context_key = context_key self.answer_key = answer_key self.tokenize_labels = tokenize_labels + self.batch_mapping = batch_mapping + self.default_batch = default_batch hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -373,12 +386,30 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) - preamble['input_ids'] = self._fix_eos_on_preamble(preamble['input_ids']) - tokenized_example['preamble'] = preamble + preamble = self._fix_eos_on_preamble(preamble['input_ids']) if self.strip_data: # rstrip context because a prompt ending in a space results in degenerate output ctxt = ctxt.rstrip() - tokenized_example['context'] = self.tokenizer(ctxt, add_special_tokens=False) + tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids'] + tokenized_context = preamble + tokenized_context + + if self.tokenize_labels: + tokenized_answer = self.tokenizer(self._get_answer_from_example(example))['input_ids'] + trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.max_seq_len) + continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer) + padded_context = _make_padded_input(trimmed_context, tokenized_answer, self.max_seq_len, self.pad_tok_id, + self.padding_side) + + tokenized_example[self.context_key] = padded_context + tokenized_example[self.answer_key] = tokenized_answer + tokenized_example['continuation_indices'] = continuation_indices + else: + trimmed_context = _trim_context(tokenized_context, [], self.max_seq_len) + padded_context = _make_padded_input(trimmed_context, [], self.max_seq_len, self.pad_tok_id, + self.padding_side) + + tokenized_example[self.context_key] = padded_context + tokenized_example[self.answer_key] = self._get_answer_from_example(example) return tokenized_example def _prep_example( @@ -411,10 +442,11 @@ def _prep_example( return tokenized_example def _convert_tokens_to_tensors(self, batch: Dict) -> Dict[str, Any]: - batch['input_ids'] = torch.stack(batch['input_ids']) + # zzzz HF converts ur torch tensors into lists so need to convert them back + batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids']))) if self.tokenize_labels: - batch['labels'] = torch.stack(batch['labels']) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels']))) + batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices'])) return batch def collate_fn(self, data: 
Dict[str, Any]) -> Dict[str, Any]: @@ -426,34 +458,18 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ - # TODO: move preamble + context to tokenize_example - # TODO: use self.answer_key in other classes - # TODO: use tokenize_labels in other classes - # TODO: extract input_ids in tokenize_example batch = self.default_batch - batch_mapping = { - "input_ids": 'context', - "continuation_indices": 'continuation', - "labels": 'context' - } for data_pair in data: - for batch_key, data_key in batch_mapping: - if data_key == self.context_key: - if self.tokenize_labels: - # NOTE: this will be run twice if more than one batch_key uses the input (like in LM task) - inp, cont_span = _make_padded_input(data_pair[self.context_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) - batch[batch_key].append(inp) - batch['continuation_indices'].append(cont_span) - else: - inp, _ = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) - batch[batch_key].append(inp) - else: - batch[batch_key].append(data_pair[data_key]) + for batch_key, data_key in self.batch_mapping.items(): + batch[batch_key].append(data_pair[data_key]) + if 'continuation_indices' in data_pair: + batch['continuation_indices'].append(data_pair['continuation_indices']) batch = self._convert_tokens_to_tensors(batch) + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + # import IPython; IPython.embed() return batch - def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: """ Handling for certain specialty columns that must be split into batches in different formats. @@ -542,7 +558,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): """ tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) answer = example['answers'][0] - tokenized_example['answer'] = self.tokenizer(answer, add_special_tokens=False) + tokenized_example['answer'] = self.tokenizer(answer, add_special_tokens=False)['input_ids'] return tokenized_example def collate_fn(self, data): @@ -556,8 +572,8 @@ def collate_fn(self, data): """ batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': [], 'answer_indices': []} for data_pair in data: - context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] - answer_enc = data_pair['answer']['input_ids'] + context_enc = data_pair['context'] + answer_enc = data_pair['answer'] inp, answer_span = _make_padded_input(context_enc, answer_enc, self.max_seq_len, self.pad_tok_id) batch['input_ids'].append(inp) @@ -565,6 +581,7 @@ def collate_fn(self, data): batch['labels'].append(inp) batch = self._convert_tokens_to_tensors(batch) + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -587,9 +604,29 @@ class InContextLearningQATaskDataset(InContextLearningDataset): def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False - super().__init__(tokenize_labels=False, *args, **kwargs) - - self.max_answer_length = self.get_max_answer_length() + super().__init__( + default_batch={ + 'input_ids': [], + 'mode': 'generate', + 'labels': [], + 'cot_delimiter': self.cot_delimiter, + 'generation_length': 0, + 'generation_kwargs': { + 'pad_token_id': 0, + 'use_cache': True + } + }, + batch_mapping={ + # TODO: self.context_key? 
+ 'input_ids': 'context', + 'labels': 'aliases', + }, + padding_side='left', + tokenize_labels=False, + *args, + **kwargs) + # self.max_answer_length = self.get_max_answer_length() + self.default_batch['generation_kwargs'] = self.pad_tok_id def _read_dataset( self, @@ -600,13 +637,17 @@ def _read_dataset( ): dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.has_cot = 'chain_of_thought' in dataset.features - return dataset.map( + dataset = dataset.map( lambda examples: { 'context': examples['context'], 'answer': examples['answer'], 'aliases': set([examples['answer']] + examples.get('aliases', [])), 'chain_of_thought': examples.get('chain_of_thought', ''), }) + max_answer_length = self._get_max_answer_length(dataset) + self.max_seq_len = self.max_seq_len - max_answer_length + self.default_batch['generation_length'] = max_answer_length + return dataset def _get_answer_from_example(self, example: Dict) -> str: """ @@ -637,7 +678,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - tokenized_example['aliases'] = list(example.get('aliases', [])) return tokenized_example - def get_max_answer_length(self) -> int: + def _get_max_answer_length(self, dataset) -> int: f""" Loops over the dataset and finds the longest answer length. @@ -645,7 +686,7 @@ def get_max_answer_length(self) -> int: int: the maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present """ max_answer_length = 0 - for example in self.dataset: + for example in dataset: all_answers = [example[self.answer_key]] + list(example.get('aliases', [])) for answer in all_answers: if self.has_cot: @@ -656,46 +697,33 @@ def get_max_answer_length(self) -> int: max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length - def collate_fn(self, data: Dict) -> Dict[str, Any]: - """ - The function that the dataloader uses to accumulate data into batches. - Args: - data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) - - Returns: - Dict: dictionary for a single batch - """ - # batch_mapping = { - # 'input_ids': self.context_key, - # 'labels': 'aliases', - # } - batch = { - 'input_ids': [], - 'mode': 'generate', - 'labels': [], - 'cot_delimiter': self.cot_delimiter, - 'generation_length': self.max_answer_length, - 'generation_kwargs': { - 'pad_token_id': self.pad_tok_id, - 'use_cache': True - }, - } - for example in data: - aliases = example['aliases'] - context_enc = example['preamble']['input_ids'] + example['context']['input_ids'] - inp, _ = _make_padded_input( - context_enc, - [], - self.max_seq_len - self.max_answer_length, - self.pad_tok_id, - padding_side=self.padding_side, - ) - - batch['input_ids'].append(inp) - batch['labels'].append(aliases) - - batch = self._convert_tokens_to_tensors(batch) - return batch + # def collate_fn(self, data: Dict) -> Dict[str, Any]: + # """ + # The function that the dataloader uses to accumulate data into batches. 
+ # Args: + # data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) + + # Returns: + # Dict: dictionary for a single batch + # """ + # batch = self.default_batch + # for example in data: + # aliases = example['aliases'] + # context_enc = example['preamble'] + example['context'] + # inp, _ = _make_padded_input( + # context_enc, + # [], + # self.max_seq_len - self.max_answer_length, + # self.pad_tok_id, + # padding_side=self.padding_side, + # ) + + # batch['input_ids'].append(inp) + # batch['labels'].append(aliases) + + # batch = self._convert_tokens_to_tensors(batch) + # batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + # return batch class InContextLearningLMTaskDataset(InContextLearningDataset): @@ -711,69 +739,46 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): """ def __init__(self, *args, **kwargs): - super().__init__(answer_key='continuation', *args, **kwargs) - self.default_batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} - - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: - """ - Runs text through the tokenizer and handles special cases. - Args: - prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (Dict): the example as a dictionary. + super().__init__( + answer_key='continuation', + default_batch={ + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [] + }, + batch_mapping={ + 'input_ids': 'context', + # "continuation_indices": 'continuation', + 'labels': 'context' + }, + padding_side='right', + *args, + **kwargs) - Returns: - Dict: dictionary with the tokenized data - """ - tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) - cont = example['continuation'] + def _get_answer_from_example(self, example: Dict[str, Any]) -> str: + cont = example[self.answer_key] if self.prefix_space and not cont.startswith(' '): cont = f' {cont}' - tokenized_example['continuation'] = self.tokenizer(cont, add_special_tokens=False) - return tokenized_example - - def collate_fn(self, data: Dict[str, any]) -> Dict[str, Any]: - """ - Accumulate examples into batches - Args: - data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) - - Returns: - Dict: dictionary for a single batch - """ - # batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': []} - batch = self.defatul_batch - batch_mapping = { - "input_ids": 'context', - "continuation_indices": 'continuation', - "labels": 'context' - } - for data_pair in data: - for batch_key, data_key in batch_mapping: - if data_key == self.context_key: - if self.tokenize_labels: - # NOTE: this will be run twice if more than one batch_key uses the input (like in LM task) - inp, cont_span = _make_padded_input(data_pair[self.context_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) - batch[batch_key].append(inp) - batch['continuation_indices'].append(cont_span) - else: - inp, _ = _make_padded_input(data_pair[data_key], data_pair[self.answer_key], self.max_seq_len, self.pad_tok_id) - batch[batch_key].append(inp) - else: - batch[batch_key].append(data_pair[data_key]) - - # for data_pair in data: - # context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] - # continuation_enc = data_pair['continuation']['input_ids'] - - # inp, 
continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, - # self.pad_tok_id) - # batch['input_ids'].append(inp) - # batch['continuation_indices'].append(continuation_span) - # batch['labels'].append(inp) - - batch = self._convert_tokens_to_tensors(batch) - return batch + return cont + + # def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: + # """ + # Runs text through the tokenizer and handles special cases. + # Args: + # prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context + # ctx (str): the specific example's derrived context + # example (Dict): the example as a dictionary. + + # Returns: + # Dict: dictionary with the tokenized data + # """ + # tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) + # cont = example['continuation'] + # if self.prefix_space and not cont.startswith(' '): + # cont = f' {cont}' + # tokenized_example['continuation'] = self.tokenizer(cont, add_special_tokens=False)['input_ids'] + # return tokenized_example class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): @@ -802,7 +807,7 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ def __init__(self, choices_key: str = 'choices', *args, **kwargs): - super().__init__(context_key='query', *args, **kwargs) + super().__init__(context_key='query', padding_side='right', *args, **kwargs) self.num_choices = len(self.dataset[0][choices_key]) def _get_answer_from_example(self, example: Dict) -> str: @@ -833,7 +838,9 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - choices = example['choices'] if self.prefix_space: choices = [(f' {choice}' if not choice.startswith(' ') else choice) for choice in choices] - tokenized_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices] + tokenized_example['choices'] = [ + self.tokenizer(choice, add_special_tokens=False)['input_ids'] for choice in choices + ] tokenized_example['gold'] = example['gold'] return tokenized_example @@ -858,8 +865,8 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: choice_start_idx = len(batch['continuation_indices']) for choice in data_pair['choices']: - context_enc = data_pair['preamble']['input_ids'] + data_pair['context']['input_ids'] - continuation_enc = choice['input_ids'] + context_enc = data_pair['preamble'] + data_pair['context'] + continuation_enc = choice inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, self.pad_tok_id) @@ -879,6 +886,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. 
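        # A rough downstream sketch (helper name and values are illustrative, not part of this file):
        # each (start, end) group of rows is scored on its own, the row with the lowest per-token loss
        # is the predicted choice, and it is compared against the gold index for that question.
        #
        #     import torch
        #
        #     def pick_answers(per_row_loss, choice_groupings, gold_indices):
        #         correct = 0
        #         for (start, end), gold in zip(choice_groupings, gold_indices):
        #             predicted = int(torch.argmin(per_row_loss[start:end]))
        #             correct += int(predicted == gold)
        #         return correct / len(gold_indices)
        #
        #     # e.g. two questions with num_choices == 4 give 8 batch rows and
        #     # choice_groupings == [(0, 4), (4, 8)]; with per-row losses
        #     # [2.1, 1.3, 3.0, 2.7, 0.9, 1.8, 2.2, 2.5] and gold_indices == [1, 0],
        #     # pick_answers(...) -> 1.0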
batch = self._convert_tokens_to_tensors(batch) + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch def get_num_samples_in_batch(self, batch) -> int: @@ -951,6 +959,7 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData """ def __init__(self, choices_key='context_options', *args, **kwargs): + # padding_side = left super().__init__(choices_key=choices_key, *args, **kwargs) def _construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str: @@ -1038,13 +1047,15 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) - preamble['input_ids'] = self._fix_eos_on_preamble(preamble['input_ids']) + preamble = self._fix_eos_on_preamble(preamble['input_ids']) tokenized_example['preamble'] = preamble - tokenized_example['context_options'] = [self.tokenizer(c, add_special_tokens=False) for c in context_options] + tokenized_example['context_options'] = [ + self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options + ] continuation = example['continuation'] if self.prefix_space: continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation) - tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False) + tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False)['input_ids'] tokenized_example['gold'] = example['gold'] return tokenized_example @@ -1065,13 +1076,22 @@ def collate_fn(self, data) -> Dict[str, Any]: 'gold_indices': [], 'choice_groupings': [], } + # batch_map = { + # "gold_indices": "gold", + # "input_ids": "context", + # "labels": "context" + # } + # for data_pair in data: + # continuation_start_idx = len(batch['continuation_indices']) + # for context in context_options[self.choices_key]: + for data_pair in data: continuation_start_idx = len(batch['continuation_indices']) context_options = data_pair['context_options'] for context in context_options: - context_enc = data_pair['preamble']['input_ids'] + context['input_ids'] - continuation_enc = data_pair['continuation']['input_ids'] + context_enc = data_pair['preamble'] + context + continuation_enc = data_pair['continuation'] inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, self.pad_tok_id) @@ -1091,6 +1111,7 @@ def collate_fn(self, data) -> Dict[str, Any]: # which contiguous sequences of elements in the batch correspond to which question # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. batch = self._convert_tokens_to_tensors(batch) + batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -1149,32 +1170,62 @@ def __init__( *args, **kwargs, ): - self.check_defaults_are_set({ - 'pass_at_k': pass_at_k, - 'generations_per_sample': generations_per_sample, - 'top_p': top_p, - 'top_k': top_k, - 'temperature': temperature - }) - if generations_per_sample < pass_at_k: - raise ValueError( - f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' 
- ) - + # self.check_defaults_are_set({ + # 'pass_at_k': pass_at_k, + # 'generations_per_sample': generations_per_sample, + # 'top_p': top_p, + # 'top_k': top_k, + # 'temperature': temperature + # }) + # if generations_per_sample < pass_at_k: + # raise ValueError( + # f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' + # ) + batch_mapping = { + 'input_ids': 'prompt', + 'prompts': 'prompt_text', + 'tests': 'test', + 'labels': 'canonical_solution', + 'entry_points': 'entry_point', + 'test_inputs': 'test_inputs', + 'test_outputs': 'test_outputs', + 'languages': 'language' + } super().__init__( context_key='prompt', answer_key='canonical_solution', strip_dataset=False, tokenize_labels=False, + padding_side='left', + batch_mapping=batch_mapping, *args, **kwargs, ) - self.pass_at_k = pass_at_k - self.generations_per_sample = generations_per_sample self.max_prompt_length = self.get_max_prompt_length() - self.top_p = top_p - self.top_k = top_k - self.temperature = temperature + self.default_batch = { + 'input_ids': [], + 'mode': 'generate', + 'labels': [], + 'prompts': [], + 'tests': [], + 'entry_points': [], + 'test_inputs': [], + 'test_outputs': [], + 'languages': [], + 'pass_at_k': pass_at_k, + 'generation_length': self.max_seq_len - self.max_prompt_length, + 'generation_kwargs': { + 'pad_token_id': self.pad_tok_id, + # TODO: specify this? + 'num_beams': 1, # single beam + 'num_return_sequences': generations_per_sample, + 'do_sample': True, + 'top_p': top_p, + 'top_k': top_k, + 'temperature': temperature, + 'use_cache': True + }, + } def get_max_prompt_length(self) -> int: """ @@ -1186,7 +1237,7 @@ def get_max_prompt_length(self) -> int: for example in self.dataset: max_prompt_length = max( max_prompt_length, - len(example['preamble']['input_ids'] + example['context']['input_ids']), + len(example[self.context_key]), ) return max_prompt_length @@ -1212,69 +1263,61 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - tokenized_example['language'] = example['language'] return tokenized_example - def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: - """ - The function that the dataloader uses to accumulate data into batches. - Args: - data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) - - Returns: - Dict: dictionary for a single batch - """ - # batch_mapping = { - # 'input_ids': self.context_key, - # 'labels': self.answer_key, - # 'tests': 'test', - # 'entry_points': 'entry_point', - # 'test_inputs': 'test_input', - # 'test_outputs': 'test_outputs', - # 'languages': 'language' - # } - batch = { - 'input_ids': [], - 'mode': 'generate', - 'labels': [], - 'prompts': [], - 'tests': [], - 'entry_points': [], - 'test_inputs': [], - 'test_outputs': [], - 'languages': [], - 'pass_at_k': self.pass_at_k, - 'generation_length': self.max_seq_len - self.max_prompt_length, - 'generation_kwargs': { - 'pad_token_id': self.pad_tok_id, - # TODO: specify this? 
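                # Descriptive note on the settings assembled below (a reading of this config, not an
                # exhaustive spec): 'num_return_sequences' asks generate() for generations_per_sample
                # candidate completions per prompt, 'do_sample' with 'top_p' / 'top_k' enables
                # nucleus / top-k sampling rather than greedy decoding, and pass@k is only meaningful
                # when generations_per_sample >= pass_at_k.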
- 'num_beams': 1, # single beam - 'num_return_sequences': self.generations_per_sample, - 'do_sample': True, - 'top_p': self.top_p, - 'top_k': self.top_k, - 'temperature': self.temperature, - 'use_cache': True - }, - } - for example in data: - context_enc = example['preamble']['input_ids'] + example['context']['input_ids'] - inp, _ = _make_padded_input( - context_enc, - [], - self.max_prompt_length, - self.pad_tok_id, - padding_side=self.padding_side, - ) - - batch['input_ids'].append(inp) - batch['prompts'].append(example['prompt_text']) - batch['tests'].append(example['test']) - batch['labels'].append(example['canonical_solution']) - batch['entry_points'].append(example['entry_point']) - batch['test_inputs'].append(example['test_inputs']) - batch['test_outputs'].append(example['test_outputs']) - batch['languages'].append(example['language']) - - batch = self._convert_tokens_to_tensors(batch) - return batch + # def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: + # """ + # The function that the dataloader uses to accumulate data into batches. + # Args: + # data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) + + # Returns: + # Dict: dictionary for a single batch + # """ + # batch = { + # 'input_ids': [], + # 'mode': 'generate', + # 'labels': [], + # 'prompts': [], + # 'tests': [], + # 'entry_points': [], + # 'test_inputs': [], + # 'test_outputs': [], + # 'languages': [], + # 'pass_at_k': self.pass_at_k, + # 'generation_length': self.max_seq_len - self.max_prompt_length, + # 'generation_kwargs': { + # 'pad_token_id': self.pad_tok_id, + # # TODO: specify this? + # 'num_beams': 1, # single beam + # 'num_return_sequences': self.generations_per_sample, + # 'do_sample': True, + # 'top_p': self.top_p, + # 'top_k': self.top_k, + # 'temperature': self.temperature, + # 'use_cache': True + # }, + # } + # for example in data: + # # context_enc = example[self.context_key] + # # inp, _ = _make_padded_input( + # # context_enc, + # # [], + # # self.max_prompt_length, + # # self.pad_tok_id, + # # padding_side=self.padding_side, + # # ) + + # batch['input_ids'].append(example[self.context_key]) + # batch['prompts'].append(example['prompt_text']) + # batch['tests'].append(example['test']) + # batch['labels'].append(example['canonical_solution']) + # batch['entry_points'].append(example['entry_point']) + # batch['test_inputs'].append(example['test_inputs']) + # batch['test_outputs'].append(example['test_outputs']) + # batch['languages'].append(example['language']) + + # batch = self._convert_tokens_to_tensors(batch) + # batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) + # return batch def build_icl_dataloader( diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index f4834ee871..e1f70332ad 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -549,7 +549,11 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) decoded_batch = tokenizer.batch_decode(batch['input_ids']) - assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) + try: + assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) + except: + import IPython; IPython.embed() + assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) if len(prompt_string) > 0: @@ -937,7 +941,10 @@ def 
test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st assert batch['mode'] == 'generate' # the maximum generation length from the small test data assert batch['generation_length'] == seqlen - max_prompt_length - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left + try: + assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left + except: + import IPython; IPython.embed() decoded_batch = tokenizer.batch_decode(batch['input_ids']) assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) From d0e79574d0673a6d5904ff79037c1d4e8993f8d2 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 5 Dec 2023 07:25:02 +0000 Subject: [PATCH 063/116] all tests except one sus schema test passing --- .../in_context_learning_evaluation.py | 422 +++++++----------- .../test_in_context_learning_datasets.py | 23 +- 2 files changed, 165 insertions(+), 280 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index d53d6359fc..db337aa3a5 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -4,6 +4,7 @@ from __future__ import annotations +import copy import json import os import random @@ -51,7 +52,6 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) - # can't support continuations which are longer than the max seq len raise Exception(f'Dataset included continuation longer than the max seq len') - # TODO: is this true? # clip from the end context_enc = context_enc[-(context_max_subseq_len):] return context_enc @@ -91,7 +91,6 @@ def _make_padded_input(context_enc: List, ) (inp_len,) = inp.shape - print(padding_side) # pad length from seq to padding_length if padding_side == 'right': inp = torch.cat( @@ -191,6 +190,7 @@ def __init__( answer_key: str = 'answer', strip_dataset: bool = True, padding_side: str = 'right', + padding_size: int = None, default_batch: Dict = None, batch_mapping: Dict = None, hf_loading_vars: Dict = None, @@ -205,8 +205,8 @@ def __init__( self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot # TODO: check this is correct for all dataset types - # TODO: change how this is set, using default is unintuitive rn self.padding_side = padding_side + self.padding_size = padding_size if padding_size else self.max_seq_len self.prelimiter = prelimiter self.example_delimiter = example_delimiter self.continuation_delimiter = continuation_delimiter @@ -342,10 +342,10 @@ def _construct_context(self, example: Dict, preceding_text: str = '', add_answer ctxt = f'{self.example_delimiter}{ctxt}' ctxt = f'{ctxt}{self.continuation_delimiter}' if add_answer: - ctxt = f'{ctxt}{self._get_answer_from_example(example)}' + ctxt = f'{ctxt}{self._get_answer_from_example(example, in_context=add_answer)}' return ctxt - def _get_answer_from_example(self, example: Dict[str, Any]) -> str: + def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> str: """ Returns the answer from the example Args: @@ -385,27 +385,31 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - Dict: dictionary with the tokenized data """ tokenized_example = {} + # Always add special tokens to preamble preamble = self.tokenizer(prompt_and_fewshot) preamble = self._fix_eos_on_preamble(preamble['input_ids']) if self.strip_data: # rstrip context because a prompt 
ending in a space results in degenerate output ctxt = ctxt.rstrip() + # Never add special tokens to context tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids'] tokenized_context = preamble + tokenized_context if self.tokenize_labels: - tokenized_answer = self.tokenizer(self._get_answer_from_example(example))['input_ids'] - trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.max_seq_len) + # Never add special tokens to answer + tokenized_answer = self.tokenizer(self._get_answer_from_example(example), + add_special_tokens=False)['input_ids'] + trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size) continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer) - padded_context = _make_padded_input(trimmed_context, tokenized_answer, self.max_seq_len, self.pad_tok_id, + padded_context = _make_padded_input(trimmed_context, tokenized_answer, self.padding_size, self.pad_tok_id, self.padding_side) tokenized_example[self.context_key] = padded_context tokenized_example[self.answer_key] = tokenized_answer tokenized_example['continuation_indices'] = continuation_indices else: - trimmed_context = _trim_context(tokenized_context, [], self.max_seq_len) - padded_context = _make_padded_input(trimmed_context, [], self.max_seq_len, self.pad_tok_id, + trimmed_context = _trim_context(tokenized_context, [], self.padding_size) + padded_context = _make_padded_input(trimmed_context, [], self.padding_size, self.pad_tok_id, self.padding_side) tokenized_example[self.context_key] = padded_context @@ -458,7 +462,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ - batch = self.default_batch + batch = copy.deepcopy(self.default_batch) for data_pair in data: for batch_key, data_key in self.batch_mapping.items(): batch[batch_key].append(data_pair[data_key]) @@ -467,7 +471,6 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: batch = self._convert_tokens_to_tensors(batch) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) - # import IPython; IPython.embed() return batch def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: @@ -604,29 +607,23 @@ class InContextLearningQATaskDataset(InContextLearningDataset): def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False - super().__init__( - default_batch={ - 'input_ids': [], - 'mode': 'generate', - 'labels': [], - 'cot_delimiter': self.cot_delimiter, - 'generation_length': 0, - 'generation_kwargs': { - 'pad_token_id': 0, - 'use_cache': True - } - }, - batch_mapping={ - # TODO: self.context_key? 
- 'input_ids': 'context', - 'labels': 'aliases', - }, - padding_side='left', - tokenize_labels=False, - *args, - **kwargs) - # self.max_answer_length = self.get_max_answer_length() - self.default_batch['generation_kwargs'] = self.pad_tok_id + super().__init__(padding_side='left', tokenize_labels=False, *args, **kwargs) + # NOTE: set these after init call bcus they take class vars + self.default_batch = { + 'input_ids': [], + 'mode': 'generate', + 'labels': [], + 'cot_delimiter': self.cot_delimiter, + 'generation_length': self.max_answer_length, + 'generation_kwargs': { + 'pad_token_id': self.pad_tok_id, + 'use_cache': True + } + } + self.batch_mapping = { + 'input_ids': self.context_key, + 'labels': 'aliases', + } def _read_dataset( self, @@ -644,12 +641,12 @@ def _read_dataset( 'aliases': set([examples['answer']] + examples.get('aliases', [])), 'chain_of_thought': examples.get('chain_of_thought', ''), }) - max_answer_length = self._get_max_answer_length(dataset) - self.max_seq_len = self.max_seq_len - max_answer_length - self.default_batch['generation_length'] = max_answer_length + self.max_answer_length = self._get_max_answer_length(dataset) + # NOTE: This is the only time we use the class variable padding_size. + self.padding_size = self.max_seq_len - self.max_answer_length return dataset - def _get_answer_from_example(self, example: Dict) -> str: + def _get_answer_from_example(self, example: Dict, in_context=False) -> str: """ Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true. Args: @@ -697,34 +694,6 @@ def _get_max_answer_length(self, dataset) -> int: max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length - # def collate_fn(self, data: Dict) -> Dict[str, Any]: - # """ - # The function that the dataloader uses to accumulate data into batches. 
- # Args: - # data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) - - # Returns: - # Dict: dictionary for a single batch - # """ - # batch = self.default_batch - # for example in data: - # aliases = example['aliases'] - # context_enc = example['preamble'] + example['context'] - # inp, _ = _make_padded_input( - # context_enc, - # [], - # self.max_seq_len - self.max_answer_length, - # self.pad_tok_id, - # padding_side=self.padding_side, - # ) - - # batch['input_ids'].append(inp) - # batch['labels'].append(aliases) - - # batch = self._convert_tokens_to_tensors(batch) - # batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) - # return batch - class InContextLearningLMTaskDataset(InContextLearningDataset): """ @@ -739,47 +708,27 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): """ def __init__(self, *args, **kwargs): - super().__init__( - answer_key='continuation', - default_batch={ - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [] - }, - batch_mapping={ - 'input_ids': 'context', - # "continuation_indices": 'continuation', - 'labels': 'context' - }, - padding_side='right', - *args, - **kwargs) - - def _get_answer_from_example(self, example: Dict[str, Any]) -> str: + super().__init__(answer_key='continuation', + default_batch={ + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [] + }, + batch_mapping={ + 'input_ids': 'context', + 'labels': 'context' + }, + padding_side='right', + *args, + **kwargs) + + def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> str: cont = example[self.answer_key] - if self.prefix_space and not cont.startswith(' '): + if self.prefix_space and not cont.startswith(' ') and not in_context: cont = f' {cont}' return cont - # def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: - # """ - # Runs text through the tokenizer and handles special cases. - # Args: - # prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - # ctx (str): the specific example's derrived context - # example (Dict): the example as a dictionary. 
- - # Returns: - # Dict: dictionary with the tokenized data - # """ - # tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) - # cont = example['continuation'] - # if self.prefix_space and not cont.startswith(' '): - # cont = f' {cont}' - # tokenized_example['continuation'] = self.tokenizer(cont, add_special_tokens=False)['input_ids'] - # return tokenized_example - class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ @@ -807,10 +756,22 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ def __init__(self, choices_key: str = 'choices', *args, **kwargs): - super().__init__(context_key='query', padding_side='right', *args, **kwargs) - self.num_choices = len(self.dataset[0][choices_key]) + self.choices_key = choices_key + default_batch = { + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [], + 'gold_indices': [], + 'choice_groupings': [], + } + context_key = kwargs.pop('context_key', 'query') + super().__init__(context_key=context_key, default_batch=default_batch, padding_side='right', *args, **kwargs) + self.num_choices = len(self.dataset[0][self.choices_key]) + self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'} + self.batch_map_per_example = {'gold_indices': 'gold'} - def _get_answer_from_example(self, example: Dict) -> str: + def _get_answer_from_example(self, example: Dict, in_context=False) -> str: """ Returns the correct answer from the example's choices. Args: @@ -819,7 +780,7 @@ def _get_answer_from_example(self, example: Dict) -> str: Returns: str: the full string of the correct answer based on the 'gold' key """ - choices = example['choices'] + choices = example[self.choices_key] gold_idx = example['gold'] return choices[gold_idx] @@ -834,13 +795,37 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - Returns: Dict: dictionary with the tokenized data """ - tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) - choices = example['choices'] - if self.prefix_space: - choices = [(f' {choice}' if not choice.startswith(' ') else choice) for choice in choices] - tokenized_example['choices'] = [ - self.tokenizer(choice, add_special_tokens=False)['input_ids'] for choice in choices - ] + # NOTE: some of this is repeated from super class but for loop makes things considerably different + tokenized_example = {} + # Always add special tokens to preamble + preamble = self.tokenizer(prompt_and_fewshot) + preamble = self._fix_eos_on_preamble(preamble['input_ids']) + if self.strip_data: + # rstrip context because a prompt ending in a space results in degenerate output + ctxt = ctxt.rstrip() + # Never add special tokens to context + tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids'] + tokenized_context = preamble + tokenized_context + + tokenized_example[self.context_key] = [] + tokenized_example[self.answer_key] = [] + tokenized_example['continuation_indices'] = [] + # NOTE: Treating tokenize_labels as True for all MC datasets (required for our accuracy anyway) + for choice in example[self.choices_key]: + if self.prefix_space: + choice = f' {choice}' if not choice.startswith(' ') else choice + + # Never add special tokens to answer + tokenized_answer = self.tokenizer(choice, add_special_tokens=False)['input_ids'] + trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size) + continuation_indices = _get_continuation_span(trimmed_context, 
tokenized_answer) + padded_context = _make_padded_input(trimmed_context, tokenized_answer, self.padding_size, self.pad_tok_id, + self.padding_side) + + tokenized_example[self.context_key].append(padded_context) + tokenized_example[self.answer_key].append(tokenized_answer) + tokenized_example['continuation_indices'].append(continuation_indices) + tokenized_example['gold'] = example['gold'] return tokenized_example @@ -853,26 +838,14 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ - batch = { - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [], - 'gold_indices': [], - 'choice_groupings': [], - } + batch = copy.deepcopy(self.default_batch) for data_pair in data: choice_start_idx = len(batch['continuation_indices']) - - for choice in data_pair['choices']: - context_enc = data_pair['preamble'] + data_pair['context'] - continuation_enc = choice - inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, - self.pad_tok_id) - - batch['input_ids'].append(inp) - batch['continuation_indices'].append(continuation_span) - batch['labels'].append(inp) + # TODO: use batch_mappings? Could be fine as is + for i, context_enc in enumerate(data_pair[self.context_key]): + batch['input_ids'].append(context_enc) + batch['continuation_indices'].append(data_pair['continuation_indices'][i]) + batch['labels'].append(context_enc) batch['gold_indices'].append(data_pair['gold']) choice_end_idx = len(batch['continuation_indices']) @@ -959,8 +932,15 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData """ def __init__(self, choices_key='context_options', *args, **kwargs): - # padding_side = left - super().__init__(choices_key=choices_key, *args, **kwargs) + super().__init__(choices_key=choices_key, context_key=choices_key, *args, **kwargs) + self.default_batch = { + 'input_ids': [], + 'continuation_indices': [], + 'mode': 'icl_task', + 'labels': [], + 'gold_indices': [], + 'choice_groupings': [], + } def _construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str: """ @@ -975,7 +955,7 @@ def _construct_context(self, example, preceding_text: str = '', add_answer: bool str: the single correct context for a given continuation """ - context_options = example['context_options'] + context_options = example[self.choices_key] gold_idx = example['gold'] continuation = example['continuation'] context = context_options[gold_idx] @@ -996,7 +976,7 @@ def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') Returns: list: all context options for the selected example with formatting """ - context_options = example['context_options'] + context_options = example[self.choices_key] if len(preceding_text) > 0: if self.strip_data: cont_del = self.continuation_delimiter.rstrip() @@ -1048,72 +1028,27 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self._fix_eos_on_preamble(preamble['input_ids']) - tokenized_example['preamble'] = preamble - tokenized_example['context_options'] = [ - self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options - ] + encoded_contexts = [self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options] continuation = example['continuation'] if self.prefix_space: continuation = (f' {continuation}' if not continuation.startswith(' ') else 
continuation) - tokenized_example['continuation'] = self.tokenizer(continuation, add_special_tokens=False)['input_ids'] + tokenized_continuation = self.tokenizer(continuation, add_special_tokens=False)['input_ids'] + + tokenized_example[self.context_key] = [] + tokenized_example['continuation_indices'] = [] + tokenized_example[self.answer_key] = [] + for context in encoded_contexts: + trimmed_context = _trim_context(context, tokenized_continuation, self.padding_size) + continuation_indices = _get_continuation_span(trimmed_context, tokenized_continuation) + padded_context = _make_padded_input(trimmed_context, tokenized_continuation, self.padding_size, + self.pad_tok_id, self.padding_side) + tokenized_example[self.context_key].append(padded_context) + tokenized_example['continuation_indices'].append(continuation_indices) + tokenized_example[self.answer_key].append(tokenized_continuation) + tokenized_example['gold'] = example['gold'] return tokenized_example - def collate_fn(self, data) -> Dict[str, Any]: - """ - The function that the dataloader uses to accumulate data into batches. - Args: - data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) - - Returns: - Dict: dictionary for a single batch - """ - batch = { - 'input_ids': [], - 'continuation_indices': [], - 'mode': 'icl_task', - 'labels': [], - 'gold_indices': [], - 'choice_groupings': [], - } - # batch_map = { - # "gold_indices": "gold", - # "input_ids": "context", - # "labels": "context" - # } - # for data_pair in data: - # continuation_start_idx = len(batch['continuation_indices']) - # for context in context_options[self.choices_key]: - - for data_pair in data: - continuation_start_idx = len(batch['continuation_indices']) - context_options = data_pair['context_options'] - - for context in context_options: - context_enc = data_pair['preamble'] + context - continuation_enc = data_pair['continuation'] - inp, continuation_span = _make_padded_input(context_enc, continuation_enc, self.max_seq_len, - self.pad_tok_id) - - batch['input_ids'].append(inp) - batch['labels'].append(inp) - batch['continuation_indices'].append(continuation_span) - - batch['gold_indices'].append(data_pair['gold']) - continuation_end_idx = len(batch['continuation_indices']) - batch['choice_groupings'].append((continuation_start_idx, continuation_end_idx)) - - # We run each distinct query + answer choice through the model separately and determine which - # answer has the lowest per-token-perplexity. - # - # If each question has N possible choices, all N must be grouped together as distinct elements of the batch - # since the batch may consist of multiple questions, the choice_groupings indicates - # which contiguous sequences of elements in the batch correspond to which question - # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. - batch = self._convert_tokens_to_tensors(batch) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) - return batch - class InContextLearningCodeEvalDataset(InContextLearningDataset): """ @@ -1170,17 +1105,10 @@ def __init__( *args, **kwargs, ): - # self.check_defaults_are_set({ - # 'pass_at_k': pass_at_k, - # 'generations_per_sample': generations_per_sample, - # 'top_p': top_p, - # 'top_k': top_k, - # 'temperature': temperature - # }) - # if generations_per_sample < pass_at_k: - # raise ValueError( - # f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' 
- # ) + if generations_per_sample < pass_at_k: + raise ValueError( + f'generations_per_sample ({generations_per_sample}) must be greater than or equal to pass_at_k ({pass_at_k}) for code evaluation.' + ) batch_mapping = { 'input_ids': 'prompt', 'prompts': 'prompt_text', @@ -1191,6 +1119,7 @@ def __init__( 'test_outputs': 'test_outputs', 'languages': 'language' } + self.max_prompt_length = 0 super().__init__( context_key='prompt', answer_key='canonical_solution', @@ -1201,7 +1130,7 @@ def __init__( *args, **kwargs, ) - self.max_prompt_length = self.get_max_prompt_length() + self.dataset = self.adjust_padding() self.default_batch = { 'input_ids': [], 'mode': 'generate', @@ -1216,7 +1145,6 @@ def __init__( 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, - # TODO: specify this? 'num_beams': 1, # single beam 'num_return_sequences': generations_per_sample, 'do_sample': True, @@ -1235,12 +1163,28 @@ def get_max_prompt_length(self) -> int: """ max_prompt_length = 0 for example in self.dataset: + # Will this elimante tokens we want to keep? + unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id] max_prompt_length = max( max_prompt_length, - len(example[self.context_key]), + len(unpadded_example), ) return max_prompt_length + def adjust_padding(self): + self.max_prompt_length = self.get_max_prompt_length() + + def _trim_padding(example): + full_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] + full_prompt = _trim_context(full_prompt, [], self.max_prompt_length) + padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, + self.padding_side) + + example[self.context_key] = padded_context + return example + + return self.dataset.map(_trim_padding) + def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handles special cases. @@ -1263,62 +1207,6 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - tokenized_example['language'] = example['language'] return tokenized_example - # def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: - # """ - # The function that the dataloader uses to accumulate data into batches. - # Args: - # data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) - - # Returns: - # Dict: dictionary for a single batch - # """ - # batch = { - # 'input_ids': [], - # 'mode': 'generate', - # 'labels': [], - # 'prompts': [], - # 'tests': [], - # 'entry_points': [], - # 'test_inputs': [], - # 'test_outputs': [], - # 'languages': [], - # 'pass_at_k': self.pass_at_k, - # 'generation_length': self.max_seq_len - self.max_prompt_length, - # 'generation_kwargs': { - # 'pad_token_id': self.pad_tok_id, - # # TODO: specify this? 
- # 'num_beams': 1, # single beam - # 'num_return_sequences': self.generations_per_sample, - # 'do_sample': True, - # 'top_p': self.top_p, - # 'top_k': self.top_k, - # 'temperature': self.temperature, - # 'use_cache': True - # }, - # } - # for example in data: - # # context_enc = example[self.context_key] - # # inp, _ = _make_padded_input( - # # context_enc, - # # [], - # # self.max_prompt_length, - # # self.pad_tok_id, - # # padding_side=self.padding_side, - # # ) - - # batch['input_ids'].append(example[self.context_key]) - # batch['prompts'].append(example['prompt_text']) - # batch['tests'].append(example['test']) - # batch['labels'].append(example['canonical_solution']) - # batch['entry_points'].append(example['entry_point']) - # batch['test_inputs'].append(example['test_inputs']) - # batch['test_outputs'].append(example['test_outputs']) - # batch['languages'].append(example['language']) - - # batch = self._convert_tokens_to_tensors(batch) - # batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) - # return batch - def build_icl_dataloader( icl_task_type: str, diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index e1f70332ad..fe4b53fd93 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -15,9 +15,9 @@ from composer import Evaluator from composer.core import DataSpec -from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, +from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, _get_continuation_span, _get_fewshot_sample_idxs, _make_padded_input, - get_icl_task_dataloader) + _trim_context, get_icl_task_dataloader) from composer.loggers import InMemoryLogger from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) @@ -67,9 +67,13 @@ def test_fewshot_sample_idxs_randomness(): def test_batch_padding_logic(tiny_gpt2_tokenizer): + # def test_get_continuation_span(tiny_gpt2_tokenizer): continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids'] context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] - _, continuation_spans = _make_padded_input(context, continuation, 2048, tiny_gpt2_tokenizer.eos_token_id) + trimmed_context = _trim_context(context, continuation, 2048) + continuation_spans = _get_continuation_span(trimmed_context, continuation) + # TODO is this correct? 
Add more tests + # padded_input = _get_continuation_span(trimmed_context, continuation) # the context (of len 2000) gets clipped to len 48 so that the whole continuation can fit assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047 @@ -82,7 +86,7 @@ def test_make_padding(tiny_gpt2_tokenizer, padding_side): error_context = contextlib.nullcontext() if padding_side in {'left', 'right'} else pytest.raises(ValueError) with error_context: - input_ids, _ = _make_padded_input(context, [], 2048, padding_id, padding_side=padding_side) + input_ids = _make_padded_input(context, [], 2048, padding_id, padding_side=padding_side) if padding_side == 'left': assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id @@ -549,11 +553,7 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) decoded_batch = tokenizer.batch_decode(batch['input_ids']) - try: - assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) - except: - import IPython; IPython.embed() - + assert all(item.count('Q: ') == num_fewshot + 1 for item in decoded_batch) assert all(item.count('\nA:') == num_fewshot + 1 for item in decoded_batch) if len(prompt_string) > 0: @@ -941,10 +941,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st assert batch['mode'] == 'generate' # the maximum generation length from the small test data assert batch['generation_length'] == seqlen - max_prompt_length - try: - assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left - except: - import IPython; IPython.embed() + assert any(item[0] != tokenizer.eos_token_id for item in batch['input_ids']) # longest should be pushed left decoded_batch = tokenizer.batch_decode(batch['input_ids']) assert all(item.count('Code start: \n') == num_fewshot + 1 for item in decoded_batch) From 5caffbf59fde296b26c0927dbf471432a26bf436 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 5 Dec 2023 19:55:01 +0000 Subject: [PATCH 064/116] fix missing fewshot for schema --- composer/datasets/in_context_learning_evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index db337aa3a5..a17b4880f9 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -325,7 +325,7 @@ def _generate_few_shot_prompt( def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ Takes an example and constructs a context, ie the input the model reads for this example. 
- Optionally adds the correct answer (for fewshot examples) and handles example delemiters + Optionally adds the correct answer (for fewshot examples) and handles example delimiters Args: example (Dict): the example from which to construct the context @@ -414,6 +414,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - tokenized_example[self.context_key] = padded_context tokenized_example[self.answer_key] = self._get_answer_from_example(example) + return tokenized_example def _prep_example( @@ -1028,7 +1029,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self._fix_eos_on_preamble(preamble['input_ids']) - encoded_contexts = [self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options] + encoded_contexts = [preamble + self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options] continuation = example['continuation'] if self.prefix_space: continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation) From bf53ec11502a63a44cec88f8306774aff041f089 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 5 Dec 2023 23:26:45 +0000 Subject: [PATCH 065/116] rm temperature add generation_kwargs --- .../in_context_learning_evaluation.py | 58 ++++++++++--------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index a17b4880f9..efa230297a 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -196,6 +196,7 @@ def __init__( hf_loading_vars: Dict = None, hf_parsing_map: Dict = None, tokenize_labels: bool = True, + generation_kwargs: Dict = None, ): self.tokenizer = tokenizer @@ -204,7 +205,6 @@ def __init__( self.max_seq_len = max_seq_len self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot - # TODO: check this is correct for all dataset types self.padding_side = padding_side self.padding_size = padding_size if padding_size else self.max_seq_len self.prelimiter = prelimiter @@ -215,6 +215,7 @@ def __init__( self.tokenize_labels = tokenize_labels self.batch_mapping = batch_mapping self.default_batch = default_batch + self._update_generation_kwargs(generation_kwargs if generation_kwargs else {}) hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -242,12 +243,10 @@ def __len__(self) -> int: def get_num_samples_in_batch(self, batch: Dict) -> int: return batch['input_ids'].shape[0] - def check_defaults_are_set(self, dict_of_defaults: dict) -> None: - if all(v for v in dict_of_defaults.values()): - return - raise ValueError( - f"{type(self).__name__} missing required variable(s): {', '.join([k for k, v in dict_of_defaults.items() if not v])}" - ) + def _update_generation_kwargs(self, generation_kwargs): + if 'generation_kwargs' not in self.default_batch: + self.default_batch['generation_kwargs'] = {} + self.default_batch.update(generation_kwargs) def _read_dataset(self, dataset_uri: str, @@ -625,6 +624,7 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): 'input_ids': self.context_key, 'labels': 'aliases', } + self._update_generation_kwargs(kwargs.get('generation_kwargs',{})) def _read_dataset( self, @@ -766,6 +766,7 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): 'gold_indices': 
[], 'choice_groupings': [], } + # TODO: is there something cleaner here? context_key = kwargs.pop('context_key', 'query') super().__init__(context_key=context_key, default_batch=default_batch, padding_side='right', *args, **kwargs) self.num_choices = len(self.dataset[0][self.choices_key]) @@ -833,6 +834,13 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. + We run each distinct query + answer choice through the model separately and determine which + answer has the lowest per-token-perplexity. + + If each question has N possible choices, all N must be grouped together as distinct elements of the batch + since the batch may consist of multiple questions, the choice_groupings indicates + which contiguous sequences of elements in the batch correspond to which question + gold_indices indicates which of the [0, N-1] choices is the correct one for each question. Args: data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -852,13 +860,6 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: choice_end_idx = len(batch['continuation_indices']) batch['choice_groupings'].append((choice_start_idx, choice_end_idx)) - # We run each distinct query + answer choice through the model separately and determine which - # answer has the lowest per-token-perplexity. - # - # If each question has N possible choices, all N must be grouped together as distinct elements of the batch - # since the batch may consist of multiple questions, the choice_groupings indicates - # which contiguous sequences of elements in the batch correspond to which question - # gold_indices indicates which of the [0, N-1] choices is the correct one for each question. batch = self._convert_tokens_to_tensors(batch) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -1084,7 +1085,6 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - do_sample: determines whether model is sampling or greedily decoding. Always set to True - top_p: the cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1 - top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. - - temperature: randomness used during prediction. 1.0 is deterministic. defaults to 1.0 - use_cache: Whether or not to use past key values to speed up sampling. 
Always set to True Additional Args: @@ -1093,7 +1093,6 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): pass_at_k (int) (defaults to 1): k for how many chances the model gets to write passing code top_p (int) (defaults to 0.95): top_p sampling parameter for nucleus sampling top_k (int) (defaults to 40): top_k sampling parameter for number of samples to consider - temperature (float) (defaults to 1.0): temperature to use while sampling """ def __init__( @@ -1102,7 +1101,6 @@ def __init__( pass_at_k: int = 1, top_p: Optional[float] = 0.95, top_k: Optional[int] = 40, - temperature: Optional[int] = 1.0, *args, **kwargs, ): @@ -1151,10 +1149,11 @@ def __init__( 'do_sample': True, 'top_p': top_p, 'top_k': top_k, - 'temperature': temperature, 'use_cache': True }, } + self._update_generation_kwargs(kwargs['generation_kwargs']) + def get_max_prompt_length(self) -> int: """ @@ -1164,7 +1163,7 @@ def get_max_prompt_length(self) -> int: """ max_prompt_length = 0 for example in self.dataset: - # Will this elimante tokens we want to keep? + # TODO: Will this elimante tokens we want to keep? unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id] max_prompt_length = max( max_prompt_length, @@ -1228,7 +1227,7 @@ def build_icl_dataloader( fewshot_random_seed: int, pass_at_k: int, generations_per_sample: int, - temperature: float, + generation_kwargs: Dict, ) -> DataSpec: if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset( @@ -1244,6 +1243,7 @@ def build_icl_dataloader( fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices @@ -1261,6 +1261,7 @@ def build_icl_dataloader( fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, ) batch_size = max(dataset.num_choices, batch_size) effective_batchsize = batch_size // dataset.num_choices @@ -1278,6 +1279,7 @@ def build_icl_dataloader( fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, ) effective_batchsize = batch_size elif icl_task_type == 'question_answering': @@ -1296,6 +1298,7 @@ def build_icl_dataloader( hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, cot_delimiter=cot_delimiter, + generation_kwargs=generation_kwargs, ) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': @@ -1315,7 +1318,7 @@ def build_icl_dataloader( hf_parsing_map=hf_parsing_map, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, - temperature=temperature, + generation_kwargs=generation_kwargs, ) effective_batchsize = batch_size elif icl_task_type == 'rag': @@ -1334,6 +1337,7 @@ def build_icl_dataloader( fewshot_random_seed=fewshot_random_seed, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, ) effective_batchsize = batch_size else: @@ -1433,16 +1437,16 @@ def get_icl_task_dataloader( prompt_string: str, # e.g. 'translate english to french:' example_delimiter: str, # e.g. '\n' continuation_delimiter: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None, destination_path: str = '', + question_prelimiter: str = '', # e.g. 
'Question: ' fewshot_random_seed: int = 1234, pass_at_k: int = 1, - temperature: float = 1.0, generations_per_sample: int = 1, cot_delimiter: str = '', has_categories: bool = False, + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None, + generation_kwargs: Dict = None, ) -> Union[DataSpec, Dict[str, DataSpec]]: """ This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: @@ -1525,7 +1529,7 @@ def get_icl_task_dataloader( generations_per_sample=generations_per_sample, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - temperature=temperature, + generation_kwargs=generation_kwargs, ) return result_dls else: @@ -1548,5 +1552,5 @@ def get_icl_task_dataloader( fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, - temperature=temperature, + generation_kwargs=generation_kwargs, ) From b90ca7f91c6478d774dbc1aebd4cad372f108f30 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 5 Dec 2023 23:36:28 +0000 Subject: [PATCH 066/116] add defaults that are currently set in llm-foundry builders.py --- composer/datasets/in_context_learning_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index efa230297a..36c48def8c 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1434,14 +1434,14 @@ def get_icl_task_dataloader( max_seq_len: int, pad_tok_id: int, num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str = '', + prompt_string: str = '', # e.g. 'translate english to french:' + example_delimiter: str = '\n', # e.g. '\n' + continuation_delimiter: str = ' ', destination_path: str = '', question_prelimiter: str = '', # e.g. 
'Question: ' fewshot_random_seed: int = 1234, pass_at_k: int = 1, - generations_per_sample: int = 1, + generations_per_sample: int = 20, cot_delimiter: str = '', has_categories: bool = False, hf_loading_vars: Dict = None, From 820e7fc4597eda81ff159add13b39239c467ba44 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 6 Dec 2023 19:23:41 +0000 Subject: [PATCH 067/116] fix defaults in tests, add some comments --- .../in_context_learning_evaluation.py | 35 ++++++++++++------- .../test_in_context_learning_datasets.py | 4 +++ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 36c48def8c..8f8b535119 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -59,7 +59,6 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) - def _get_continuation_span(context_enc: List, continuation_enc: List) -> list: return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc))) - # return list(range(len(context_enc), len(context_enc) + len(continuation_enc))) def _make_padded_input(context_enc: List, @@ -213,9 +212,9 @@ def __init__( self.context_key = context_key self.answer_key = answer_key self.tokenize_labels = tokenize_labels - self.batch_mapping = batch_mapping - self.default_batch = default_batch - self._update_generation_kwargs(generation_kwargs if generation_kwargs else {}) + self.batch_mapping = batch_mapping or {} + self.default_batch = default_batch or {} + self._update_generation_kwargs(generation_kwargs or {}) hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) @@ -243,10 +242,19 @@ def __len__(self) -> int: def get_num_samples_in_batch(self, batch: Dict) -> int: return batch['input_ids'].shape[0] - def _update_generation_kwargs(self, generation_kwargs): + def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: + """ + Updates self.default_batch with the passed in generation_kwargs. + This must be run after self.default_batch is set (for example, if self.default_batch is set after __init__() is run, + likely because default_batch needs a class variable like self.pad_tok_id or self.max_answer_length). + + Args: + + """ if 'generation_kwargs' not in self.default_batch: self.default_batch['generation_kwargs'] = {} - self.default_batch.update(generation_kwargs) + if generation_kwargs: + self.default_batch['generation_kwargs'].update(generation_kwargs) def _read_dataset(self, dataset_uri: str, @@ -607,6 +615,7 @@ class InContextLearningQATaskDataset(InContextLearningDataset): def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False + self.max_answer_length = 0 super().__init__(padding_side='left', tokenize_labels=False, *args, **kwargs) # NOTE: set these after init call bcus they take class vars self.default_batch = { @@ -624,7 +633,7 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): 'input_ids': self.context_key, 'labels': 'aliases', } - self._update_generation_kwargs(kwargs.get('generation_kwargs',{})) + self._update_generation_kwargs(kwargs.get('generation_kwargs')) def _read_dataset( self, @@ -1088,8 +1097,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - use_cache: Whether or not to use past key values to speed up sampling. 
Always set to True Additional Args: - # TODO: are these correct? - generations_per_sample (int) (defaults to 1): how many outputs to generate per prompt + generations_per_sample (int) (defaults to 1): The number of independently computed returned sequences for each element in the batch pass_at_k (int) (defaults to 1): k for how many chances the model gets to write passing code top_p (int) (defaults to 0.95): top_p sampling parameter for nucleus sampling top_k (int) (defaults to 40): top_k sampling parameter for number of samples to consider @@ -1118,6 +1126,7 @@ def __init__( 'test_outputs': 'test_outputs', 'languages': 'language' } + # Linting complains if this is not set in init self.max_prompt_length = 0 super().__init__( context_key='prompt', @@ -1152,7 +1161,7 @@ def __init__( 'use_cache': True }, } - self._update_generation_kwargs(kwargs['generation_kwargs']) + self._update_generation_kwargs(kwargs.get('generation_kwargs')) def get_max_prompt_length(self) -> int: @@ -1175,8 +1184,10 @@ def adjust_padding(self): self.max_prompt_length = self.get_max_prompt_length() def _trim_padding(example): - full_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] - full_prompt = _trim_context(full_prompt, [], self.max_prompt_length) + # Remove padding tokens applied during tokenization + unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] + # Pad only to max_promp_length + full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length) padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index fe4b53fd93..46e8877bd1 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -763,6 +763,7 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', + continuation_delimiter='', question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=generations_per_sample) @@ -848,6 +849,7 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): num_fewshot=0, prompt_string='', example_delimiter='\n', + continuation_delimiter='', question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_.jsonl'), generations_per_sample=1) @@ -896,6 +898,7 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): num_fewshot=0, prompt_string='', example_delimiter='\n', + continuation_delimiter='', question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_.jsonl'), pass_at_k=10, @@ -925,6 +928,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st num_fewshot=num_fewshot, prompt_string=prompt_string, example_delimiter='\n', + continuation_delimiter='', question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=generations_per_sample) From 51c057198d0d7219c658608c194bfcf908160932 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 6 Dec 2023 23:37:42 +0000 Subject: [PATCH 068/116] tests wip --- .../in_context_learning_evaluation.py | 34 +- .../test_in_context_learning_datasets.py | 373 ++++++++++++++++-- 2 files changed, 365 insertions(+), 42 deletions(-) diff --git 
a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 8f8b535119..75fd52f4e9 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -81,7 +81,6 @@ def _make_padded_input(context_enc: List, input (torch.tensor): the padded and encoded context continuation_span (torch.tensor): the _inclusive_ range of indices corresponding to the continuation - """ inp = torch.tensor( @@ -365,7 +364,7 @@ def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: """ - If the input_ids is empty then input_ids['input_ids'] will be a 0-length List, + If the input_ids is empty then input_ids will be a 0-length List, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, as the specific eval question's prompt will follow the input_ids @@ -453,6 +452,7 @@ def _prep_example( tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example + # TODO: Maybe make this not a class function def _convert_tokens_to_tensors(self, batch: Dict) -> Dict[str, Any]: # zzzz HF converts ur torch tensors into lists so need to convert them back batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids']))) @@ -461,6 +461,7 @@ def _convert_tokens_to_tensors(self, batch: Dict) -> Dict[str, Any]: batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices'])) return batch + # TODO: Test this? def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. @@ -740,6 +741,7 @@ def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> return cont +# TODO: ensure tests class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ A dataset that construct batches for in-context learning multiple choice evaluation. @@ -921,6 +923,7 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] +# TODO: ensure tests class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset): """ A dataset that constructs batches for in-context learning schema evaluation. @@ -1164,24 +1167,24 @@ def __init__( self._update_generation_kwargs(kwargs.get('generation_kwargs')) - def get_max_prompt_length(self) -> int: + def adjust_padding(self): """ - Iterates through the dataset and finds the length of the longest prompt. + Adjusts padding to the maximum prompt size rather than max_seq_len. + Needs to be done after the dataset has been processed because we can't get the prompt length + until after we've tokenized it. + Returns: - int: maximum prompt length + dataset: """ max_prompt_length = 0 for example in self.dataset: - # TODO: Will this elimante tokens we want to keep? + # TODO: Will this elimanate tokens we want to keep? 
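            # Caveat: the filter below drops every token equal to pad_tok_id, so a
            # tokenizer whose pad token doubles as its EOS token (as with GPT-2 style
            # tokenizers) would also lose genuine EOS tokens that occur inside the
            # tokenized prompt, not just the padding added during tokenization.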
unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id] max_prompt_length = max( max_prompt_length, len(unpadded_example), ) - return max_prompt_length - - def adjust_padding(self): - self.max_prompt_length = self.get_max_prompt_length() + self.max_prompt_length = max_prompt_length def _trim_padding(example): # Remove padding tokens applied during tokenization @@ -1198,14 +1201,8 @@ def _trim_padding(example): def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ - Runs text through the tokenizer and handles special cases. - Args: - prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (Dict): the example as a dictionary. - - Returns: - Dict: dictionary with the tokenized data + Adds extra code task details to the example dictionary. + See InContextLearningDataset for more details """ tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['prompt_text'] = example['prompt'] @@ -1508,6 +1505,7 @@ def get_icl_task_dataloader( # TODO: is this right? pass_at_k (int): k for how many chances the model gets to write passing code generations_per_sample (int): how many outputs to generate per prompt + cot_delimiter (str): Delimiter to place between the chain of thought and continuations. has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 46e8877bd1..16a77d01f2 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -15,7 +15,8 @@ from composer import Evaluator from composer.core import DataSpec -from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, _get_continuation_span, +from composer.datasets.in_context_learning_evaluation import (InContextLearningDataset, InContextLearningQATaskDataset, InContextLearningCodeEvalDataset, strip_data, _tokenizer_needs_prefix_space, + _get_continuation_span, _get_fewshot_sample_idxs, _make_padded_input, _trim_context, get_icl_task_dataloader) from composer.loggers import InMemoryLogger @@ -26,6 +27,92 @@ from composer.utils import dist, reproducibility from tests.common import device, world_size +def test_strip_data(): + data_to_strip = {"strip_data": " boo! \n", "has_space": " wa hoo!", "end_space": "yoohoo! 
"} + stripped_data = strip_data(data_to_strip) + for k, v in stripped_data.items(): + assert k in data_to_strip + assert not v[0].isspace() + assert not v[-1].isspace() + +def test_tokenizer_needs_prefix_space_when_space_not_needed(tiny_gpt2_tokenizer): + # TODO: get a tokenizer that does not need prefix space + assert not _tokenizer_needs_prefix_space(tiny_gpt2_tokenizer) + +def test_tokenizer_needs_prefix_space_when_space_needed(): + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) + assert _tokenizer_needs_prefix_space(tokenizer) + +def test_trim_context(): + context = [0] * 99 + [1] * 2037 + continuation = [2] * 10 + max_seq_len = 2048 + trimmed_context = _trim_context(context, continuation, max_seq_len=max_seq_len) + assert len(trimmed_context) == 2038 + assert trimmed_context[0] == 0 + assert trimmed_context[1] == 1 + +def test_trim_context_no_continuation(): + context = [0] * 2048 + max_seq_len = 2048 + trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len) + assert len(trimmed_context) == 2048 + context = [0] * 3000 + [1] + max_seq_len = 2048 + trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len) + assert len(trimmed_context) == 2048 + assert trimmed_context[-1] == 1 + + +def test_get_continuation_span(): + context = [0] * 200 + continuation = [1] * 3 + cont_span = _get_continuation_span(context, continuation) + assert torch.all(torch.eq(cont_span, torch.tensor([200, 201, 202]))) + continuation = [1] + cont_span = _get_continuation_span(context, continuation) + assert torch.all(torch.eq(cont_span, torch.tensor([200]))) + +@pytest.mark.parametrize('padding_side', ['left', 'right', 'middle']) +def test_make_padding(tiny_gpt2_tokenizer, padding_side): + context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] + padding_id = tiny_gpt2_tokenizer.eos_token_id + + error_context = contextlib.nullcontext() if padding_side in {'left', 'right'} else pytest.raises(ValueError) + + with error_context: + input_ids = _make_padded_input(context, [], 2048, padding_id, padding_side=padding_side) + + if padding_side == 'left': + assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id + assert input_ids[48:].tolist() == context + elif padding_side == 'right': + assert input_ids[-1] == tiny_gpt2_tokenizer.eos_token_id + assert input_ids[:-48].tolist() == context + + +def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer): + continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids'] + context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] + max_seq_len = 2048 + trimmed_context = _trim_context(context, continuation, max_seq_len) + continuation_spans = _get_continuation_span(trimmed_context, continuation) + padded_input = _make_padded_input(trimmed_context, continuation, max_seq_len, tiny_gpt2_tokenizer.pad_token_id, padding_side='right') + assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047 + assert len(padded_input) == 2048 + assert tiny_gpt2_tokenizer.pad_token_id not in padded_input + +def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer): + continuation = tiny_gpt2_tokenizer(' dog' * 200)['input_ids'] + context = tiny_gpt2_tokenizer(' cat' * 200)['input_ids'] + max_seq_len = 2048 + trimmed_context = _trim_context(context, continuation, max_seq_len) + continuation_spans = _get_continuation_span(trimmed_context, continuation) + padded_input = _make_padded_input(trimmed_context, continuation, max_seq_len, tiny_gpt2_tokenizer.pad_token_id, padding_side='right') + assert continuation_spans[0] == 200 and 
continuation_spans[-1] == 399 + assert len(padded_input) == 2048 + assert padded_input[-1] == tiny_gpt2_tokenizer.pad_token_id + def test_fewshot_sample_idxs(): rng = random.Random(1234) @@ -66,34 +153,272 @@ def test_fewshot_sample_idxs_randomness(): assert rng_1_sample_2 != rng_3_sample_2 -def test_batch_padding_logic(tiny_gpt2_tokenizer): - # def test_get_continuation_span(tiny_gpt2_tokenizer): - continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids'] - context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] - trimmed_context = _trim_context(context, continuation, 2048) - continuation_spans = _get_continuation_span(trimmed_context, continuation) - # TODO is this correct? Add more tests - # padded_input = _get_continuation_span(trimmed_context, continuation) - # the context (of len 2000) gets clipped to len 48 so that the whole continuation can fit - assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047 +def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path): + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + hf_loading_vars = { + 'split': 'test', + 'name': 'invoker', + } + hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} + gen_kwargs = {"test_arg1": 1, "test_arg2": 2} + + dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=gen_kwargs + ) + assert dl.default_batch['generation_kwargs'] == {"test_arg1":1, "test_arg2":2} -@pytest.mark.parametrize('padding_side', ['left', 'right', 'middle']) -def test_make_padding(tiny_gpt2_tokenizer, padding_side): - context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] - padding_id = tiny_gpt2_tokenizer.eos_token_id +def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + hf_loading_vars = { + 'split': 'test', + 'name': 'invoker', + } + hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} - error_context = contextlib.nullcontext() if padding_side in {'left', 'right'} else pytest.raises(ValueError) + dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) + assert not dl.default_batch['generation_kwargs'] - with error_context: - input_ids = _make_padded_input(context, [], 2048, padding_id, padding_side=padding_side) - if padding_side == 'left': - assert input_ids[0] == tiny_gpt2_tokenizer.eos_token_id - assert input_ids[48:].tolist() == context - elif padding_side == 'right': - assert input_ids[-1] == tiny_gpt2_tokenizer.eos_token_id - assert input_ids[:-48].tolist() == context +def test_generate_few_shot_prompt(): + pass + +def test_construct_context(tiny_gpt2_tokenizer, tmp_path): + tokenizer = 
tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + hf_loading_vars = { + 'split': 'test', + 'name': 'invoker', + } + hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} + + dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell: ', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) + constructed_context = dl._construct_context({'context':'quas quas exort', 'answer': 'ice wall'}) + assert constructed_context == 'Orbs: quas quas exort\nSpell: ' + constructed_context = dl._construct_context({'context':'quas quas exort', 'answer': 'ice wall'}, add_answer=True) + assert constructed_context == 'Orbs: quas quas exort\nSpell: ice wall' + constructed_context = dl._construct_context({'context':'quas quas exort', 'answer': 'ice wall'}, preceding_text='The harsh White Waste beckons!', add_answer=True) + assert constructed_context == '\nOrbs: quas quas exort\nSpell: ice wall' + +def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + hf_loading_vars = { + 'split': 'test', + 'name': 'invoker', + } + hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} + + dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) + answer = dl._get_answer_from_example({'context': "wex exort exort", 'answer': 'alacrity'}) + assert answer == 'alacrity' + +def test_fix_eos_on_preamble(tmp_path): + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + hf_loading_vars = { + 'split': 'test', + 'name': 'invoker', + } + hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} + + dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) + preamble = 'blah blah blah.' 
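    # The test appends an explicit EOS token to the encoded preamble below and expects
    # _fix_eos_on_preamble to strip exactly that trailing token, so that no EOS ends up
    # in the middle of the assembled few-shot prompt.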
+ tokenized_preamble = tokenizer.encode(preamble) + tokenized_preamble += [tokenizer.eos_token_id] + fixed_preamble = dl._fix_eos_on_preamble(tokenized_preamble) + assert tokenized_preamble[:-1] == fixed_preamble + assert fixed_preamble[-1] != tokenizer.eos_token_id + +def test_tokenize_eample(tiny_gpt2_tokenizer, tmp_path): + + pass + +def test_qa_set_cot_no_cot(tmp_path): + pytest.importorskip('datasets') + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/triviaqa_small.jsonl' + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') + + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + dl = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=1024, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=0, + fewshot_random_seed=1234, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=': ', + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + ) + assert not dl.has_cot + +def test_qa_set_cot_has_cot(tmp_path): + pytest.importorskip('datasets') + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/gsm8k_small.jsonl' + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') + + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + dl = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=1024, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=0, + fewshot_random_seed=1234, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=': ', + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + ) + assert dl.has_cot + +def test_qa_get_max_answer_length(): + pass + +def test_qa_get_answer_from_example(): + pass + +def test_qa_tokenize_example(): + pass + +def test_lm_get_answer_from_example(): + pass + +def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path): + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/human_eval_small.jsonl' + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + gen_kwargs = {"temperature": .9, "top_p": .95, "num_beams": 9000} + + dl = InContextLearningCodeEvalDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Code start:', + continuation_delimiter='\nPlease code:', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + generation_kwargs=gen_kwargs, + generations_per_sample=10, + ) + + assert all([len(data['prompt']) == 148 for data in dl.dataset]) + + +def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/human_eval_small.jsonl' + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + gen_kwargs = {"temperature": .9, "top_p": .95, "num_beams": 9000} + + dl = InContextLearningCodeEvalDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Code start:', + 
continuation_delimiter='\nPlease code:', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + generation_kwargs=gen_kwargs, + generations_per_sample=10, + ) + assert dl.default_batch['generation_kwargs']['num_beams'] == 9000 + assert dl.default_batch['generation_kwargs']['top_p'] == .95 + assert dl.default_batch['generation_kwargs']['top_k'] == 40 + assert dl.default_batch['generation_kwargs']['temperature'] == .9 + assert dl.default_batch['generation_kwargs']['do_sample'] == True @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) From be900ab94864d8eef9d662a96f5af8845d17c1dc Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 01:19:28 +0000 Subject: [PATCH 069/116] tests for new funcs --- .../in_context_learning_evaluation.py | 14 +- .../test_in_context_learning_datasets.py | 438 +++++++++++++----- 2 files changed, 322 insertions(+), 130 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 75fd52f4e9..486b1ded10 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -452,7 +452,7 @@ def _prep_example( tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example - # TODO: Maybe make this not a class function + # TODO: Maybe make this not a class function. Also, could make our padding operations work on lists def _convert_tokens_to_tensors(self, batch: Dict) -> Dict[str, Any]: # zzzz HF converts ur torch tensors into lists so need to convert them back batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids']))) @@ -736,6 +736,7 @@ def __init__(self, *args, **kwargs): def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> str: cont = example[self.answer_key] + # Should this be in the base class? if self.prefix_space and not cont.startswith(' ') and not in_context: cont = f' {cont}' return cont @@ -847,7 +848,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: The function that the dataloader uses to accumulate data into batches. We run each distinct query + answer choice through the model separately and determine which answer has the lowest per-token-perplexity. - + If each question has N possible choices, all N must be grouped together as distinct elements of the batch since the batch may consist of multiple questions, the choice_groupings indicates which contiguous sequences of elements in the batch correspond to which question @@ -1042,7 +1043,9 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot) preamble = self._fix_eos_on_preamble(preamble['input_ids']) - encoded_contexts = [preamble + self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options] + encoded_contexts = [ + preamble + self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options + ] continuation = example['continuation'] if self.prefix_space: continuation = (f' {continuation}' if not continuation.startswith(' ') else continuation) @@ -1165,16 +1168,15 @@ def __init__( }, } self._update_generation_kwargs(kwargs.get('generation_kwargs')) - def adjust_padding(self): """ - Adjusts padding to the maximum prompt size rather than max_seq_len. + Adjusts padding to the maximum prompt size rather than max_seq_len. 
Needs to be done after the dataset has been processed because we can't get the prompt length until after we've tokenized it. Returns: - dataset: + dataset: """ max_prompt_length = 0 for example in self.dataset: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 16a77d01f2..9a5f7de2a3 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -15,10 +15,12 @@ from composer import Evaluator from composer.core import DataSpec -from composer.datasets.in_context_learning_evaluation import (InContextLearningDataset, InContextLearningQATaskDataset, InContextLearningCodeEvalDataset, strip_data, _tokenizer_needs_prefix_space, - _get_continuation_span, +from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, + InContextLearningDataset, InContextLearningLMTaskDataset, + InContextLearningQATaskDataset, _get_continuation_span, _get_fewshot_sample_idxs, _make_padded_input, - _trim_context, get_icl_task_dataloader) + _tokenizer_needs_prefix_space, _trim_context, + get_icl_task_dataloader, strip_data) from composer.loggers import InMemoryLogger from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) @@ -27,22 +29,26 @@ from composer.utils import dist, reproducibility from tests.common import device, world_size + def test_strip_data(): - data_to_strip = {"strip_data": " boo! \n", "has_space": " wa hoo!", "end_space": "yoohoo! "} + data_to_strip = {'strip_data': ' boo! \n', 'has_space': ' wa hoo!', 'end_space': 'yoohoo! '} stripped_data = strip_data(data_to_strip) for k, v in stripped_data.items(): assert k in data_to_strip assert not v[0].isspace() assert not v[-1].isspace() + +@pytest.mark.skip(reason="Currently don't have a tokenizer that satisfies this test") def test_tokenizer_needs_prefix_space_when_space_not_needed(tiny_gpt2_tokenizer): - # TODO: get a tokenizer that does not need prefix space assert not _tokenizer_needs_prefix_space(tiny_gpt2_tokenizer) + def test_tokenizer_needs_prefix_space_when_space_needed(): tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) assert _tokenizer_needs_prefix_space(tokenizer) + def test_trim_context(): context = [0] * 99 + [1] * 2037 continuation = [2] * 10 @@ -52,12 +58,13 @@ def test_trim_context(): assert trimmed_context[0] == 0 assert trimmed_context[1] == 1 + def test_trim_context_no_continuation(): context = [0] * 2048 max_seq_len = 2048 trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len) assert len(trimmed_context) == 2048 - context = [0] * 3000 + [1] + context = [0] * 3000 + [1] max_seq_len = 2048 trimmed_context = _trim_context(context, [], max_seq_len=max_seq_len) assert len(trimmed_context) == 2048 @@ -69,10 +76,11 @@ def test_get_continuation_span(): continuation = [1] * 3 cont_span = _get_continuation_span(context, continuation) assert torch.all(torch.eq(cont_span, torch.tensor([200, 201, 202]))) - continuation = [1] + continuation = [1] cont_span = _get_continuation_span(context, continuation) assert torch.all(torch.eq(cont_span, torch.tensor([200]))) + @pytest.mark.parametrize('padding_side', ['left', 'right', 'middle']) def test_make_padding(tiny_gpt2_tokenizer, padding_side): context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] @@ -97,18 +105,27 @@ def test_batch_padding_logic_no_padding(tiny_gpt2_tokenizer): 
max_seq_len = 2048 trimmed_context = _trim_context(context, continuation, max_seq_len) continuation_spans = _get_continuation_span(trimmed_context, continuation) - padded_input = _make_padded_input(trimmed_context, continuation, max_seq_len, tiny_gpt2_tokenizer.pad_token_id, padding_side='right') + padded_input = _make_padded_input(trimmed_context, + continuation, + max_seq_len, + tiny_gpt2_tokenizer.pad_token_id, + padding_side='right') assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047 assert len(padded_input) == 2048 assert tiny_gpt2_tokenizer.pad_token_id not in padded_input + def test_batch_padding_logic_with_padding(tiny_gpt2_tokenizer): continuation = tiny_gpt2_tokenizer(' dog' * 200)['input_ids'] context = tiny_gpt2_tokenizer(' cat' * 200)['input_ids'] max_seq_len = 2048 trimmed_context = _trim_context(context, continuation, max_seq_len) continuation_spans = _get_continuation_span(trimmed_context, continuation) - padded_input = _make_padded_input(trimmed_context, continuation, max_seq_len, tiny_gpt2_tokenizer.pad_token_id, padding_side='right') + padded_input = _make_padded_input(trimmed_context, + continuation, + max_seq_len, + tiny_gpt2_tokenizer.pad_token_id, + padding_side='right') assert continuation_spans[0] == 200 and continuation_spans[-1] == 399 assert len(padded_input) == 2048 assert padded_input[-1] == tiny_gpt2_tokenizer.pad_token_id @@ -163,24 +180,23 @@ def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path): 'name': 'invoker', } hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} - gen_kwargs = {"test_arg1": 1, "test_arg2": 2} + gen_kwargs = {'test_arg1': 1, 'test_arg2': 2} dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=gen_kwargs - ) - assert dl.default_batch['generation_kwargs'] == {"test_arg1":1, "test_arg2":2} + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=gen_kwargs) + assert dl.default_batch['generation_kwargs'] == {'test_arg1': 1, 'test_arg2': 2} def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): @@ -195,24 +211,21 @@ def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map) + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + 
num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) assert not dl.default_batch['generation_kwargs'] -def test_generate_few_shot_prompt(): - pass - def test_construct_context(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 @@ -225,25 +238,31 @@ def test_construct_context(tiny_gpt2_tokenizer, tmp_path): hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Orbs: ', - continuation_delimiter='\nSpell: ', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map) - constructed_context = dl._construct_context({'context':'quas quas exort', 'answer': 'ice wall'}) + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell: ', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) + constructed_context = dl._construct_context({'context': 'quas quas exort', 'answer': 'ice wall'}) assert constructed_context == 'Orbs: quas quas exort\nSpell: ' - constructed_context = dl._construct_context({'context':'quas quas exort', 'answer': 'ice wall'}, add_answer=True) + constructed_context = dl._construct_context({'context': 'quas quas exort', 'answer': 'ice wall'}, add_answer=True) assert constructed_context == 'Orbs: quas quas exort\nSpell: ice wall' - constructed_context = dl._construct_context({'context':'quas quas exort', 'answer': 'ice wall'}, preceding_text='The harsh White Waste beckons!', add_answer=True) + constructed_context = dl._construct_context({ + 'context': 'quas quas exort', + 'answer': 'ice wall' + }, + preceding_text='The harsh White Waste beckons!', + add_answer=True) assert constructed_context == '\nOrbs: quas quas exort\nSpell: ice wall' + def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 @@ -256,21 +275,22 @@ def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map) - answer = dl._get_answer_from_example({'context': "wex exort exort", 'answer': 'alacrity'}) + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + 
continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) + answer = dl._get_answer_from_example({'context': 'wex exort exort', 'answer': 'alacrity'}) assert answer == 'alacrity' + def test_fix_eos_on_preamble(tmp_path): tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) seqlen = 2048 @@ -283,28 +303,95 @@ def test_fix_eos_on_preamble(tmp_path): hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', - destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map) + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell:', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map) preamble = 'blah blah blah.' tokenized_preamble = tokenizer.encode(preamble) tokenized_preamble += [tokenizer.eos_token_id] fixed_preamble = dl._fix_eos_on_preamble(tokenized_preamble) - assert tokenized_preamble[:-1] == fixed_preamble + assert tokenized_preamble[:-1] == fixed_preamble assert fixed_preamble[-1] != tokenizer.eos_token_id -def test_tokenize_eample(tiny_gpt2_tokenizer, tmp_path): - pass +def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path): + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + hf_loading_vars = { + 'split': 'test', + 'name': 'invoker', + } + hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} + + dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell: ', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + tokenize_labels=True) + tokenized_example = dl._tokenize_example('What spell does this invoke? 
', 'exort exort wex\nSpell: ', + {'answer': ' Meatball'}) + tokenized_input = [2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198, 31221, 25, 19145, 1894] + assert tokenized_example['context'][:len(tokenized_input)].tolist() == tokenized_input + assert tokenized_example['context'][-1] == tokenizer.eos_token_id + assert type(tokenized_example['answer'][0]) == int + assert len(tokenized_example['context']) == seqlen + assert 'continuation_indices' in tokenized_example + + +def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer, tmp_path): + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + hf_loading_vars = { + 'split': 'test', + 'name': 'invoker', + } + hf_parsing_map = {'context': ['quas', 'wex', 'exort'], 'answer': ['spell']} + + dl = InContextLearningDataset(dataset_uri='hf://mosaicml/test_dataset', + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Orbs: ', + continuation_delimiter='\nSpell: ', + destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + tokenize_labels=False) + tokenized_example = dl._tokenize_example('What spell does this invoke? ', 'exort exort wex\nSpell: ', + {'answer': ' Meatball'}) + tokenized_input = [2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198, 31221, 25] + # import IPython; IPython.embed() + assert tokenized_example['context'][:len(tokenized_input)].tolist() == tokenized_input + assert tokenized_example['context'][-1] == tokenizer.eos_token_id + assert len(tokenized_example['context']) == seqlen + assert type(tokenized_example['answer']) == str + def test_qa_set_cot_no_cot(tmp_path): pytest.importorskip('datasets') @@ -328,6 +415,7 @@ def test_qa_set_cot_no_cot(tmp_path): ) assert not dl.has_cot + def test_qa_set_cot_has_cot(tmp_path): pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -350,17 +438,119 @@ def test_qa_set_cot_has_cot(tmp_path): ) assert dl.has_cot -def test_qa_get_max_answer_length(): - pass -def test_qa_get_answer_from_example(): - pass +def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path): + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/triviaqa_small.jsonl' + tokenizer = tiny_gpt2_tokenizer + + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + dl = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=1024, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=0, + fewshot_random_seed=1234, + prompt_string='', + example_delimiter='', + continuation_delimiter='', + cot_delimiter='', + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + ) + # empirical number from the small test dataset + assert dl.max_answer_length == 9 + + +def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer): + pytest.importorskip('datasets') + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/triviaqa_small.jsonl' + + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + dl = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tiny_gpt2_tokenizer, + 
max_seq_len=1024, + pad_tok_id=tiny_gpt2_tokenizer.eos_token_id, + num_fewshot=0, + fewshot_random_seed=1234, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=': ', + cot_delimiter=' ### ', + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + ) + answer = dl._get_answer_from_example({ + 'context': 'empty', + 'answer': 'this is the correct answer', + 'chain_of_thought': "Let's think step by step. " + }) + assert answer == 'this is the correct answer' + + +def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer): + pytest.importorskip('datasets') + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/triviaqa_small.jsonl' + + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + dl = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tiny_gpt2_tokenizer, + max_seq_len=1024, + pad_tok_id=tiny_gpt2_tokenizer.eos_token_id, + num_fewshot=0, + fewshot_random_seed=1234, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=': ', + cot_delimiter=' ### ', + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + ) + dl.has_cot = True + answer = dl._get_answer_from_example({ + 'context': 'empty', + 'answer': 'this is the correct answer', + 'chain_of_thought': "Let's think step by step. " + }) + assert answer == "Let's think step by step. ### this is the correct answer" + + +def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path): + pytest.importorskip('datasets') + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/triviaqa_small.jsonl' -def test_qa_tokenize_example(): - pass + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + dl = InContextLearningQATaskDataset( + dataset_uri=dataset_uri, + tokenizer=tiny_gpt2_tokenizer, + max_seq_len=1024, + pad_tok_id=tiny_gpt2_tokenizer.eos_token_id, + num_fewshot=0, + fewshot_random_seed=1234, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=': ', + cot_delimiter=' ### ', + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + ) + dl.has_cot = True + tokenized_example = dl._tokenize_example( + 'starting prompt', 'a context', { + 'context': 'empty', + 'answer': 'this is the correct answer', + 'aliases': ['this is the right answer', 'this is the best answer'], + 'chain_of_thought': "Let's think step by step. 
" + }) + assert 'aliases' in tokenized_example + assert tokenized_example['aliases'] == ['this is the right answer', 'this is the best answer'] -def test_lm_get_answer_from_example(): - pass def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -369,26 +559,26 @@ def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path): seqlen = 2048 num_fewshot = 0 prompt_string = '' - gen_kwargs = {"temperature": .9, "top_p": .95, "num_beams": 9000} + gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000} dl = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Code start:', - continuation_delimiter='\nPlease code:', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - generation_kwargs=gen_kwargs, - generations_per_sample=10, - ) - - assert all([len(data['prompt']) == 148 for data in dl.dataset]) - + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Code start:', + continuation_delimiter='\nPlease code:', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + generation_kwargs=gen_kwargs, + generations_per_sample=10, + ) + + assert all(len(data['prompt']) == 148 for data in dl.dataset) + def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -397,23 +587,23 @@ def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): seqlen = 2048 num_fewshot = 0 prompt_string = '' - gen_kwargs = {"temperature": .9, "top_p": .95, "num_beams": 9000} + gen_kwargs = {'temperature': .9, 'top_p': .95, 'num_beams': 9000} dl = InContextLearningCodeEvalDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - fewshot_random_seed=1, - prompt_string=prompt_string, - example_delimiter='\n', - prelimiter='Code start:', - continuation_delimiter='\nPlease code:', - destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), - generation_kwargs=gen_kwargs, - generations_per_sample=10, - ) + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + prelimiter='Code start:', + continuation_delimiter='\nPlease code:', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + generation_kwargs=gen_kwargs, + generations_per_sample=10, + ) assert dl.default_batch['generation_kwargs']['num_beams'] == 9000 assert dl.default_batch['generation_kwargs']['top_p'] == .95 assert dl.default_batch['generation_kwargs']['top_k'] == 40 From 20b0ecffee859ba82472f41f92a93596f9f1b39e Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 01:21:50 +0000 Subject: [PATCH 070/116] rm RAG task --- .../in_context_learning_evaluation.py | 102 ------------------ 1 file changed, 102 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 486b1ded10..5c4524b291 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ 
b/composer/datasets/in_context_learning_evaluation.py @@ -515,88 +515,6 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] -# TODO: write tests for this class -class InContextLearningRAGGenerationTaskDataset(InContextLearningDataset): - """A dataset that construct batches for in-context learning RAG generation evaluation - Rag generation tasks evaluate a model's ability to answer questions based on passages. - - Args: - passage_delimiter (str): Delimiter to place between each passage. - passage_query_delimiter (str): Delimiter to place between the last passage and the query. - """ - - def __init__(self, - passage_delimiter: str = '\nPassage: ', - passage_query_delimiter: str = '\nQuery: ', - *args, - **kwargs): - kwargs.pop('passage_delimiter', None) - kwargs.pop('passage_query_delimiter', None) - self.passage_delimiter = passage_delimiter - self.passage_query_delimiter = passage_query_delimiter - super().__init__(*args, **kwargs) - - def _construct_context(self, example: dict, preceding_text: str = '', add_answer: bool = False): - """ - Takes a example and constructs a context. Optionally, appends this to preceeding text (such as a - prompt or fewshot examples), as well as optionally adds the correct answer (for fewshot examples) - - Args: - example (dict): the example from which to construct the context - preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning - add_answer (bool): bool for whether or not to add the answer on the end of the context (needed for fewshot examples) - - Returns: - str: The constructed context. The default output context is - formatted as follows: f'{self.prelimiter}{example['self.passages_key']}{example[self.context_key]}{self.continuation_delimiter}' - """ - passages = self.passage_delimiter.lstrip('\n ') - passages += f'{self.passage_delimiter}'.join(example['passages']) - query = example['query'] - # TODO: add few_shot capabilities - context = f'{self.prelimiter}{passages}{self.passage_query_delimiter}{query}' - return context - - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: dict): - """ - Runs text through the tokenizer and handles special cases. - Args: - prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derived context - example (dict): the example as a dictionary. 
- - Returns: - dict: dictionary with the tokenized data - """ - tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) - answer = example['answers'][0] - tokenized_example['answer'] = self.tokenizer(answer, add_special_tokens=False)['input_ids'] - return tokenized_example - - def collate_fn(self, data): - """ - The function that the dataloader uses to accumulate data into batches - Args: - data (list): list of tokenized datapoints (dicts returned by self._tokenize_example) - - Returns: - dict: dictionary for a single batch - """ - batch = {'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', 'labels': [], 'answer_indices': []} - for data_pair in data: - context_enc = data_pair['context'] - answer_enc = data_pair['answer'] - - inp, answer_span = _make_padded_input(context_enc, answer_enc, self.max_seq_len, self.pad_tok_id) - batch['input_ids'].append(inp) - batch['answer_indices'].append(answer_span) - batch['labels'].append(inp) - - batch = self._convert_tokens_to_tensors(batch) - batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) - return batch - - class InContextLearningQATaskDataset(InContextLearningDataset): """ A dataset that construct batches for in-context learning question answering evaluation. @@ -1331,25 +1249,6 @@ def build_icl_dataloader( generation_kwargs=generation_kwargs, ) effective_batchsize = batch_size - elif icl_task_type == 'rag': - dataset = InContextLearningRAGGenerationTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - passage_delimiter='\nPassage: ', - passage_query_delimiter='\nQuery: ', - destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1362,7 +1261,6 @@ def build_icl_dataloader( InContextLearningMultipleChoiceTaskDataset, InContextLearningQATaskDataset, InContextLearningCodeEvalDataset, - InContextLearningRAGGenerationTaskDataset, ), ): split_batch = dataset.split_batch From 2285f3f900d54f53e5ad3bfd9165553cf74a2582 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 18:25:44 +0000 Subject: [PATCH 071/116] more docstring --- .../in_context_learning_evaluation.py | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 5c4524b291..0c777f16f1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -461,7 +461,6 @@ def _convert_tokens_to_tensors(self, batch: Dict) -> Dict[str, Any]: batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices'])) return batch - # TODO: Test this? def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. @@ -989,7 +988,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): """ A dataset that constructs batches for in-context learning code evaluation. 
- The default input format is expected to be a jsonl file with the following fields: + The input format is expected to be a jsonl file with the following fields: - task_id: label of given task - prompt: the code snippet that must be completed - entry_point: the entry to the function/code snippet to generate @@ -1011,28 +1010,24 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - languages: list of languages - pass_at_k: passed value for pass_at_k - generation_length: derrived maximum generation length - - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following: + - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten + by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + for more details): - pad_token_id: ID for padding token, derived automatically - - num_beams: how many beams to search for generations, always set to 1 + - num_beams: how many beams to search for generations, set to 1 - num_return_sequences: value passed for 'generations_per_sample', how many generations per prompt - do_sample: determines whether model is sampling or greedily decoding. Always set to True - - top_p: the cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1 - - top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. - use_cache: Whether or not to use past key values to speed up sampling. Always set to True Additional Args: generations_per_sample (int) (defaults to 1): The number of independently computed returned sequences for each element in the batch pass_at_k (int) (defaults to 1): k for how many chances the model gets to write passing code - top_p (int) (defaults to 0.95): top_p sampling parameter for nucleus sampling - top_k (int) (defaults to 40): top_k sampling parameter for number of samples to consider """ def __init__( self, generations_per_sample: int, pass_at_k: int = 1, - top_p: Optional[float] = 0.95, - top_k: Optional[int] = 40, *args, **kwargs, ): @@ -1080,8 +1075,6 @@ def __init__( 'num_beams': 1, # single beam 'num_return_sequences': generations_per_sample, 'do_sample': True, - 'top_p': top_p, - 'top_k': top_k, 'use_cache': True }, } @@ -1393,22 +1386,23 @@ def get_icl_task_dataloader( max_seq_len (int): The sequence length expected by the model pad_tok_id (int): The special token reserved for padding the ends of batches num_fewshot (int): The number of complete fewshot examples to pad each test example with - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str): Separator that goes between individual examples (e.g. '\n') - continuation_delimiter: (str): Separator that goes between context and continuation in each example (e.g. '->') - question_prelimiter: (str): Text to be prepended before each context segement in each eval example. (e.g. 'Q:', 'The following is a paragraph containing...') - hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. - hf_parsing_map (Dict[str:List[str]]): A dictionary containing a from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. - Values in the dict will be concatenated with ' ' seperating them. 
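        For instance, a hypothetical mapping such as the one below would join the HF 'title' and 'body' columns (space-separated) into the ICL 'context' key and map the 'label' column onto 'answer':

            hf_parsing_map = {'context': ['title', 'body'], 'answer': ['label']}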
If not included, will use the columns already present in the HF dataset. - destination_path: (str): This is the local file where remote datasets will be saved. - fewshot_random_seed (int): Random seed to use for fewshot sampling - # TODO: is this right? - pass_at_k (int): k for how many chances the model gets to write passing code - generations_per_sample (int): how many outputs to generate per prompt - - cot_delimiter (str): Delimiter to place between the chain of thought and continuations. + prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') + example_delimiter (str, default = '\n'): Separator that goes between individual examples (e.g. '\n') + continuation_delimiter: (str, default = ' '): Separator that goes between context and continuation in each example (e.g. '->') + destination_path: (str, default = ''): This is the local file where remote datasets will be saved. + question_prelimiter: (str, default = ''): Text to be prepended before each context segement in each eval example. (e.g. 'Q:', 'The following is a paragraph containing...') + fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling + + pass_at_k (int): k for how many chances the model gets to write passing code. + generations_per_sample (int): how many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict. + cot_delimiter (str): Delimiter to place between chain of thoughts and continuations. has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. + hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. + hf_parsing_map (Dict, default = None): A dictionary containing a from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. + generation_kwargs (dict): + Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. 
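Taken together with the removal of the explicit `top_p`/`top_k` arguments above, sampling parameters now travel through `generation_kwargs`, which overrides the defaults documented for the code-eval dataset. A minimal sketch of such a call, assuming a local HumanEval-style jsonl; the file paths and the gpt2 tokenizer are placeholders for illustration and are not part of this patch:

    from transformers import AutoTokenizer
    from composer.datasets.in_context_learning_evaluation import get_icl_task_dataloader

    tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder tokenizer choice
    dl = get_icl_task_dataloader(
        'code_evaluation',
        dataset_uri='local_data/human_eval_small.jsonl',  # hypothetical local path
        tokenizer=tokenizer,
        batch_size=8,
        max_seq_len=1024,
        pad_tok_id=tokenizer.eos_token_id,
        num_fewshot=2,
        destination_path='icl_tmp.jsonl',                 # scratch copy location
        generations_per_sample=10,
        generation_kwargs={'top_p': 0.95, 'top_k': 40},   # overrides the defaults listed above
    )

Because the base batch is deep-copied for every batch, the kwargs passed here show up in each batch's `generation_kwargs` entry, as the tests in the later commits assert.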
""" From 817030e462b161c47a9020a698a387b26ccc520a Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 19:13:20 +0000 Subject: [PATCH 072/116] tests passing --- tests/datasets/test_in_context_learning_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 9a5f7de2a3..e4dc02bd85 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -606,7 +606,6 @@ def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): ) assert dl.default_batch['generation_kwargs']['num_beams'] == 9000 assert dl.default_batch['generation_kwargs']['top_p'] == .95 - assert dl.default_batch['generation_kwargs']['top_k'] == 40 assert dl.default_batch['generation_kwargs']['temperature'] == .9 assert dl.default_batch['generation_kwargs']['do_sample'] == True From fd0e204a77a632ab6dd74458c744b03e3889ed7c Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 23:39:17 +0000 Subject: [PATCH 073/116] wip --- composer/core/data_spec.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/composer/core/data_spec.py b/composer/core/data_spec.py index feb38a4687..639aaf5bbf 100644 --- a/composer/core/data_spec.py +++ b/composer/core/data_spec.py @@ -60,7 +60,11 @@ def _split_mapping(m, microbatch_size: int): for k, v in m.items(): if isinstance(v, (int, float, str, bool)): chunked[k] = [v] * num_chunks - return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] + try: + return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] + except: + lens = {k: len(v) for k, v in chunked.items()} + print(f"Failed returning batches. Here's the dictionary: {lens}") def _check_list_is_primitives(l): From d3abb91623321112358eb6b0aa76b75f039b8d45 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 23:56:45 +0000 Subject: [PATCH 074/116] wip --- composer/core/data_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/core/data_spec.py b/composer/core/data_spec.py index 639aaf5bbf..3da6379c6f 100644 --- a/composer/core/data_spec.py +++ b/composer/core/data_spec.py @@ -64,7 +64,7 @@ def _split_mapping(m, microbatch_size: int): return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] except: lens = {k: len(v) for k, v in chunked.items()} - print(f"Failed returning batches. Here's the dictionary: {lens}") + raise Exception(f"Failed returning batches. 
Here's the dictionary: {lens}") def _check_list_is_primitives(l): From 3be5fe801c0dfde0d6ef71fadc5c73a99bfd1efe Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 8 Dec 2023 00:45:05 +0000 Subject: [PATCH 075/116] add dict to data_spec --- composer/core/data_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/core/data_spec.py b/composer/core/data_spec.py index 3da6379c6f..27077b1976 100644 --- a/composer/core/data_spec.py +++ b/composer/core/data_spec.py @@ -58,7 +58,7 @@ def _split_mapping(m, microbatch_size: int): num_chunks = len(list(chunked.values())[0]) # Broadcast primitives to all chunks for k, v in m.items(): - if isinstance(v, (int, float, str, bool)): + if isinstance(v, (int, float, str, bool, dict)): chunked[k] = [v] * num_chunks try: return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] From 988422e85ae7081f917de6cbc88b1ecdb9bbf5f0 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 12 Dec 2023 08:46:57 -0800 Subject: [PATCH 076/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 0c777f16f1..ebd952c4c6 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -330,7 +330,7 @@ def _generate_few_shot_prompt( def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ - Takes an example and constructs a context, ie the input the model reads for this example. + Takes an example and constructs a context, i.e. the input the model reads for this example. Optionally adds the correct answer (for fewshot examples) and handles example delimiters Args: From 3a28e19ab84eb3ffb81497d5f9fb7f3dbbfd1a28 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 12 Dec 2023 08:47:33 -0800 Subject: [PATCH 077/116] Update composer/datasets/in_context_learning_evaluation.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index ebd952c4c6..40a157dee6 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -365,7 +365,7 @@ def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: """ If the input_ids is empty then input_ids will be a 0-length List, - unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer) + unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer). 
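Stepping back to the `_split_mapping` changes in the data_spec commits above: the point is that scalar values, and now dicts such as `generation_kwargs`, are broadcast unchanged to every microbatch instead of being sliced. A simplified, standalone sketch of that behavior (not the actual implementation in composer/core/data_spec.py, which also handles lists and other container types):

    import torch

    def split_mapping_sketch(mapping, microbatch_size):
        # Slice tensor values into chunks of `microbatch_size` rows.
        chunked = {
            k: list(torch.split(v, microbatch_size))
            for k, v in mapping.items()
            if isinstance(v, torch.Tensor)
        }
        num_chunks = len(list(chunked.values())[0])
        # Broadcast primitives (and, after the 'add dict to data_spec' commit, dicts)
        # unchanged to every chunk.
        for k, v in mapping.items():
            if isinstance(v, (int, float, str, bool, dict)):
                chunked[k] = [v] * num_chunks
        return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)]

    batch = {
        'input_ids': torch.zeros(4, 8, dtype=torch.long),
        'mode': 'generate',
        'generation_kwargs': {'do_sample': True, 'num_beams': 1},
    }
    for microbatch in split_mapping_sketch(batch, microbatch_size=2):
        # Each microbatch keeps the full generation_kwargs dict and two rows of input_ids.
        print(microbatch['input_ids'].shape, microbatch['generation_kwargs'])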
If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, as the specific eval question's prompt will follow the input_ids Args: From bfbb70a038a2d44c58e29db699ab4424618325a7 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 12 Dec 2023 08:49:14 -0800 Subject: [PATCH 078/116] Apply suggestions from code review comment improvements Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 40a157dee6..803890b7b4 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -364,10 +364,10 @@ def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: """ - If the input_ids is empty then input_ids will be a 0-length List, + If the input_ids is empty then input_ids will be a 0-length List unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer). If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, - as the specific eval question's prompt will follow the input_ids + as the specific eval question's prompt will follow the input_ids. Args: input_ids (List): the tokenized input @@ -516,7 +516,7 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: class InContextLearningQATaskDataset(InContextLearningDataset): """ - A dataset that construct batches for in-context learning question answering evaluation. + A dataset that constructs batches for in-context learning question answering evaluation. QA tasks evaluate a model's ability to answer questions using a consistent format. The input format is expected to be a jsonl file with the following fields: @@ -625,7 +625,7 @@ def _get_max_answer_length(self, dataset) -> int: class InContextLearningLMTaskDataset(InContextLearningDataset): """ - A dataset that construct batches for in-context learning language modeling evaluation. + A dataset that constructs batches for in-context learning language modeling evaluation. Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. The input format is expected to be a jsonl file with the following fields: From 1ce6f7a5ce9dc94320191c2807bfd93d9a295e6f Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 12 Dec 2023 23:43:03 +0000 Subject: [PATCH 079/116] default_batch to base_batch and some docstrings --- composer/core/data_spec.py | 6 +- .../in_context_learning_evaluation.py | 70 +++++++++++-------- .../test_in_context_learning_datasets.py | 12 ++-- 3 files changed, 46 insertions(+), 42 deletions(-) diff --git a/composer/core/data_spec.py b/composer/core/data_spec.py index 27077b1976..4a55eb353c 100644 --- a/composer/core/data_spec.py +++ b/composer/core/data_spec.py @@ -60,11 +60,7 @@ def _split_mapping(m, microbatch_size: int): for k, v in m.items(): if isinstance(v, (int, float, str, bool, dict)): chunked[k] = [v] * num_chunks - try: - return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] - except: - lens = {k: len(v) for k, v in chunked.items()} - raise Exception(f"Failed returning batches. 
Here's the dictionary: {lens}") + return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] def _check_list_is_primitives(l): diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 803890b7b4..5e191d70f5 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -8,7 +8,7 @@ import json import os import random -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union import torch from torch.utils.data import DataLoader, Dataset @@ -142,9 +142,20 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: i class InContextLearningDataset(Dataset): """ - A base dataset that constructs batches for in-context learning task evaluations + A base dataset that constructs batches for in-context learning task evaluations. + The input format is expected to be a jsonl file or a link to a Hugging Face dataset. - The input format is expected to be a jsonl file with different fields based on the task or a link to a Hugging Face dataset. + When creating a new ICL Dataset, the most likely to be reimplemented methods are the following: + - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. + - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. + - _tokenize_example(): tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. + - _read_dataset(): loads the dataset and does basic parsing. If custom parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) + + Additionally, base_batch and batch_mapping must be defined. + - base_batch (Dict): the base that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, + and empty lists for values that will need to be accumulated from each example. + - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. + collate_fn will use this mapping to create batches from self.dataset Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. @@ -169,6 +180,7 @@ class InContextLearningDataset(Dataset): hf_parsing_map (Dict[str, List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. tokenize_labels (bool): Whether or not the labels should be tokenized. 
Used in metric calculation and for direct comparison + generation_kwargs (Dict): A dictionary containing any """ def __init__( @@ -189,7 +201,7 @@ def __init__( strip_dataset: bool = True, padding_side: str = 'right', padding_size: int = None, - default_batch: Dict = None, + base_batch: Dict = None, batch_mapping: Dict = None, hf_loading_vars: Dict = None, hf_parsing_map: Dict = None, @@ -212,7 +224,7 @@ def __init__( self.answer_key = answer_key self.tokenize_labels = tokenize_labels self.batch_mapping = batch_mapping or {} - self.default_batch = default_batch or {} + self.base_batch = base_batch or {} self._update_generation_kwargs(generation_kwargs or {}) hf_loading_vars = hf_loading_vars or {} @@ -243,17 +255,17 @@ def get_num_samples_in_batch(self, batch: Dict) -> int: def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: """ - Updates self.default_batch with the passed in generation_kwargs. - This must be run after self.default_batch is set (for example, if self.default_batch is set after __init__() is run, - likely because default_batch needs a class variable like self.pad_tok_id or self.max_answer_length). + Updates self.base_batch with the passed in generation_kwargs. + This must be run after self.base_batch is set (for example, if self.base_batch is set after __init__() is run, + likely because base_batch needs a class variable like self.pad_tok_id or self.max_answer_length). Args: """ - if 'generation_kwargs' not in self.default_batch: - self.default_batch['generation_kwargs'] = {} + if 'generation_kwargs' not in self.base_batch: + self.base_batch['generation_kwargs'] = {} if generation_kwargs: - self.default_batch['generation_kwargs'].update(generation_kwargs) + self.base_batch['generation_kwargs'].update(generation_kwargs) def _read_dataset(self, dataset_uri: str, @@ -360,7 +372,10 @@ def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> Returns: str: the answer in the example """ - return example[self.answer_key] + cont = example[self.answer_key] + if self.prefix_space and not cont.startswith(' ') and not in_context: + cont = f' {cont}' + return cont def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: """ @@ -470,7 +485,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ - batch = copy.deepcopy(self.default_batch) + batch = copy.deepcopy(self.base_batch) for data_pair in data: for batch_key, data_key in self.batch_mapping.items(): batch[batch_key].append(data_pair[data_key]) @@ -536,7 +551,7 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.max_answer_length = 0 super().__init__(padding_side='left', tokenize_labels=False, *args, **kwargs) # NOTE: set these after init call bcus they take class vars - self.default_batch = { + self.base_batch = { 'input_ids': [], 'mode': 'generate', 'labels': [], @@ -637,7 +652,7 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', - default_batch={ + base_batch={ 'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', @@ -651,13 +666,6 @@ def __init__(self, *args, **kwargs): *args, **kwargs) - def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> str: - cont = example[self.answer_key] - # Should this be in the base class? 
- if self.prefix_space and not cont.startswith(' ') and not in_context: - cont = f' {cont}' - return cont - # TODO: ensure tests class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): @@ -687,7 +695,7 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): def __init__(self, choices_key: str = 'choices', *args, **kwargs): self.choices_key = choices_key - default_batch = { + base_batch = { 'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', @@ -697,7 +705,7 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): } # TODO: is there something cleaner here? context_key = kwargs.pop('context_key', 'query') - super().__init__(context_key=context_key, default_batch=default_batch, padding_side='right', *args, **kwargs) + super().__init__(context_key=context_key, base_batch=base_batch, padding_side='right', *args, **kwargs) self.num_choices = len(self.dataset[0][self.choices_key]) self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'} self.batch_map_per_example = {'gold_indices': 'gold'} @@ -741,7 +749,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - tokenized_example[self.context_key] = [] tokenized_example[self.answer_key] = [] tokenized_example['continuation_indices'] = [] - # NOTE: Treating tokenize_labels as True for all MC datasets (required for our accuracy anyway) + # NOTE: Treating tokenize_labels as True for all MC datasets (required for our MC accuracy metric) for choice in example[self.choices_key]: if self.prefix_space: choice = f' {choice}' if not choice.startswith(' ') else choice @@ -776,7 +784,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict: dictionary for a single batch """ - batch = copy.deepcopy(self.default_batch) + batch = copy.deepcopy(self.base_batch) for data_pair in data: choice_start_idx = len(batch['continuation_indices']) # TODO: use batch_mappings? Could be fine as is @@ -865,7 +873,7 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData def __init__(self, choices_key='context_options', *args, **kwargs): super().__init__(choices_key=choices_key, context_key=choices_key, *args, **kwargs) - self.default_batch = { + self.base_batch = { 'input_ids': [], 'continuation_indices': [], 'mode': 'icl_task', @@ -1010,8 +1018,8 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - languages: list of languages - pass_at_k: passed value for pass_at_k - generation_length: derrived maximum generation length - - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten - by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + - generation_kwargs: Dictionary of kwargs neeeded for generation. 
Includes the following, which will be individually overwritten + by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig for more details): - pad_token_id: ID for padding token, derived automatically - num_beams: how many beams to search for generations, set to 1 @@ -1058,7 +1066,7 @@ def __init__( **kwargs, ) self.dataset = self.adjust_padding() - self.default_batch = { + self.base_batch = { 'input_ids': [], 'mode': 'generate', 'labels': [], @@ -1294,7 +1302,7 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo conda_package='datasets', conda_channel='conda-forge', ) from e - if 'hf://' in dataset_uri: + if dataset_uri.startswith('hf://'): dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) if hf_parsing_map: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index e4dc02bd85..8dfa4fa4d7 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -196,7 +196,7 @@ def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path): hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, generation_kwargs=gen_kwargs) - assert dl.default_batch['generation_kwargs'] == {'test_arg1': 1, 'test_arg2': 2} + assert dl.base_batch['generation_kwargs'] == {'test_arg1': 1, 'test_arg2': 2} def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): @@ -223,7 +223,7 @@ def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map) - assert not dl.default_batch['generation_kwargs'] + assert not dl.base_batch['generation_kwargs'] def test_construct_context(tiny_gpt2_tokenizer, tmp_path): @@ -604,10 +604,10 @@ def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): generation_kwargs=gen_kwargs, generations_per_sample=10, ) - assert dl.default_batch['generation_kwargs']['num_beams'] == 9000 - assert dl.default_batch['generation_kwargs']['top_p'] == .95 - assert dl.default_batch['generation_kwargs']['temperature'] == .9 - assert dl.default_batch['generation_kwargs']['do_sample'] == True + assert dl.base_batch['generation_kwargs']['num_beams'] == 9000 + assert dl.base_batch['generation_kwargs']['top_p'] == .95 + assert dl.base_batch['generation_kwargs']['temperature'] == .9 + assert dl.base_batch['generation_kwargs']['do_sample'] == True @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) From 8ea72be962092388cc29ed80b890d5d99f19162f Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 14 Dec 2023 21:52:19 +0000 Subject: [PATCH 080/116] update comments and fix test. move spacing to default get_answer --- .../in_context_learning_evaluation.py | 42 +++++++++++-------- .../test_in_context_learning_datasets.py | 2 +- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 5e191d70f5..93f1bf5135 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -145,17 +145,20 @@ class InContextLearningDataset(Dataset): A base dataset that constructs batches for in-context learning task evaluations. 
The input format is expected to be a jsonl file or a link to a Hugging Face dataset. - When creating a new ICL Dataset, the most likely to be reimplemented methods are the following: + When creating a new ICL Dataset, it is most likely that you will need to reimplemented methods are the following: - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. - _tokenize_example(): tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. - - _read_dataset(): loads the dataset and does basic parsing. If custom parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) + - _read_dataset(): loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) Additionally, base_batch and batch_mapping must be defined. - base_batch (Dict): the base that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, and empty lists for values that will need to be accumulated from each example. + NOTE: Sometimes, you will need to set base_batch directly after the init call, usually in order to use class variables + like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs at this time, you'll need to call self._update_generation_kwargs() again + after setting self.base_batch. - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. - collate_fn will use this mapping to create batches from self.dataset + collate_fn will use this mapping to create batches from self.dataset. Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. @@ -163,24 +166,28 @@ class InContextLearningDataset(Dataset): A local dataset must consist of rows of JSON data points with different fields based on the task. The default keys expected are "context" and "answer". tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids - max_seq_len (int): The maximum sequence length supported by the model - pad_tok_id (int): The special token reserved for padding batches - num_fewshot (int): The number of complete fewshot examples to prepend before each test example - fewshot_random_seed (int): Random seed to use for fewshot sampling - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n') - continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: ') - prelimiter (str): Text to be prepended before each example, including few shot examples - context_key (str): The key from the parsed dataset that the class will use as the "context" (i.e. the main content to be included in the prompt) - answer_key (str): The key from the parsed dataset that the class will use as the "answer" (i.e. the main content to be predicted by the model) - destination_path (str): Temporary path to store downloaded datasets + max_seq_len (int): The maximum sequence length supported by the model. 
+ pad_tok_id (int): The special token reserved for padding batches. + num_fewshot (int): The number of complete fewshot examples to prepend before each test example. + fewshot_random_seed (int): Random seed to use for fewshot sampling. + prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french'). + example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n'). + continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: '). + destination_path (str): Temporary path to store downloaded datasets. + prelimiter (str): Text to be prepended before each example, including few shot examples (e.g. "Question: "). + context_key (str): The key in the loaded dataset that contains the context. + answer_key (str): The key in the loaded dataset that contains the answer. strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs, so unless whitespace should be preserved (for example in code), this should be set to True. + padding_side (str): Side of the content and answer on which to apply padding. Can be either 'right' or 'left'. + padding_size (int): The final size of the tensor after padding. Defaults to max_sequence_length. + base_batch (Dict): The base dictionary upon which a batch is created. See above for more details. + base_mapping (Dict): A mapping of batch keys to dataset columns, used to create batches. See above for more details. hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict[str, List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. - Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. - tokenize_labels (bool): Whether or not the labels should be tokenized. Used in metric calculation and for direct comparison - generation_kwargs (Dict): A dictionary containing any + Columns will be concatenated with ' ' seperating them. If not included, will load whatever columns are already present in the HF dataset. + tokenize_labels (bool): Whether or not the labels should be tokenized. Generally determined by which metric a dataset uses. + generation_kwargs (Dict): A dictionary containing extra keyword arguments to be passed along to the model's generate function. """ def __init__( @@ -260,6 +267,7 @@ def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: likely because base_batch needs a class variable like self.pad_tok_id or self.max_answer_length). 
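The constraint this docstring describes, i.e. assign `self.base_batch` only after the parent `__init__` has populated attributes such as `self.pad_tok_id`, and then re-apply any user generation kwargs, looks roughly like the following in a subclass. This is a hypothetical subclass sketched for illustration; only the attribute and method names (`base_batch`, `pad_tok_id`, `_update_generation_kwargs`) come from this patch:

    from typing import Any, Dict

    from composer.datasets.in_context_learning_evaluation import InContextLearningDataset

    class MyGenerateTaskDataset(InContextLearningDataset):
        """Hypothetical subclass showing the 'set base_batch after init' pattern."""

        def __init__(self, *args: Any, **kwargs: Any):
            super().__init__(padding_side='left', tokenize_labels=False, *args, **kwargs)
            # base_batch needs self.pad_tok_id, which only exists once the parent
            # __init__ has run, so it is assigned afterwards.
            self.base_batch: Dict[str, Any] = {
                'input_ids': [],
                'labels': [],
                'mode': 'generate',
                'generation_kwargs': {
                    'pad_token_id': self.pad_tok_id,
                    'use_cache': True,
                },
            }
            # Re-apply any user-supplied generation kwargs on top of the new base_batch.
            self._update_generation_kwargs(kwargs.get('generation_kwargs') or {})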
Args: + dict: keyword arguments that be written into base_batch['generation_kwargs'] """ if 'generation_kwargs' not in self.base_batch: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 8dfa4fa4d7..c863999c08 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -288,7 +288,7 @@ def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map) answer = dl._get_answer_from_example({'context': 'wex exort exort', 'answer': 'alacrity'}) - assert answer == 'alacrity' + assert answer == ' alacrity' def test_fix_eos_on_preamble(tmp_path): From 85097ce84fda93ce7e206238bfd655bdf56df161 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sun, 17 Dec 2023 22:16:00 +0000 Subject: [PATCH 081/116] improved docstrings --- .../in_context_learning_evaluation.py | 73 ++++++++++--------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 93f1bf5135..60a29d6d9b 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -143,38 +143,42 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: i class InContextLearningDataset(Dataset): """ A base dataset that constructs batches for in-context learning task evaluations. - The input format is expected to be a jsonl file or a link to a Hugging Face dataset. + The dataset format is expected to be a local jsonl file, a cloud link to a jsonl file, or a Hugging Face dataset link. + 'context' refers to the input a model will recieve before generating an output. For example, the question in question answering tasks, + the preceding text in a language modeling task, or the document and question regarding the document in a document understanding task. + 'example' refers to an loaded dictionary, generally containing a context, an answer, and any other information needed to run the task. + 'answer' refers to the desired output of the model. - When creating a new ICL Dataset, it is most likely that you will need to reimplemented methods are the following: + When creating a new ICL Dataset, it is likely that you will need to reimplemente the following methods: - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. - _tokenize_example(): tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. - _read_dataset(): loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) Additionally, base_batch and batch_mapping must be defined. - - base_batch (Dict): the base that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, + - base_batch (Dict): the base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, and empty lists for values that will need to be accumulated from each example. 
- NOTE: Sometimes, you will need to set base_batch directly after the init call, usually in order to use class variables - like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs at this time, you'll need to call self._update_generation_kwargs() again + NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables + like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() after setting self.base_batch. - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. collate_fn will use this mapping to create batches from self.dataset. Args: - dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. - A local dataset must consist of rows of JSON data points with different fields based on the task. + A local dataset must consist of rows of JSON data points with task dependant fields. The default keys expected are "context" and "answer". - tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids + tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids. max_seq_len (int): The maximum sequence length supported by the model. - pad_tok_id (int): The special token reserved for padding batches. - num_fewshot (int): The number of complete fewshot examples to prepend before each test example. + pad_tok_id (int): The special token used for padding batches. + num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples. fewshot_random_seed (int): Random seed to use for fewshot sampling. - prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french'). - example_delimiter (str): Separator that goes between individual (context, answer) pairs (e.g. '\n'). - continuation_delimiter: (str): Separator that goes between context and answer in each example (e.g. '\nA: '). + prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.'). + example_delimiter (str): Separator inserted before (context, answer) pairs (e.g. '\n') for fewshot sampling and prompting. + continuation_delimiter: (str): Separator inserted between context and answer in each example (e.g. '\nA: '). destination_path (str): Temporary path to store downloaded datasets. - prelimiter (str): Text to be prepended before each example, including few shot examples (e.g. "Question: "). + prelimiter (str): Text to be prepended before each context, including few shot examples (e.g. "Question: "). context_key (str): The key in the loaded dataset that contains the context. answer_key (str): The key in the loaded dataset that contains the answer. strip_dataset (bool): Boolean for whether to strip whitespace from data. Trailing whitespace can cause degenerative outputs, @@ -184,10 +188,10 @@ class InContextLearningDataset(Dataset): base_batch (Dict): The base dictionary upon which a batch is created. See above for more details. 
base_mapping (Dict): A mapping of batch keys to dataset columns, used to create batches. See above for more details. hf_loading_vars (Dict): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. - hf_parsing_map (Dict[str, List[str]]): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. - Columns will be concatenated with ' ' seperating them. If not included, will load whatever columns are already present in the HF dataset. + hf_parsing_map (Dict): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset. tokenize_labels (bool): Whether or not the labels should be tokenized. Generally determined by which metric a dataset uses. - generation_kwargs (Dict): A dictionary containing extra keyword arguments to be passed along to the model's generate function. + generation_kwargs (Dict): A dictionary containing keyword arguments to be passed along to the model's generate function. """ def __init__( @@ -373,7 +377,7 @@ def _construct_context(self, example: Dict, preceding_text: str = '', add_answer def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> str: """ - Returns the answer from the example + Returns the answer from the example. Args: example (Dict): the example from which to retrieve the answer @@ -711,7 +715,6 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): 'gold_indices': [], 'choice_groupings': [], } - # TODO: is there something cleaner here? context_key = kwargs.pop('context_key', 'query') super().__init__(context_key=context_key, base_batch=base_batch, padding_side='right', *args, **kwargs) self.num_choices = len(self.dataset[0][self.choices_key]) @@ -1395,29 +1398,31 @@ def get_icl_task_dataloader( Args: icl_task_type (str): Name of icl_task type. One of ['multiple_choice', 'schema', 'language_modeling', 'question_answering', 'code_evaluation'] - dataset_uri (str): Either a local path, a remote path beginning with ``s3://``, or another backend - supported by :meth:`composer.utils.maybe_create_object_store_from_uri`, a link to a HuggingFace Dataset - tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to transform data into batches + dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``. + Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. + A local dataset must consist of rows of JSON data points with task dependant fields. + The default keys expected are "context" and "answer". + tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids. batch_size (int): Size of a batch used for eval - max_seq_len (int): The sequence length expected by the model - pad_tok_id (int): The special token reserved for padding the ends of batches - num_fewshot (int): The number of complete fewshot examples to pad each test example with - prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'translate english to french') - example_delimiter (str, default = '\n'): Separator that goes between individual examples (e.g. 
'\n') - continuation_delimiter: (str, default = ' '): Separator that goes between context and continuation in each example (e.g. '->') + max_seq_len (int): The maximum sequence length supported by the model. + pad_tok_id (int): The special token used for padding batches. + num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples. + prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.'). + example_delimiter (str, default = '\n'): Separator inserted before (context, answer) pairs (e.g. '\n') for fewshot sampling and prompting. + continuation_delimiter: (str, default = ' '): Separator inserted between context and answer in each example (e.g. '\nA: '). destination_path: (str, default = ''): This is the local file where remote datasets will be saved. - question_prelimiter: (str, default = ''): Text to be prepended before each context segement in each eval example. (e.g. 'Q:', 'The following is a paragraph containing...') + question_prelimiter: (str, default = ''): Text to be prepended before each context, including few shot examples (e.g. "Question: "). fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling - pass_at_k (int): k for how many chances the model gets to write passing code. generations_per_sample (int): how many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict. cot_delimiter (str): Delimiter to place between chain of thoughts and continuations. has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. - hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. - hf_parsing_map (Dict, default = None): A dictionary containing a from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. - Values in the dict will be concatenated with ' ' seperating them. If not included, will use the columns already present in the HF dataset. - generation_kwargs (dict): + hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. + Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset. + generation_kwargs (Dict, default = None): A dictionary containing keyword arguments to be passed along to the model's generate function. Overwrites any previously specified generation + keyword args in this fucntion (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + for more details) Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. 
From 4139fbc982bd3ff9acbe41de09db9363d7c5feda Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 18 Dec 2023 00:13:44 +0000 Subject: [PATCH 082/116] finish schema/mc tests --- .../in_context_learning_evaluation.py | 1 - .../test_in_context_learning_datasets.py | 113 +++++++++++++++++- 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 60a29d6d9b..1bbe216e60 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -860,7 +860,6 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] -# TODO: ensure tests class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset): """ A dataset that constructs batches for in-context learning schema evaluation. diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index c863999c08..9d4c1dfdc1 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -16,7 +16,7 @@ from composer import Evaluator from composer.core import DataSpec from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, - InContextLearningDataset, InContextLearningLMTaskDataset, + InContextLearningDataset, InContextLearningMultipleChoiceTaskDataset, InContextLearningSchemaTaskDataset, InContextLearningQATaskDataset, _get_continuation_span, _get_fewshot_sample_idxs, _make_padded_input, _tokenizer_needs_prefix_space, _trim_context, @@ -609,6 +609,117 @@ def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): assert dl.base_batch['generation_kwargs']['temperature'] == .9 assert dl.base_batch['generation_kwargs']['do_sample'] == True +def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path): + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/mmlu_small.jsonl' + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + seqlen = 2048 + dl = InContextLearningMultipleChoiceTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + continuation_delimiter=' ### ', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + ) + example = {"context":"Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ","choices":['A', 'B', 'C', 'D'],"gold":2} + tokenized_example = dl._tokenize_example(prompt_and_fewshot='Answer the following: ', ctxt=example['context'], example=example) + unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']] + untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries] + correct_output = [ + "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: A", + "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: B", + "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: C", + "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. 
Other\nAnswer: D" + ] + assert untokenized_inputs == correct_output + +def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path): + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/winograd_small.jsonl' + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + seqlen = 2048 + dl = InContextLearningSchemaTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string='', + example_delimiter='\n', + continuation_delimiter=' ### ', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + ) + example = {"context_options":["cont one", "cont two"],"gold":0, "continuation": "this is a continuation"} + constructed_context = dl._construct_context(example) + assert constructed_context == 'cont one ### this is a continuation' + constructed_context = dl._construct_context(example, preceding_text='text') + assert constructed_context == '\ncont one ### this is a continuation' + +def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path): + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/winograd_small.jsonl' + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + seqlen = 2048 + dl = InContextLearningSchemaTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + continuation_delimiter=' ### ', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + ) + example = {"context_options":["cont one", "cont two"],"gold":0, "continuation": "this is a continuation"} + constructed_contexts = dl._construct_multiple_contexts(example) + assert constructed_contexts == ["cont one", "cont two"] + constructed_contexts = dl._construct_multiple_contexts(example, preceding_text='some text') + assert constructed_contexts == ["\ncont one ###", "\ncont two ###"] + +def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path): + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + dataset_uri = f'{local_data}/winograd_small.jsonl' + tokenizer = tiny_gpt2_tokenizer + seqlen = 2048 + num_fewshot = 0 + prompt_string = '' + seqlen = 2048 + dl = InContextLearningSchemaTaskDataset( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + fewshot_random_seed=1, + prompt_string=prompt_string, + example_delimiter='\n', + continuation_delimiter=' ### ', + destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), + ) + example = {"context_options":["context one", "context two"],"gold":0, "continuation": "this is a continuation"} + tokenized_example = dl._tokenize_example(prompt_and_fewshot='prompt ', context_options=example['context_options'], example=example) + assert all([tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer']]) + unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']] + untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs] + assert untokenized_inputs == ['prompt context one this is a continuation', 'prompt context two this is a continuation'] + + @pytest.mark.parametrize('dataset_uri', 
['mmlu_small.jsonl']) def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_path): From eb382fa027d64825a4e3b7f12364301fba7de5f3 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 20 Dec 2023 20:11:13 +0000 Subject: [PATCH 083/116] address pr review comments --- .../in_context_learning_evaluation.py | 27 +- .../test_in_context_learning_datasets.py | 234 ++++++++++-------- 2 files changed, 146 insertions(+), 115 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 1bbe216e60..e40377cbd3 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -112,6 +112,20 @@ def _make_padded_input(context_enc: List, return inp +def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, Any]: + """ + HF Datasets converts tensors into lists when we store them, and we don't want to use `type='torch'` + because some content in the dataset, like generation args or single ints, should not be converted. + + Here, we convert those lists of tokens back into tensors in order to feed them into the model. + """ + batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids']))) + if tokenize_labels: + batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels']))) + batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices'])) + return batch + + def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> List[int]: """ Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset, @@ -479,15 +493,6 @@ def _prep_example( tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example - # TODO: Maybe make this not a class function. Also, could make our padding operations work on lists - def _convert_tokens_to_tensors(self, batch: Dict) -> Dict[str, Any]: - # zzzz HF converts ur torch tensors into lists so need to convert them back - batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids']))) - if self.tokenize_labels: - batch['labels'] = torch.stack(list(map(torch.tensor, batch['labels']))) - batch['continuation_indices'] = list(map(torch.tensor, batch['continuation_indices'])) - return batch - def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. 
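To make the accumulate-then-tensorize flow concrete, here is a standalone sketch of what `collate_fn` does with `base_batch`, `batch_mapping`, and the new module-level `convert_tokens_to_tensors` helper. The example rows, mapping, and pad id below are made up for illustration:

    import copy

    from composer.datasets.in_context_learning_evaluation import convert_tokens_to_tensors

    PAD = 0  # stand-in for pad_tok_id
    # Static values plus empty accumulators, as base_batch is described above.
    base_batch = {'input_ids': [], 'labels': [], 'mode': 'generate', 'generation_kwargs': {}}
    # Which per-example fields feed which batch keys (illustrative mapping).
    batch_mapping = {'input_ids': 'context', 'labels': 'answer'}

    # Two already-tokenized, already-padded examples.
    data = [
        {'context': [11, 12, 13, PAD], 'answer': 'first answer'},
        {'context': [21, 22, PAD, PAD], 'answer': 'second answer'},
    ]

    batch = copy.deepcopy(base_batch)
    for example in data:
        for batch_key, data_key in batch_mapping.items():
            batch[batch_key].append(example[data_key])

    # Token id lists become a stacked tensor; the string labels are left alone
    # because tokenize_labels=False here.
    batch = convert_tokens_to_tensors(batch, tokenize_labels=False)
    batch['attention_mask'] = ~(batch['input_ids'] == PAD)
    print(batch['input_ids'].shape, batch['attention_mask'])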
@@ -504,7 +509,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: if 'continuation_indices' in data_pair: batch['continuation_indices'].append(data_pair['continuation_indices']) - batch = self._convert_tokens_to_tensors(batch) + batch = convert_tokens_to_tensors(batch, self.tokenize_labels) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -808,7 +813,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: choice_end_idx = len(batch['continuation_indices']) batch['choice_groupings'].append((choice_start_idx, choice_end_idx)) - batch = self._convert_tokens_to_tensors(batch) + batch = convert_tokens_to_tensors(batch, self.tokenize_labels) batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 9d4c1dfdc1..5d5026a02d 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -16,11 +16,13 @@ from composer import Evaluator from composer.core import DataSpec from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, - InContextLearningDataset, InContextLearningMultipleChoiceTaskDataset, InContextLearningSchemaTaskDataset, - InContextLearningQATaskDataset, _get_continuation_span, - _get_fewshot_sample_idxs, _make_padded_input, - _tokenizer_needs_prefix_space, _trim_context, - get_icl_task_dataloader, strip_data) + InContextLearningDataset, + InContextLearningMultipleChoiceTaskDataset, + InContextLearningQATaskDataset, + InContextLearningSchemaTaskDataset, + _get_continuation_span, _get_fewshot_sample_idxs, + _make_padded_input, _tokenizer_needs_prefix_space, + _trim_context, get_icl_task_dataloader, strip_data) from composer.loggers import InMemoryLogger from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningMultipleChoiceAccuracy, InContextLearningQAAccuracy) @@ -609,6 +611,7 @@ def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): assert dl.base_batch['generation_kwargs']['temperature'] == .9 assert dl.base_batch['generation_kwargs']['do_sample'] == True + def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/mmlu_small.jsonl' @@ -629,8 +632,14 @@ def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path): continuation_delimiter=' ### ', destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), ) - example = {"context":"Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: ","choices":['A', 'B', 'C', 'D'],"gold":2} - tokenized_example = dl._tokenize_example(prompt_and_fewshot='Answer the following: ', ctxt=example['context'], example=example) + example = { + 'context': "Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. 
Other\nAnswer: ", + 'choices': ['A', 'B', 'C', 'D'], + 'gold': 2 + } + tokenized_example = dl._tokenize_example(prompt_and_fewshot='Answer the following: ', + ctxt=example['context'], + example=example) unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']] untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries] correct_output = [ @@ -638,9 +647,10 @@ def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path): "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: B", "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: C", "Answer the following: Who's the best eval researcher?\n A. Jeremy\n B. Tessa\n C. Max\n D. Other\nAnswer: D" - ] + ] assert untokenized_inputs == correct_output + def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/winograd_small.jsonl' @@ -660,12 +670,13 @@ def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path): continuation_delimiter=' ### ', destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), ) - example = {"context_options":["cont one", "cont two"],"gold":0, "continuation": "this is a continuation"} + example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'} constructed_context = dl._construct_context(example) assert constructed_context == 'cont one ### this is a continuation' constructed_context = dl._construct_context(example, preceding_text='text') assert constructed_context == '\ncont one ### this is a continuation' + def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/winograd_small.jsonl' @@ -686,11 +697,12 @@ def test_schema_construct_multiple_contexts(tiny_gpt2_tokenizer, tmp_path): continuation_delimiter=' ### ', destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), ) - example = {"context_options":["cont one", "cont two"],"gold":0, "continuation": "this is a continuation"} + example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'} constructed_contexts = dl._construct_multiple_contexts(example) - assert constructed_contexts == ["cont one", "cont two"] + assert constructed_contexts == ['cont one', 'cont two'] constructed_contexts = dl._construct_multiple_contexts(example, preceding_text='some text') - assert constructed_contexts == ["\ncont one ###", "\ncont two ###"] + assert constructed_contexts == ['\ncont one ###', '\ncont two ###'] + def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') @@ -712,13 +724,16 @@ def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path): continuation_delimiter=' ### ', destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), ) - example = {"context_options":["context one", "context two"],"gold":0, "continuation": "this is a continuation"} - tokenized_example = dl._tokenize_example(prompt_and_fewshot='prompt ', context_options=example['context_options'], example=example) - assert all([tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer']]) + example = {'context_options': ['context one', 'context two'], 'gold': 0, 'continuation': 'this is 
a continuation'} + tokenized_example = dl._tokenize_example(prompt_and_fewshot='prompt ', + context_options=example['context_options'], + example=example) + assert all(tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer']) unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']] untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs] - assert untokenized_inputs == ['prompt context one this is a continuation', 'prompt context two this is a continuation'] - + assert untokenized_inputs == [ + 'prompt context one this is a continuation', 'prompt context two this is a continuation' + ] @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) @@ -732,9 +747,9 @@ def test_mc_task_dataloader_subcategories(dataset_uri, tiny_gpt2_tokenizer, tmp_ batch_size = 8 seqlen = 2048 dls = get_icl_task_dataloader('multiple_choice', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=2, @@ -776,9 +791,9 @@ def test_lm_task_dataloader_extra_space(dataset_uri, tiny_gpt2_tokenizer, tmp_pa batch_size = 2 seqlen = 2048 dl = get_icl_task_dataloader('language_modeling', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=10, @@ -817,9 +832,9 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): batch_size = 2 seqlen = 2048 dl = get_icl_task_dataloader('language_modeling', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=0, @@ -855,9 +870,9 @@ def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): batch_size = 2 seqlen = 2048 dl = get_icl_task_dataloader('schema', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=1, @@ -900,9 +915,9 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path): batch_size = 2 seqlen = 2048 dl = get_icl_task_dataloader('schema', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=1, @@ -947,9 +962,9 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path): batch_size = 2 seqlen = 2048 dl = get_icl_task_dataloader('language_modeling', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -989,9 +1004,9 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path): batch_size = 4 seqlen = 2048 dl = get_icl_task_dataloader('multiple_choice', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1039,9 +1054,9 @@ def test_mc_split_batch(dataset_uri, num_fewshot, tmp_path): batch_size = 4 seqlen = 2048 dl = get_icl_task_dataloader('multiple_choice', - dataset_uri, - tokenizer, - batch_size, + 
dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1204,9 +1219,9 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, # empirical number from the small test dataset maximum_answer_length = 157 dl = get_icl_task_dataloader('question_answering', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1265,9 +1280,9 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): batch_size = 2 seqlen = 2048 dl = get_icl_task_dataloader('multiple_choice', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=1, @@ -1310,9 +1325,9 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'code_evaluation', - dataset_uri, - tokenizer, - 8, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=8, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=2, @@ -1380,9 +1395,9 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom seqlen = 2048 dl = get_icl_task_dataloader('code_evaluation', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1466,9 +1481,9 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): seqlen = 2048 dl = get_icl_task_dataloader('code_evaluation', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=0, @@ -1515,9 +1530,9 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): with pytest.raises(ValueError, match=r'.* pass_at_k .*'): get_icl_task_dataloader('code_evaluation', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=0, @@ -1545,9 +1560,9 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st seqlen = 2048 dl = get_icl_task_dataloader('code_evaluation', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1628,11 +1643,12 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer + batch_size = 2 dl = get_icl_task_dataloader( 'language_modeling', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=2048, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1668,11 +1684,12 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer + batch_size = 8 dl = get_icl_task_dataloader( 'schema', - dataset_uri, - tokenizer, - 8, 
+ dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1714,12 +1731,13 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer + batch_size = 8 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dls = get_icl_task_dataloader('multiple_choice', - dataset_uri, - tokenizer, - 8, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1767,14 +1785,15 @@ def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer + batch_size = 8 # seed because the fewshot selection is currently unseeded reproducibility.seed_all(1234) dl = get_icl_task_dataloader( 'multiple_choice', - dataset_uri, - tokenizer, - 8, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1814,14 +1833,15 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, num_fewshot, datas local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') + batch_size = 2 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'question_answering', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1856,14 +1876,15 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, num_fewsh local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') + batch_size = 2 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'question_answering', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1900,13 +1921,14 @@ def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer + batch_size = 2 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'question_answering', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -1943,13 +1965,14 @@ def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_ur local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer + batch_size = 2 
tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'question_answering', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=1024, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -2001,14 +2024,15 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, d local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') + batch_size = 2 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'code_evaluation', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=150, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -2052,13 +2076,14 @@ def test_code_eval_sentpiece_evaluation(monkeypatch, device, world_size, num_few local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_t5_tokenizer + batch_size = 2 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'code_evaluation', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=175, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -2100,13 +2125,14 @@ def test_code_eval_task_evaluation(monkeypatch, device, world_size, num_fewshot, local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer + batch_size = 2 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( 'code_evaluation', - dataset_uri, - tokenizer, - 2, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=150 if num_fewshot == 0 else 450, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, @@ -2144,9 +2170,9 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): batch_size = 1 seqlen = 2048 dl = get_icl_task_dataloader('language_modeling', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=1, @@ -2188,9 +2214,9 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path batch_size = 2 seqlen = 2048 dl = get_icl_task_dataloader('language_modeling', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=0, @@ -2241,9 +2267,9 @@ def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_pat maximum_answer_length = 4 dl = get_icl_task_dataloader('question_answering', - dataset_uri, - tokenizer, - batch_size, + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, max_seq_len=seqlen, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, From ea337d5c35561bc0c0371f749693dbcfd31e74eb Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 20 Dec 2023 22:40:43 +0000 Subject: [PATCH 084/116] lintign --- 
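The `split_batch` rework in this patch replaces type-based dispatch with explicit key lists (`static_keys`, `list_keys`, `tensor_keys`) declared by each dataset. Below is a minimal sketch of that idea, assuming a simplified batch; `_split_list` and `_default_split_batch` here are stand-in helpers written only for the sketch, not the exact implementations used by the library:

from typing import Any, Dict, List

import torch


def _split_list(vals: List[Any], microbatch_size: int) -> List[List[Any]]:
    # Chunk a Python list into consecutive sublists of length `microbatch_size`.
    return [vals[i:i + microbatch_size] for i in range(0, len(vals), microbatch_size)]


def _default_split_batch(tensor: torch.Tensor, microbatch_size: int) -> List[torch.Tensor]:
    # Chunk a tensor along its batch (first) dimension.
    return list(torch.split(tensor, microbatch_size))


def split_batch_by_keys(batch: Dict[str, Any], microbatch_size: int, static_keys: List[str],
                        list_keys: List[str], tensor_keys: List[str]) -> List[Dict[str, Any]]:
    # Chunk list- and tensor-valued entries; broadcast static entries to every chunk.
    chunked: Dict[str, Any] = {}
    for k, v in batch.items():
        if k in static_keys:
            continue  # broadcast after num_chunks is known
        elif k in list_keys:
            chunked[k] = _split_list(v, microbatch_size)
        elif k in tensor_keys:
            chunked[k] = _default_split_batch(v, microbatch_size)
        else:
            raise ValueError(f'Unexpected key {k} in batch splitting')
    num_chunks = len(chunked['input_ids'])
    for k in static_keys:
        if k in batch:
            chunked[k] = [batch[k]] * num_chunks
    return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)]


# A 4-example generation-style batch split into two microbatches of 2.
batch = {
    'input_ids': torch.zeros(4, 8, dtype=torch.long),  # tensor key: chunked along dim 0
    'labels': [['a'], ['b'], ['c'], ['d']],             # list key: chunked with _split_list
    'mode': 'generate',                                  # static key: copied into every chunk
    'generation_kwargs': {'num_beams': 1},               # static key: copied into every chunk
}
microbatches = split_batch_by_keys(batch, 2, static_keys=['mode', 'generation_kwargs'],
                                   list_keys=['labels'], tensor_keys=['input_ids'])
assert len(microbatches) == 2
assert microbatches[0]['input_ids'].shape[0] == 2 and microbatches[0]['mode'] == 'generate'

Declaring the keys up front lets each dataset state explicitly which values are broadcast to every microbatch and which are chunked, instead of inferring that from runtime types.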
.../in_context_learning_evaluation.py | 91 +++++++++++++++---- 1 file changed, 71 insertions(+), 20 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index e40377cbd3..bc47012fcc 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -220,6 +220,9 @@ def __init__( example_delimiter: str, continuation_delimiter: str, destination_path: str, + static_keys: List = None, + list_keys: List = None, + tensor_keys: List = None, prelimiter: str = '', context_key: str = 'context', answer_key: str = 'answer', @@ -233,7 +236,6 @@ def __init__( tokenize_labels: bool = True, generation_kwargs: Dict = None, ): - self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -252,6 +254,10 @@ def __init__( self.base_batch = base_batch or {} self._update_generation_kwargs(generation_kwargs or {}) + self.static_keys = static_keys + self.list_keys = list_keys + self.tensor_keys = tensor_keys + hf_loading_vars = hf_loading_vars or {} self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.strip_data = strip_dataset @@ -529,21 +535,22 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: # List split lists of strings chunked = {} for k, v in batch.items(): - if type(v) in [str, float, int, dict, bool]: + if k in self.static_keys: # Defer broadcasting until we know num_chunks pass - elif type(v) == list: + elif k in self.list_keys: chunked[k] = _split_list(v, microbatch_size) - elif type(v) == torch.Tensor: + elif k in self.tensor_keys: chunked[k] = _default_split_batch(v, microbatch_size) else: - raise ValueError(f'Unexpected value type {type(v)} with key {k}') + raise ValueError(f'Unexpected key {k} in batch splitting') num_chunks = len(chunked['input_ids']) for k, v in batch.items(): - if isinstance(v, (int, float, str, bool, dict)): + if k in self.static_keys: chunked[k] = [v] * num_chunks - return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] + batched_list = [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] + return batched_list class InContextLearningQATaskDataset(InContextLearningDataset): @@ -566,7 +573,16 @@ def __init__(self, cot_delimiter: str = '', *args, **kwargs): self.cot_delimiter = cot_delimiter self.has_cot = False self.max_answer_length = 0 - super().__init__(padding_side='left', tokenize_labels=False, *args, **kwargs) + static_keys = ['mode', 'cot_delimiter', 'generation_length', 'generation_kwargs'] + tensor_keys = ['input_ids', 'attention_mask'] + list_keys = ['labels'] + super().__init__(padding_side='left', + tokenize_labels=False, + static_keys=static_keys, + list_keys=list_keys, + tensor_keys=tensor_keys, + *args, + **kwargs) # NOTE: set these after init call bcus they take class vars self.base_batch = { 'input_ids': [], @@ -669,6 +685,8 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): def __init__(self, *args, **kwargs): super().__init__(answer_key='continuation', + static_keys=['mode'], + tensor_keys=['input_ids', 'continuation_indices', 'labels', 'attention_mask'], base_batch={ 'input_ids': [], 'continuation_indices': [], @@ -684,7 +702,6 @@ def __init__(self, *args, **kwargs): **kwargs) -# TODO: ensure tests class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): """ A dataset that construct batches for in-context learning multiple choice 
evaluation. @@ -710,7 +727,14 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): choices_key (str): the key under which the choices are stored in the saved dataset. Defaults to 'choices'. """ - def __init__(self, choices_key: str = 'choices', *args, **kwargs): + def __init__(self, + choices_key: str = 'choices', + static_keys: List = None, + list_of_tensors_keys: List = None, + list_of_tuples_keys: List = None, + list_of_primitives: List = None, + *args, + **kwargs): self.choices_key = choices_key base_batch = { 'input_ids': [], @@ -721,7 +745,18 @@ def __init__(self, choices_key: str = 'choices', *args, **kwargs): 'choice_groupings': [], } context_key = kwargs.pop('context_key', 'query') - super().__init__(context_key=context_key, base_batch=base_batch, padding_side='right', *args, **kwargs) + static_keys = kwargs.pop('static_keys', ['mode', 'generation_kwargs']) + tensor_keys = kwargs.pop('tensor_keys', ['input_ids', 'labels', 'attention_mask']) + self.list_of_tensors_keys = list_of_tensors_keys or ['continuation_indices'] + self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings'] + self.list_of_primitives = list_of_primitives or ['gold_indices'] + super().__init__(context_key=context_key, + base_batch=base_batch, + static_keys=static_keys, + tensor_keys=tensor_keys, + padding_side='right', + *args, + **kwargs) self.num_choices = len(self.dataset[0][self.choices_key]) self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'} self.batch_map_per_example = {'gold_indices': 'gold'} @@ -838,28 +873,29 @@ def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: """ chunked = {} for k, v in batch.items(): - if type(v) in [str, int, dict, bool]: + if k in self.static_keys: # Defer broadcasting primitives until we know num_chunks pass elif type(v) == list: - element_type = type(v[0]) # list of tensors - 'continuation_indices' - if element_type == torch.Tensor: + if k in self.list_of_tensors_keys: chunked[k] = _split_list(v, microbatch_size * self.num_choices) # list of tuples - 'choice_groupings' - elif element_type == tuple: + elif k in self.list_of_tuples_keys: chunked[k] = _split_list(v, microbatch_size) # list - 'gold_indices' - else: + elif k in self.list_of_primitives: chunked[k] = _default_split_batch(v, microbatch_size) - elif type(v) == torch.Tensor: + else: + raise ValueError(f'Unexpected key {k} in list splitting') + elif k in self.tensor_keys: chunked[k] = _default_split_batch(v, microbatch_size * self.num_choices) else: - raise ValueError(f'Unexpected value type {type(v)} with key {k}') + raise ValueError(f'Unexpected key {k} in batch splitting') num_chunks = len(chunked['input_ids']) # Broadcast primitives to all chunks for k, v in batch.items(): - if isinstance(v, (int, float, str, bool)): + if k in self.static_keys: chunked[k] = [v] * num_chunks return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] @@ -887,7 +923,16 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData """ def __init__(self, choices_key='context_options', *args, **kwargs): - super().__init__(choices_key=choices_key, context_key=choices_key, *args, **kwargs) + static_keys = ['mode'] + tensor_keys = ['input_ids', 'labels', 'attention_mask'] + list_of_tensors_keys = ['continuation_indices'] + super().__init__(choices_key=choices_key, + context_key=choices_key, + static_keys=static_keys, + tensor_keys=tensor_keys, + list_of_tensors_keys=list_of_tensors_keys, + *args, + **kwargs) 
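        # base_batch below holds the static values plus the empty per-example accumulators that
        # collate_fn fills in: 'input_ids' gets one row per context option, 'gold_indices' records
        # which option is correct for each example, and 'choice_groupings' records which rows belong
        # to the same example; split_batch above chunks these keys per-choice, per-example, and
        # per-primitive respectively.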
self.base_batch = { 'input_ids': [], 'continuation_indices': [], @@ -1070,10 +1115,16 @@ def __init__( } # Linting complains if this is not set in init self.max_prompt_length = 0 + static_keys = ['mode', 'pass_at_k', 'generation_length', 'generation_kwargs'] + list_keys = ['prompts', 'tests', 'entry_points', 'test_inputs', 'test_outputs', 'languages', 'labels'] + tensor_keys = ['input_ids', 'attention_mask'] super().__init__( context_key='prompt', answer_key='canonical_solution', strip_dataset=False, + static_keys=static_keys, + list_keys=list_keys, + tensor_keys=tensor_keys, tokenize_labels=False, padding_side='left', batch_mapping=batch_mapping, From 735e0eea9e4cab6d05e9371f5fc00d0212170401 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 26 Dec 2023 18:02:22 +0000 Subject: [PATCH 085/116] fixing import, add type --- .../in_context_learning_evaluation.py | 2 +- .../test_in_context_learning_datasets.py | 86 ++++++++++++++++++- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index bc47012fcc..418650f8c5 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -395,7 +395,7 @@ def _construct_context(self, example: Dict, preceding_text: str = '', add_answer ctxt = f'{ctxt}{self._get_answer_from_example(example, in_context=add_answer)}' return ctxt - def _get_answer_from_example(self, example: Dict[str, Any], in_context=False) -> str: + def _get_answer_from_example(self, example: Dict[str, Any], in_context: bool = False) -> str: """ Returns the answer from the example. Args: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 5d5026a02d..f983fc8bd9 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -9,9 +9,7 @@ import pytest import torch -import transformers from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoTokenizer from composer import Evaluator from composer.core import DataSpec @@ -47,6 +45,10 @@ def test_tokenizer_needs_prefix_space_when_space_not_needed(tiny_gpt2_tokenizer) def test_tokenizer_needs_prefix_space_when_space_needed(): + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) assert _tokenizer_needs_prefix_space(tokenizer) @@ -294,6 +296,10 @@ def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): def test_fix_eos_on_preamble(tmp_path): + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) seqlen = 2048 num_fewshot = 0 @@ -399,6 +405,10 @@ def test_qa_set_cot_no_cot(tmp_path): pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/triviaqa_small.jsonl' + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) @@ -422,6 +432,10 @@ def test_qa_set_cot_has_cot(tmp_path): pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = 
f'{local_data}/gsm8k_small.jsonl' + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) @@ -910,6 +924,10 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b', use_fast=False) dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 2 @@ -957,6 +975,10 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 2 @@ -998,6 +1020,10 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) dataset_uri = f'{local_data}/{dataset_uri}' @@ -1048,6 +1074,10 @@ def test_mc_split_batch(dataset_uri, num_fewshot, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) dataset_uri = f'{local_data}/{dataset_uri}' @@ -1107,6 +1137,10 @@ def test_qa_split_batch(dataset_uri, tmp_path): pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) @@ -1319,6 +1353,10 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b') tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) @@ -1389,6 +1427,10 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 9 @@ -1475,6 +1517,10 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 9 @@ -1523,6 
+1569,10 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 9 @@ -1554,6 +1604,10 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 9 @@ -1660,6 +1714,10 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy']) + try: + import transformers + except ImportError: + pytest.importorskip('transformers') config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M') model = transformers.AutoModelForCausalLM.from_config(config) model = HuggingFaceModel( @@ -1832,6 +1890,10 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, num_fewshot, datas in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') batch_size = 2 @@ -1852,6 +1914,10 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, num_fewshot, datas ) evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) + try: + from transformers import AutoModelForCausalLM + except ImportError: + pytest.importorskip('transformers') model = HuggingFaceModel( model=AutoModelForCausalLM.from_pretrained('facebook/opt-125m'), tokenizer=tokenizer, @@ -1875,6 +1941,10 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, num_fewsh in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') batch_size = 2 @@ -1896,6 +1966,10 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, num_fewsh ) evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) + try: + from transformers import AutoModelForCausalLM + except ImportError: + pytest.importorskip('transformers') model = HuggingFaceModel( model=AutoModelForCausalLM.from_pretrained('facebook/opt-125m'), tokenizer=tokenizer, @@ -2023,6 +2097,10 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, d in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') batch_size 
= 2 @@ -2047,6 +2125,10 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, d dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'], device_eval_microbatch_size=1) + try: + from transformers import AutoModelForCausalLM + except ImportError: + pytest.importorskip('transformers') model = HuggingFaceModel( model=AutoModelForCausalLM.from_pretrained('facebook/opt-125m'), tokenizer=tokenizer, From 350ce5bda30d79544e039f646601bf1cc4afce7a Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 17 Jan 2024 17:45:42 +0000 Subject: [PATCH 086/116] update comments --- .../in_context_learning_evaluation.py | 86 ++++++++++++++++--- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 418650f8c5..e7c12a7328 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -33,6 +33,15 @@ def strip_data(example: Dict) -> Dict: + """ + Remove white space from the begging and end of string values in a dictionary + + Args: + example: dictionary to be stripped + + Returns: + dict: the same dictionary with .strip() applied to any value in the dict that is a string + """ return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()} @@ -40,11 +49,30 @@ def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBas """ Test for whether a prefix space is needed before the continuation. Sentencepiece tokenization should not have a prefix space, but gpt2 style BPE should. + + Args: + tokenizer: Tokenizer to test + + Returns: + bool: whether or not the tokenizer needs a prefix space """ return len(tokenizer(' a', add_special_tokens=False)['input_ids']) == 1 def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -> List: + """ + Trims a list of tokens down to `max_seq_len` if the length of the list plus the continuation + is more than `max_seq_len`. It will always trim tokens from the left, i.e. tokens at the beginning + of the context will be removed. + + Args: + context_enc (list): list of tokens in the context + continuation_enc (lsit): list of tokens in the continuation + max_seq_len (int): maximum length the model can ingest + + Returns: + list: the encoded context trimmed from the left + """ if len(continuation_enc) + len(context_enc) > max_seq_len: context_max_subseq_len = max_seq_len - len(continuation_enc) @@ -58,6 +86,16 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) - def _get_continuation_span(context_enc: List, continuation_enc: List) -> list: + """ + Gets the list of indices of the continutaion tokens for language modeling or generaiton tasks. + + Args: + context_enc (list): list of context tokens + continuation_enc (list): list of continuation tokens + + Returns: + torch.tensor: a tensor containing indices corresponding to continuation tokens + """ return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc))) @@ -80,7 +118,6 @@ def _make_padded_input(context_enc: List, Returns: input (torch.tensor): the padded and encoded context continuation_span (torch.tensor): the _inclusive_ range of indices corresponding to the continuation - """ inp = torch.tensor( @@ -118,6 +155,13 @@ def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, A because some content in the dataset, like generation args or single ints, should not be converted. 
Here, we convert those lists of tokens back into tensors in order to feed them into the model. + + Args: + batch (dict): a dictionary of batched inputs + tokenize_labels (bool): whether or not the labels are tokenized (and need to be stacked) + + Returns: + dict: the batch with torch tensors in the corresponding keys instead of lists of lists """ batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids']))) if tokenize_labels: @@ -292,7 +336,6 @@ def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: Args: dict: keyword arguments that be written into base_batch['generation_kwargs'] - """ if 'generation_kwargs' not in self.base_batch: self.base_batch['generation_kwargs'] = {} @@ -306,6 +349,7 @@ def _read_dataset(self, hf_parsing_map: Dict = None) -> transformers.Dataset: """ Reads a dataset and handles parsing it from HuggingFace. + Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. @@ -353,6 +397,7 @@ def _generate_few_shot_prompt( contextes with answers appended. Returns the formatted prompt_string + concatenated list of formatted few shot examples as a string. + Args: num_fewshot (int): number of examples to prepend example_idx (int): current example idx @@ -398,6 +443,7 @@ def _construct_context(self, example: Dict, preceding_text: str = '', add_answer def _get_answer_from_example(self, example: Dict[str, Any], in_context: bool = False) -> str: """ Returns the answer from the example. + Args: example (Dict): the example from which to retrieve the answer @@ -415,6 +461,7 @@ def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer). If there is an EOS token added, we need to remove it so it is not in the middle of the prompt, as the specific eval question's prompt will follow the input_ids. + Args: input_ids (List): the tokenized input @@ -428,7 +475,8 @@ def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ - Runs text through the tokenizer and handles special cases. + Runs text through the tokenizer and handle special cases. + Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctxt (str): the specific example's derrived context @@ -502,6 +550,7 @@ def _prep_example( def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. + Args: data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) @@ -638,7 +687,7 @@ def _get_answer_from_example(self, example: Dict, in_context=False) -> str: def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ - Runs text through the tokenizer and handles special cases. + Runs text through the tokenizer and handle special cases. 
Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context @@ -776,7 +825,7 @@ def _get_answer_from_example(self, example: Dict, in_context=False) -> str: def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ - Runs text through the tokenizer and handles special cases. + Runs text through the tokenizer and handle special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context @@ -953,7 +1002,6 @@ def _construct_context(self, example, preceding_text: str = '', add_answer: bool Returns: str: the single correct context for a given continuation - """ context_options = example[self.choices_key] gold_idx = example['gold'] @@ -1016,7 +1064,8 @@ def _prep_example( def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]: """ - Runs text through the tokenizer and handles special cases. + Runs text through the tokenizer and handle special cases. + Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context @@ -1156,16 +1205,15 @@ def __init__( def adjust_padding(self): """ - Adjusts padding to the maximum prompt size rather than max_seq_len. - Needs to be done after the dataset has been processed because we can't get the prompt length - until after we've tokenized it. + Adjusts padding to the maximum prompt length rather than max_seq_len. + Needs to be done after the dataset has been processed because we don't know the maximum + prompt length until after we've tokenized it. Returns: - dataset: + dataset: a HuggingFace Dataset with different padding lengths for example[self.context_key] """ max_prompt_length = 0 for example in self.dataset: - # TODO: Will this elimanate tokens we want to keep? unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id] max_prompt_length = max( max_prompt_length, @@ -1173,10 +1221,10 @@ def adjust_padding(self): ) self.max_prompt_length = max_prompt_length - def _trim_padding(example): + def _trim_padding(example: Dict): # Remove padding tokens applied during tokenization unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] - # Pad only to max_promp_length + # Reapply padding only to max_prompt_length full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length) padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side) @@ -1224,6 +1272,16 @@ def build_icl_dataloader( generations_per_sample: int, generation_kwargs: Dict, ) -> DataSpec: + """ + Factory method that builds the specific dataset for the specified icl_task_type. + See documentation for `get_icl_task_dataloader` for arugment documentation. + + When writing a dataset for a new task, here you will need to: + 1. add the dataset to the factory and choose an appropriate string + 2. set the batch size for that task (see InContextLearningMultipleChoiceTaskDataset for why + this might be different) + 3. 
set the `split_batch` function if necessary
+    """
     if icl_task_type == 'multiple_choice':
         dataset = InContextLearningMultipleChoiceTaskDataset(
             dataset_uri=dataset_uri,

From 97b32f20bba3b808d116ed1be85c8190058e027e Mon Sep 17 00:00:00 2001
From: Max Marion
Date: Thu, 18 Jan 2024 19:07:16 +0000
Subject: [PATCH 087/116] update keys

---
 composer/datasets/in_context_learning_evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py
index 22d70bbb66..8c092a7973 100644
--- a/composer/datasets/in_context_learning_evaluation.py
+++ b/composer/datasets/in_context_learning_evaluation.py
@@ -632,7 +632,7 @@ def __init__(self,
         self.cot_delimiter = cot_delimiter
         self.has_cot = False
         self.max_answer_length = 0
-        static_keys = ['mode', 'cot_delimiter', 'generation_length', 'generation_kwargs']
+        static_keys = ['mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria']
         tensor_keys = ['input_ids', 'attention_mask']
         list_keys = ['labels']
         super().__init__(padding_side='left',

From 9b2fcde991ae41d231b184f3afb0cb62466a3956 Mon Sep 17 00:00:00 2001
From: Max Marion
Date: Thu, 18 Jan 2024 21:05:09 +0000
Subject: [PATCH 088/116] add typechecks for token ids

---
 composer/datasets/in_context_learning_evaluation.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py
index 8c092a7973..f65cd2544a 100644
--- a/composer/datasets/in_context_learning_evaluation.py
+++ b/composer/datasets/in_context_learning_evaluation.py
@@ -285,6 +285,8 @@ def __init__(
         self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer)

         self.max_seq_len = max_seq_len
+        if not isinstance(pad_tok_id, int):
+            raise ValueError(f'`InContextLearningDataset` requires `pad_tok_id` to be an integer. 
Found {pad_tok_id} instead') self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot self.padding_side = padding_side @@ -629,6 +631,8 @@ def __init__(self, early_stopping_criteria: Optional[List[str]] = None, do_normalization: bool = True, *args, **kwargs): + if kwargs['tokenizer'].eos_token_id is None: + raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`') self.cot_delimiter = cot_delimiter self.has_cot = False self.max_answer_length = 0 From ca4f7b805a92e4db6d0e4ddcbbf8e22efa44e188 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 18 Jan 2024 21:30:16 +0000 Subject: [PATCH 089/116] rm outdated test --- tests/datasets/test_in_context_learning_datasets.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index a5be08dd11..4b05b61422 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -220,14 +220,6 @@ def test_stop_sequences_criteria(tiny_gpt2_tokenizer): assert eos_criteria(input_ids, None) -def test_batch_padding_logic(tiny_gpt2_tokenizer): - continuation = tiny_gpt2_tokenizer(' dog' * 2000)['input_ids'] - context = tiny_gpt2_tokenizer(' cat' * 2000)['input_ids'] - _, continuation_spans = _make_padded_input(context, continuation, 2048, tiny_gpt2_tokenizer.eos_token_id) - # the context (of len 2000) gets clipped to len 48 so that the whole continuation can fit - assert continuation_spans[0] == 48 and continuation_spans[-1] == 2047 - - def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 From 9628d192d78a1c206e488ca348364c6d6074b03c Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 18 Jan 2024 22:41:36 +0000 Subject: [PATCH 090/116] fix tests --- tests/datasets/test_in_context_learning_datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 4b05b61422..603db1f5cb 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -493,7 +493,7 @@ def test_qa_get_max_answer_length(tiny_gpt2_tokenizer, tmp_path): destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), ) # empirical number from the small test dataset - assert dl.max_answer_length == 9 + assert dl.max_answer_length == 7 def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer): @@ -1875,7 +1875,7 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer tokenizer = tiny_opt_tokenizer # TODO: check this - batch_size = 10 + batch_size = 4 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( @@ -1925,6 +1925,7 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_ dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_opt_tokenizer + batch_size = 4 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) dl = get_icl_task_dataloader( @@ -2082,7 +2083,7 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_token local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_opt_tokenizer - batch_size = 8 
+ batch_size = 4 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) From e0c011752a76651acca5c2553647b7146d9b756b Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 19 Jan 2024 00:22:52 +0000 Subject: [PATCH 091/116] add microbatch test --- composer/core/data_spec.py | 2 +- .../in_context_learning_evaluation.py | 44 +++++------ .../test_in_context_learning_datasets.py | 76 +++++++++++++------ 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/composer/core/data_spec.py b/composer/core/data_spec.py index 2255ab991e..e84a42aec6 100644 --- a/composer/core/data_spec.py +++ b/composer/core/data_spec.py @@ -58,7 +58,7 @@ def _split_mapping(m, microbatch_size: int): num_chunks = len(list(chunked.values())[0]) # Broadcast primitives to all chunks for k, v in m.items(): - if isinstance(v, (int, float, str, bool, dict)): + if isinstance(v, (int, float, str, bool)): chunked[k] = [v] * num_chunks return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)] diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index f65cd2544a..12fd99fec3 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1212,7 +1212,8 @@ def __init__( *args, **kwargs, ) - self.dataset = self.adjust_padding() + self._set_max_prompt_and_answer_lengths() + self.dataset = self.dataset.map(self._trim_padding) self.base_batch = { 'input_ids': [], 'mode': 'generate', @@ -1224,32 +1225,22 @@ def __init__( 'test_outputs': [], 'languages': [], 'pass_at_k': pass_at_k, - # 'generation_length': self.max_seq_len - self.max_prompt_length, 'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length), 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, 'num_beams': 1, # single beam 'num_return_sequences': generations_per_sample, 'do_sample': True, - # # TODO: remove top_p and top_k and suggest using generation kwargs? - # 'top_p': self.top_p, - # 'top_k': self.top_k, 'use_cache': True, 'eos_token_id': self.tokenizer.eos_token_id } } self._update_generation_kwargs(kwargs.get('generation_kwargs')) - def adjust_padding(self): + def _set_max_prompt_and_answer_lengths(self): """ - Adjusts padding to the maximum prompt length rather than max_seq_len. - Needs to be done after the dataset has been processed because we don't know the maximum - prompt length until after we've tokenized it. - - Returns: - dataset: a HuggingFace Dataset with different padding lengths for example[self.context_key] + Iterates through the dataset and finds the maximum prompt length and sequence lengths """ - # TODO: maybe don't put this here max_prompt_length = 0 max_answer_length = 0 for example in self.dataset: @@ -1265,18 +1256,25 @@ def adjust_padding(self): self.max_prompt_length = max_prompt_length self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH - def _trim_padding(example: Dict): - # Remove padding tokens applied during tokenization - unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] - # Reapply padding only to max_prompt_length - full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length) - padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, - self.padding_side) + def _trim_padding(self, example: Dict): + """ + Adjusts padding to the maximum prompt length rather than max_seq_len. 
+ Needs to be done after the dataset has been processed because we don't know the maximum + prompt length until after we've tokenized it. + + Returns: + dataset: a HuggingFace Dataset with different padding lengths for example[self.context_key] + """ + # Remove padding tokens applied during tokenization + unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] + # Reapply padding only to max_prompt_length + full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length) + padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, + self.padding_side) - example[self.context_key] = padded_context - return example + example[self.context_key] = padded_context + return example - return self.dataset.map(_trim_padding) def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 603db1f5cb..b39be98712 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -10,7 +10,6 @@ import pytest import torch from torch.utils.data import DataLoader -from transformers import AutoTokenizer from composer import Evaluator from composer.core import DataSpec @@ -411,7 +410,6 @@ def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer, tmp_path) tokenized_example = dl._tokenize_example('What spell does this invoke? ', 'exort exort wex\nSpell: ', {'answer': ' Meatball'}) tokenized_input = [2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198, 31221, 25] - # import IPython; IPython.embed() assert tokenized_example['context'][:len(tokenized_input)].tolist() == tokenized_input assert tokenized_example['context'][-1] == tokenizer.eos_token_id assert len(tokenized_example['context']) == seqlen @@ -1628,7 +1626,8 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st continuation_delimiter='', question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), - generations_per_sample=generations_per_sample) + generations_per_sample=generations_per_sample, + generation_kwargs={"temperature": .9, "top_k": 40}) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1670,6 +1669,57 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" ) +@pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) +@pytest.mark.parametrize('num_fewshot', [0, 1]) +def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path): + pytest.importorskip('datasets') + + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + try: + from transformers import AutoTokenizer + except ImportError: + pytest.importorskip('transformers') + tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') + dataset_uri = f'{local_data}/{dataset_uri}' + batch_size = 4 + seqlen = 512 + + dl = get_icl_task_dataloader('code_evaluation', + dataset_uri=dataset_uri, + tokenizer=tokenizer, + batch_size=batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string='', + example_delimiter='\n', + continuation_delimiter='', + question_prelimiter='Code start: \n', + destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + generations_per_sample=1, + generation_kwargs={"temperature": .9, "top_k": 40}) + assert isinstance(dl, DataSpec) + assert isinstance(dl.dataloader, DataLoader) # pyright + batch = next(dl.dataloader._get_iterator()) + microbatch_size = 1 + microbatches = dl.split_batch(batch, microbatch_size) + assert len(microbatches) == 4 + for microbatch in microbatches: + assert dl.get_num_samples_in_batch(microbatch) == 1 + assert 'input_ids' in microbatch + # TODO: what should this be? + # assert tuple(microbatch['input_ids'].shape) == (microbatch_size, seqlen) + assert 'attention_mask' in microbatch + # assert tuple(microbatch['attention_mask'].shape) == (microbatch_size, seqlen) + assert isinstance(microbatch['generation_kwargs'], dict) + assert microbatch['generation_kwargs']['temperature'] == .9 + assert microbatch['generation_kwargs']['top_k'] == 40 + assert microbatch['generation_kwargs']['pad_token_id'] == 0 + assert microbatch['generation_kwargs']['num_beams'] == 1 + assert microbatch['generation_kwargs']['num_return_sequences'] == 1 + assert microbatch['generation_kwargs']['do_sample'] == True + assert microbatch['generation_kwargs']['use_cache'] == True + assert microbatch['generation_kwargs']['eos_token_id'] == 0 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) @@ -1865,7 +1915,6 @@ def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') @device('gpu') @world_size(1, 2) -# @pytest.mark.parametrize('num_fewshot', [0, 5]) def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot, dataset_uri, tmp_path): pytest.importorskip('datasets') @@ -1874,7 +1923,6 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_opt_tokenizer - # TODO: check this batch_size = 4 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) @@ -1893,10 +1941,6 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer ) evaluator = Evaluator(label='triviaqa', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) - # try: - # from transformers import AutoModelForCausalLM - # except ImportError: - # pytest.importorskip('transformers') model = 
HuggingFaceModel( model=tiny_opt_model, tokenizer=tokenizer, @@ -1915,7 +1959,6 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) @device('gpu') @world_size(1, 2) -# @pytest.mark.parametrize('num_fewshot', [5]) @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot, dataset_uri, tmp_path): @@ -1944,10 +1987,6 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_ ) evaluator = Evaluator(label='gsm8k', dataloader=dl, metric_names=['InContextLearningQAAccuracy']) - # try: - # from transformers import AutoModelForCausalLM - # except ImportError: - # pytest.importorskip('transformers') model = HuggingFaceModel( model=tiny_opt_model, tokenizer=tokenizer, @@ -1966,8 +2005,6 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_ @pytest.mark.parametrize('num_fewshot', [0, 5]) @device('gpu') @world_size(1, 2) -# @pytest.mark.parametrize('num_fewshot', [0, 5]) -# @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') @@ -2013,7 +2050,6 @@ def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') @device('gpu') @world_size(1, 2) -# @pytest.mark.parametrize('num_fewshot', [5]) def test_qa_task_with_cot_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') @@ -2073,8 +2109,6 @@ def test_code_eval_requires_valid_envvar(monkeypatch): @device('gpu') @world_size(1, 2) @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') -# def test_code_eval_microbatching(monkeypatch, device, world_size, num_fewshot, dataset_uri, tmp_path, -# generations_per_sample): def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot, dataset_uri, tmp_path, generations_per_sample): pytest.importorskip('datasets') @@ -2106,10 +2140,6 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_token dataloader=dl, metric_names=['InContextLearningCodeEvalAccuracy'], device_eval_microbatch_size=1) - try: - from transformers import AutoModelForCausalLM - except ImportError: - pytest.importorskip('transformers') model = HuggingFaceModel( model=tiny_opt_model, tokenizer=tokenizer, From cc0149356227c4dc116d2d326b3f45807ea2b7ab Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 22 Jan 2024 23:13:44 +0000 Subject: [PATCH 092/116] pyright fixes --- .../in_context_learning_evaluation.py | 167 +++++++++--------- .../test_in_context_learning_datasets.py | 41 +++-- 2 files changed, 113 insertions(+), 95 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 12fd99fec3..c7c5d0438d 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -8,7 +8,7 @@ import json import os import random -from typing import TYPE_CHECKING, Any, Dict, List, 
Tuple, Union, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union import torch from torch.utils.data import DataLoader, Dataset @@ -20,6 +20,7 @@ if TYPE_CHECKING: import transformers + from datasets import Dataset # Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset _MAX_ANSWER_BUFFER_LENGTH = 10 @@ -86,7 +87,7 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) - return context_enc -def _get_continuation_span(context_enc: List, continuation_enc: List) -> list: +def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.Tensor: """ Gets the list of indices of the continutaion tokens for language modeling or generaiton tasks. @@ -104,7 +105,7 @@ def _make_padded_input(context_enc: List, continuation_enc: List, max_seq_len: int, pad_tok_id: int, - padding_side: str = 'right') -> Tuple[torch.tensor, torch.tensor]: + padding_side: str = 'right') -> torch.Tensor: """ Takes an encoded context and continuation and clips the beginning of the context if they're too long. Adds the padding token to the specified side. @@ -171,7 +172,7 @@ def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, A return batch -def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> List[int]: +def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: int, rng: random.Random) -> Set[int]: """ Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset, then we will have fewer than num_fewshot examples in context. @@ -204,7 +205,7 @@ class InContextLearningDataset(Dataset): A base dataset that constructs batches for in-context learning task evaluations. The dataset format is expected to be a local jsonl file, a cloud link to a jsonl file, or a Hugging Face dataset link. 'context' refers to the input a model will recieve before generating an output. For example, the question in question answering tasks, - the preceding text in a language modeling task, or the document and question regarding the document in a document understanding task. + the preceding text in a language modeling task, or the document and question regarding the document in a document understanding task. 'example' refers to an loaded dictionary, generally containing a context, an answer, and any other information needed to run the task. 'answer' refers to the desired output of the model. @@ -216,10 +217,10 @@ class InContextLearningDataset(Dataset): Additionally, base_batch and batch_mapping must be defined. - base_batch (Dict): the base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, - and empty lists for values that will need to be accumulated from each example. - NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables - like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() - after setting self.base_batch. + and empty lists for values that will need to be accumulated from each example. + NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables + like self.pad_tok_id or self.max_answer_length. 
If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() + after setting self.base_batch. - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. collate_fn will use this mapping to create batches from self.dataset. @@ -265,21 +266,21 @@ def __init__( example_delimiter: str, continuation_delimiter: str, destination_path: str, - static_keys: List = None, - list_keys: List = None, - tensor_keys: List = None, prelimiter: str = '', context_key: str = 'context', answer_key: str = 'answer', strip_dataset: bool = True, padding_side: str = 'right', - padding_size: int = None, - base_batch: Dict = None, - batch_mapping: Dict = None, - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None, - tokenize_labels: bool = True, - generation_kwargs: Dict = None, + static_keys: Optional[List] = None, + list_keys: Optional[List] = None, + tensor_keys: Optional[List] = None, + padding_size: Optional[int] = None, + base_batch: Optional[Dict] = None, + batch_mapping: Optional[Dict] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + tokenize_labels: Optional[bool] = True, + generation_kwargs: Optional[Dict] = None, ): self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -348,8 +349,8 @@ def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: def _read_dataset(self, dataset_uri: str, destination_path: str, - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None) -> transformers.Dataset: + hf_loading_vars: Optional[Dict[str, Any]] = None, + hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'Dataset': """ Reads a dataset and handles parsing it from HuggingFace. @@ -375,6 +376,8 @@ def _read_dataset(self, dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) if hf_parsing_map: + # assert statement only for type checking + assert hf_parsing_map is not None, f'hf_parsing_map to be utilized but recieved object {hf_parsing_map}' dataset_parsing_func = lambda example: { k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() } @@ -620,23 +623,27 @@ class InContextLearningQATaskDataset(InContextLearningDataset): Additional Args: cot_delimiter (str): Delimiter to place between the chain of thought and continuations. 
""" - # init: - # early_stopping_criteria: Optional[List[str]] = None, - # do_normalization: bool = True): - # self.early_stopping_criteria = early_stopping_criteria - # self.do_normalization = do_normalization - - def __init__(self, - cot_delimiter: str = '', + + # init: + # early_stopping_criteria: Optional[List[str]] = None, + # do_normalization: bool = True): + # self.early_stopping_criteria = early_stopping_criteria + # self.do_normalization = do_normalization + + def __init__(self, + cot_delimiter: str = '', early_stopping_criteria: Optional[List[str]] = None, do_normalization: bool = True, - *args, **kwargs): + *args, + **kwargs): if kwargs['tokenizer'].eos_token_id is None: raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`') self.cot_delimiter = cot_delimiter self.has_cot = False self.max_answer_length = 0 - static_keys = ['mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria'] + static_keys = [ + 'mode', 'cot_delimiter', 'generation_length', 'generation_kwargs', 'do_normalization', 'stopping_criteria' + ] tensor_keys = ['input_ids', 'attention_mask'] list_keys = ['labels'] super().__init__(padding_side='left', @@ -1159,8 +1166,8 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - pass_at_k: passed value for pass_at_k - generation_length: derrived maximum generation length - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten - by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig - for more details): + by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + for more details): - pad_token_id: ID for padding token, derived automatically - num_beams: how many beams to search for generations, set to 1 - num_return_sequences: value passed for 'generations_per_sample', how many generations per prompt @@ -1240,6 +1247,9 @@ def __init__( def _set_max_prompt_and_answer_lengths(self): """ Iterates through the dataset and finds the maximum prompt length and sequence lengths + + Returns: + None """ max_prompt_length = 0 max_answer_length = 0 @@ -1269,13 +1279,11 @@ def _trim_padding(self, example: Dict): unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] # Reapply padding only to max_prompt_length full_prompt = _trim_context(unpadded_prompt, [], self.max_prompt_length) - padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, - self.padding_side) + padded_context = _make_padded_input(full_prompt, [], self.max_prompt_length, self.pad_tok_id, self.padding_side) example[self.context_key] = padded_context return example - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Adds extra code task details to the example dictionary. @@ -1294,27 +1302,27 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - def build_icl_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. 
'' - hf_loading_vars: Dict, - hf_parsing_map: Dict, - destination_path: str, - prelimiter: str, # e.g. 'Question: ' - cot_delimiter: str, - fewshot_random_seed: int, - pass_at_k: int, - generations_per_sample: int, - generation_kwargs: Dict, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True) -> DataSpec: + icl_task_type: str, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + continuation_delimiter: str, # e.g. '' + hf_loading_vars: Dict, + hf_parsing_map: Dict, + destination_path: str, + prelimiter: str, # e.g. 'Question: ' + cot_delimiter: str, + fewshot_random_seed: int, + pass_at_k: int, + generations_per_sample: int, + generation_kwargs: Dict, + early_stopping_criteria: Optional[List[str]] = None, + do_normalization: bool = True) -> DataSpec: """ Factory method that builds the specific dataset for the specified icl_task_type. See documentation for `get_icl_task_dataloader` for arugment documentation. @@ -1505,29 +1513,28 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo def get_icl_task_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: transformers.PreTrainedTokenizerBase, - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str = '', # e.g. 'translate english to french:' - example_delimiter: str = '\n', # e.g. '\n' - continuation_delimiter: str = ' ', - destination_path: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 20, - cot_delimiter: str = '', - has_categories: bool = False, - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None, - generation_kwargs: Dict = None, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True -) -> Union[DataSpec, Dict[str, DataSpec]]: + icl_task_type: str, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str = '', # e.g. 'translate english to french:' + example_delimiter: str = '\n', # e.g. '\n' + continuation_delimiter: str = ' ', + destination_path: str = '', + question_prelimiter: str = '', # e.g. 'Question: ' + fewshot_random_seed: int = 1234, + pass_at_k: int = 1, + generations_per_sample: int = 20, + cot_delimiter: str = '', + has_categories: bool = False, + hf_loading_vars: Dict = None, + hf_parsing_map: Dict = None, + generation_kwargs: Dict = None, + early_stopping_criteria: Optional[List[str]] = None, + do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]: """ This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. 
An example usage is below: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index b39be98712..272200d0f4 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -28,7 +28,6 @@ from composer.models import HuggingFaceModel from composer.trainer import Trainer from composer.utils import dist, reproducibility - from tests.common import device, world_size @@ -51,7 +50,7 @@ def test_tokenizer_needs_prefix_space_when_space_needed(): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) # type: ignore reportUnboundVariable assert _tokenizer_needs_prefix_space(tokenizer) @@ -203,6 +202,8 @@ def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path): hf_parsing_map=hf_parsing_map, generation_kwargs=gen_kwargs) assert dl.base_batch['generation_kwargs'] == {'test_arg1': 1, 'test_arg2': 2} + + def test_stop_sequences_criteria(tiny_gpt2_tokenizer): pytest.importorskip('transformers') eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2) @@ -316,7 +317,7 @@ def test_fix_eos_on_preamble(tmp_path): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) # type: ignore reportUnboundVariable seqlen = 2048 num_fewshot = 0 prompt_string = '' @@ -424,7 +425,7 @@ def test_qa_set_cot_no_cot(tmp_path): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) @@ -451,7 +452,7 @@ def test_qa_set_cot_has_cot(tmp_path): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') + tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) @@ -943,7 +944,9 @@ def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b', use_fast=False) + tokenizer = AutoTokenizer.from_pretrained( + 'huggyllama/llama-7b', # type: ignore reportUnboundVariable + use_fast=False) dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 2 seqlen = 64 @@ -1378,7 +1381,7 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b') + tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b') # type: ignore reportUnboundVariable tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) @@ -1452,7 +1455,7 @@ def 
test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') + tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 2048 @@ -1522,7 +1525,7 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') + tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 512 @@ -1574,7 +1577,7 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') + tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 2 seqlen = 64 @@ -1609,7 +1612,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') + tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 2048 @@ -1627,7 +1630,10 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=generations_per_sample, - generation_kwargs={"temperature": .9, "top_k": 40}) + generation_kwargs={ + 'temperature': .9, + 'top_k': 40 + }) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1669,6 +1675,7 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st "Code start: \nfrom typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n \"\"\" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. 
Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n \"\"\"\n" ) + @pytest.mark.parametrize('dataset_uri', ['human_eval_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 1]) def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path): @@ -1679,7 +1686,7 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path from transformers import AutoTokenizer except ImportError: pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') + tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 512 @@ -1697,7 +1704,10 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path question_prelimiter='Code start: \n', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), generations_per_sample=1, - generation_kwargs={"temperature": .9, "top_k": 40}) + generation_kwargs={ + 'temperature': .9, + 'top_k': 40 + }) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright batch = next(dl.dataloader._get_iterator()) @@ -1721,6 +1731,7 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path assert microbatch['generation_kwargs']['use_cache'] == True assert microbatch['generation_kwargs']['eos_token_id'] == 0 + @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) @device('gpu') @@ -2117,7 +2128,7 @@ def test_code_eval_microbatching(monkeypatch, device, world_size, tiny_opt_token local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_opt_tokenizer - batch_size = 4 + batch_size = 4 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) From 174a1d680619a51b6a4c27c32c11dbb462e995e2 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 23 Jan 2024 01:14:42 +0000 Subject: [PATCH 093/116] linting attempts --- composer/datasets/in_context_learning_evaluation.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index c7c5d0438d..318cabf2f1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -624,12 +624,6 @@ class InContextLearningQATaskDataset(InContextLearningDataset): cot_delimiter (str): Delimiter to place between the chain of thought and continuations. """ - # init: - # early_stopping_criteria: Optional[List[str]] = None, - # do_normalization: bool = True): - # self.early_stopping_criteria = early_stopping_criteria - # self.do_normalization = do_normalization - def __init__(self, cot_delimiter: str = '', early_stopping_criteria: Optional[List[str]] = None, @@ -1492,6 +1486,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) + assert hasattr(dataset, + 'features'), f"'features' not found in loaded dataset. Did you parse the HF Dataset correctly?" 
if 'category' not in dataset.features.keys(): raise Exception( f"Attempted to partition dataset by `category` but it doesn't have a `category` key. Got keys: {str(list(dataset.features.keys()))}" From a894be20b99306dfcf2916526dd9a294b07d0510 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 23 Jan 2024 23:27:21 +0000 Subject: [PATCH 094/116] linting wip --- .../in_context_learning_evaluation.py | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 318cabf2f1..7fe38acac1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -271,6 +271,7 @@ def __init__( answer_key: str = 'answer', strip_dataset: bool = True, padding_side: str = 'right', + tokenize_labels: bool = True, static_keys: Optional[List] = None, list_keys: Optional[List] = None, tensor_keys: Optional[List] = None, @@ -279,7 +280,6 @@ def __init__( batch_mapping: Optional[Dict] = None, hf_loading_vars: Optional[Dict] = None, hf_parsing_map: Optional[Dict] = None, - tokenize_labels: Optional[bool] = True, generation_kwargs: Optional[Dict] = None, ): self.tokenizer = tokenizer @@ -374,10 +374,12 @@ def _read_dataset(self, ) from e if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') + if hf_loading_vars is None: + hf_loading_vars = {} + # TODO: need to ensure split is defined here? dataset = load_dataset(dataset_uri, **hf_loading_vars) - if hf_parsing_map: + if hf_parsing_map is not None: # assert statement only for type checking - assert hf_parsing_map is not None, f'hf_parsing_map to be utilized but recieved object {hf_parsing_map}' dataset_parsing_func = lambda example: { k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() } @@ -667,7 +669,7 @@ def __init__(self, 'input_ids': self.context_key, 'labels': 'aliases', } - self._update_generation_kwargs(kwargs.get('generation_kwargs')) + self._update_generation_kwargs(kwargs.get('generation_kwargs', {})) def _read_dataset( self, @@ -810,10 +812,10 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): def __init__(self, choices_key: str = 'choices', - static_keys: List = None, - list_of_tensors_keys: List = None, - list_of_tuples_keys: List = None, - list_of_primitives: List = None, + static_keys: Optional[List] = None, + list_of_tensors_keys: Optional[List] = None, + list_of_tuples_keys: Optional[List] = None, + list_of_primitives: Optional[List] = None, *args, **kwargs): self.choices_key = choices_key @@ -1001,6 +1003,7 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData - labels: Identical to the input, used by the model to calculate loss/metrics - gold_indices: List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) - choice_groupings: Indicates which indices of the batch correspond to which questions + """ def __init__(self, choices_key='context_options', *args, **kwargs): @@ -1236,7 +1239,7 @@ def __init__( 'eos_token_id': self.tokenizer.eos_token_id } } - self._update_generation_kwargs(kwargs.get('generation_kwargs')) + self._update_generation_kwargs(kwargs.get('generation_kwargs', {})) def _set_max_prompt_and_answer_lengths(self): """ @@ -1486,12 +1489,12 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo if dist.get_local_rank() == 0: get_file(dataset_uri, 
destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - assert hasattr(dataset, - 'features'), f"'features' not found in loaded dataset. Did you parse the HF Dataset correctly?" - if 'category' not in dataset.features.keys(): - raise Exception( - f"Attempted to partition dataset by `category` but it doesn't have a `category` key. Got keys: {str(list(dataset.features.keys()))}" - ) + # assert hasattr(dataset, + # 'features'), f"'features' not found in loaded dataset. Did you parse the HF Dataset correctly?" + # if 'category' not in dataset.features.keys(): + # raise Exception( + # f"Attempted to partition dataset by `category` but it doesn't have a `category` key. Got keys: {str(list(dataset.features.keys()))}" + # ) categories = sorted(set(dataset['category'])) output_files = {} for cat in categories: @@ -1511,7 +1514,7 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo def get_icl_task_dataloader( icl_task_type: str, dataset_uri: str, - tokenizer: transformers.PreTrainedTokenizerBase, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], batch_size: int, max_seq_len: int, pad_tok_id: int, @@ -1526,9 +1529,9 @@ def get_icl_task_dataloader( generations_per_sample: int = 20, cot_delimiter: str = '', has_categories: bool = False, - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None, - generation_kwargs: Dict = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, early_stopping_criteria: Optional[List[str]] = None, do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]: """ @@ -1592,6 +1595,14 @@ def get_icl_task_dataloader( Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. 
""" + if hf_loading_vars is None: + hf_loading_vars = {} + if hf_parsing_map is None: + hf_parsing_map = {} + if generation_kwargs is None: + generation_kwargs = {} + if early_stopping_criteria is None: + early_stopping_criteria = [] if has_categories: result_dls = {} From a87a7d7cd52f801f970349b5fd4ec0dde1702dfd Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 24 Jan 2024 06:37:25 +0000 Subject: [PATCH 095/116] fix linting --- .../in_context_learning_evaluation.py | 162 ++++++++++++------ composer/datasets/utils.py | 6 +- .../test_in_context_learning_datasets.py | 88 +++------- 3 files changed, 136 insertions(+), 120 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 7fe38acac1..4a0b4f7271 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -20,7 +20,7 @@ if TYPE_CHECKING: import transformers - from datasets import Dataset + from datasets import Dataset as HFDataset # Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset _MAX_ANSWER_BUFFER_LENGTH = 10 @@ -58,7 +58,9 @@ def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBas Returns: bool: whether or not the tokenizer needs a prefix space """ - return len(tokenizer(' a', add_special_tokens=False)['input_ids']) == 1 + test_tokens = tokenizer(' a', add_special_tokens=False)['input_ids'] + assert isinstance(test_tokens, list) + return len(test_tokens) == 1 def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) -> List: @@ -282,6 +284,16 @@ def __init__( hf_parsing_map: Optional[Dict] = None, generation_kwargs: Optional[Dict] = None, ): + try: + import datasets + del datasets + except ImportError as e: + raise MissingConditionalImportError( + extra_deps_group='nlp', + conda_package='datasets', + conda_channel='conda-forge', + ) from e + self.tokenizer = tokenizer self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) @@ -307,13 +319,13 @@ def __init__( self.tensor_keys = tensor_keys hf_loading_vars = hf_loading_vars or {} - self.dataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) + self.dataset: HFDataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.strip_data = strip_dataset if self.strip_data: self.dataset = self.dataset.map(strip_data) fewshot_rng = random.Random(fewshot_random_seed) - self.dataset = self.dataset.map( + self.dataset: HFDataset = self.dataset.map( self._prep_example, with_indices=True, fn_kwargs={ @@ -350,7 +362,7 @@ def _read_dataset(self, dataset_uri: str, destination_path: str, hf_loading_vars: Optional[Dict[str, Any]] = None, - hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'Dataset': + hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'HFDataset': """ Reads a dataset and handles parsing it from HuggingFace. 
@@ -364,31 +376,26 @@ def _read_dataset(self, Returns: dataset: a loaded HF dataset """ - try: - from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] - except ImportError as e: - raise MissingConditionalImportError( - extra_deps_group='nlp', - conda_package='datasets', - conda_channel='conda-forge', - ) from e + from datasets import Dataset as HFDataset + from datasets import load_dataset if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') if hf_loading_vars is None: hf_loading_vars = {} - # TODO: need to ensure split is defined here? dataset = load_dataset(dataset_uri, **hf_loading_vars) - if hf_parsing_map is not None: - # assert statement only for type checking + if hf_parsing_map: dataset_parsing_func = lambda example: { - k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() + k: ' '.join([str(example[col]) for col in v]) + for k, v in hf_parsing_map.items() # pyright: ignore[reportOptionalMemberAccess] } + assert isinstance(dataset, HFDataset) dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names) else: with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) + assert isinstance(dataset, HFDataset) return dataset def _generate_few_shot_prompt( @@ -418,9 +425,18 @@ def _generate_few_shot_prompt( few_shot_text = preamble if num_fewshot > 0: - fewshot_idxs = _get_fewshot_sample_idxs(len(self.dataset), num_fewshot, example_idx, fewshot_rng) + fewshot_idxs = _get_fewshot_sample_idxs( + len(self.dataset), + num_fewshot, + example_idx, + fewshot_rng, + ) for fewshot_idx in fewshot_idxs: - ctxt = self._construct_context(self.dataset[fewshot_idx], few_shot_text, add_answer=True) + ctxt = self._construct_context( + self.dataset[fewshot_idx], + few_shot_text, + add_answer=True, + ) few_shot_text += ctxt return few_shot_text @@ -495,20 +511,26 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - """ tokenized_example = {} # Always add special tokens to preamble - preamble = self.tokenizer(prompt_and_fewshot) - preamble = self._fix_eos_on_preamble(preamble['input_ids']) + preamble = self.tokenizer(prompt_and_fewshot)['input_ids'] + assert isinstance(preamble, list) + preamble = self._fix_eos_on_preamble(preamble) if self.strip_data: # rstrip context because a prompt ending in a space results in degenerate output ctxt = ctxt.rstrip() # Never add special tokens to context tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids'] + assert isinstance(preamble, list) + assert isinstance(tokenized_context, list) + tokenized_context = preamble + tokenized_context if self.tokenize_labels: # Never add special tokens to answer tokenized_answer = self.tokenizer(self._get_answer_from_example(example), add_special_tokens=False)['input_ids'] + assert isinstance(tokenized_answer, list) trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size) + assert isinstance(trimmed_context, list) continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer) padded_context = _make_padded_input(trimmed_context, tokenized_answer, self.padding_size, self.pad_tok_id, self.padding_side) @@ -517,7 +539,13 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - tokenized_example[self.answer_key] = tokenized_answer 
tokenized_example['continuation_indices'] = continuation_indices else: - trimmed_context = _trim_context(tokenized_context, [], self.padding_size) + assert isinstance(tokenized_context, list) + trimmed_context = _trim_context( + tokenized_context, + [], + self.padding_size, + ) + assert isinstance(trimmed_context, list) padded_context = _make_padded_input(trimmed_context, [], self.padding_size, self.pad_tok_id, self.padding_side) @@ -533,7 +561,7 @@ def _prep_example( num_fewshot: int, prompt_string: str, fewshot_rng: random.Random, - ) -> List[Dict[str, Any]]: + ) -> Dict[str, Any]: """ Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples. @@ -555,7 +583,7 @@ def _prep_example( tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example - def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: + def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. @@ -675,9 +703,9 @@ def _read_dataset( self, dataset_uri: str, destination_path: str, - hf_loading_vars: Dict = None, - hf_parsing_map: Dict = None, - ): + hf_loading_vars: Dict, + hf_parsing_map: Dict, + ) -> 'HFDataset': dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.has_cot = 'chain_of_thought' in dataset.features dataset = dataset.map( @@ -736,11 +764,13 @@ def _get_max_answer_length(self, dataset) -> int: response = (f'{example["chain_of_thought"]}{self.cot_delimiter}{answer}') else: response = answer - max_answer_length = max(max_answer_length, len(self.tokenizer(response)['input_ids'])) + tokenized_repsonse = self.tokenizer(response)['input_ids'] + assert isinstance(tokenized_repsonse, list) + max_answer_length = max(max_answer_length, len(tokenized_repsonse)) max_answer_length = max_answer_length + (_MAX_ANSWER_BUFFER_LENGTH if len(self.cot_delimiter) > 0 else 0) return max_answer_length - def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: + def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: batch = super().collate_fn(data) batch_size = batch['input_ids'].shape[0] stopping_criteria = None @@ -871,13 +901,15 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - # NOTE: some of this is repeated from super class but for loop makes things considerably different tokenized_example = {} # Always add special tokens to preamble - preamble = self.tokenizer(prompt_and_fewshot) - preamble = self._fix_eos_on_preamble(preamble['input_ids']) + preamble = self.tokenizer(prompt_and_fewshot)['input_ids'] + assert isinstance(preamble, list) + preamble = self._fix_eos_on_preamble(preamble) if self.strip_data: # rstrip context because a prompt ending in a space results in degenerate output ctxt = ctxt.rstrip() # Never add special tokens to context tokenized_context = self.tokenizer(ctxt, add_special_tokens=False)['input_ids'] + assert isinstance(tokenized_context, list) tokenized_context = preamble + tokenized_context tokenized_example[self.context_key] = [] @@ -890,10 +922,18 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - # Never add special tokens to answer tokenized_answer = self.tokenizer(choice, add_special_tokens=False)['input_ids'] + assert isinstance(tokenized_context, list) + assert isinstance(tokenized_answer, list) trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size) + assert 
isinstance(trimmed_context, list) continuation_indices = _get_continuation_span(trimmed_context, tokenized_answer) - padded_context = _make_padded_input(trimmed_context, tokenized_answer, self.padding_size, self.pad_tok_id, - self.padding_side) + padded_context = _make_padded_input( + trimmed_context, + tokenized_answer, + self.padding_size, + self.pad_tok_id, + self.padding_side, + ) tokenized_example[self.context_key].append(padded_context) tokenized_example[self.answer_key].append(tokenized_answer) @@ -902,7 +942,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - tokenized_example['gold'] = example['gold'] return tokenized_example - def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: + def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ The function that the dataloader uses to accumulate data into batches. We run each distinct query + answer choice through the model separately and determine which @@ -938,7 +978,7 @@ def collate_fn(self, data: Dict[str, Any]) -> Dict[str, Any]: def get_num_samples_in_batch(self, batch) -> int: return batch['input_ids'].shape[0] // self.num_choices - def split_batch(self, batch: Any, microbatch_size: int) -> Dict[str, Any]: + def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: """ Split batch while ensuring all continuations are in the same microbatch. @@ -1047,7 +1087,7 @@ def _construct_context(self, example, preceding_text: str = '', add_answer: bool context = f'{context}{self.continuation_delimiter}{continuation}' return context - def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') -> str: + def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') -> List[str]: """ Takes a example and constructs all contexts. Optionally, appends this to preceeding text (such as a prompt or fewshot examples). @@ -1075,7 +1115,7 @@ def _prep_example( num_fewshot: int, prompt_string: str, fewshot_rng: random.Random, - ) -> List[Dict[str, Any]]: + ) -> Dict[str, Any]: """ Prepares a single example from a HF Dataset into tokenized format with prompt and fewshot examples. 
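# A self-contained sketch of the trim-then-pad flow that the _tokenize_example hunks
# above use repeatedly. These standalone helpers only approximate _trim_context and
# _make_padded_input (clip tokens from the beginning of the context so that context +
# continuation fits, then pad out to a fixed length); they are illustrative, not the
# patched implementations.
from typing import List
import torch

def trim_context(context_enc: List[int], continuation_enc: List[int], max_seq_len: int) -> List[int]:
    # Clip tokens from the beginning of the context if the pair is too long.
    overflow = len(context_enc) + len(continuation_enc) - max_seq_len
    return context_enc[overflow:] if overflow > 0 else context_enc

def make_padded_input(context_enc: List[int], max_seq_len: int, pad_tok_id: int, padding_side: str = 'right') -> torch.Tensor:
    inp = torch.tensor(context_enc, dtype=torch.long)
    padding = torch.full((max_seq_len - len(context_enc),), pad_tok_id, dtype=torch.long)
    return torch.cat([inp, padding]) if padding_side == 'right' else torch.cat([padding, inp])

# e.g. make_padded_input(trim_context([1, 2, 3, 4], [5], 4), 6, 0, padding_side='left')
# -> tensor([0, 0, 0, 2, 3, 4]); the continuation tokens are handled separately.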
@@ -1110,10 +1150,13 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], Dict: dictionary with the tokenized data """ tokenized_example = {} - preamble = self.tokenizer(prompt_and_fewshot) - preamble = self._fix_eos_on_preamble(preamble['input_ids']) + preamble = self.tokenizer(prompt_and_fewshot)['input_ids'] + assert isinstance(preamble, list) + preamble = self._fix_eos_on_preamble(preamble) encoded_contexts = [ - preamble + self.tokenizer(c, add_special_tokens=False)['input_ids'] for c in context_options + preamble + # pyright: ignore[reportGeneralTypeIssues] + self.tokenizer(c, add_special_tokens=False)['input_ids'] # pyright: ignore[reportGeneralTypeIssues] + for c in context_options ] continuation = example['continuation'] if self.prefix_space: @@ -1124,7 +1167,10 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], tokenized_example['continuation_indices'] = [] tokenized_example[self.answer_key] = [] for context in encoded_contexts: + assert isinstance(context, list) + assert isinstance(tokenized_continuation, list) trimmed_context = _trim_context(context, tokenized_continuation, self.padding_size) + assert isinstance(trimmed_context, list) continuation_indices = _get_continuation_span(trimmed_context, tokenized_continuation) padded_context = _make_padded_input(trimmed_context, tokenized_continuation, self.padding_size, self.pad_tok_id, self.padding_side) @@ -1251,15 +1297,15 @@ def _set_max_prompt_and_answer_lengths(self): max_prompt_length = 0 max_answer_length = 0 for example in self.dataset: + assert isinstance(example, Dict) unpadded_example = [token for token in example[self.context_key] if token != self.pad_tok_id] - max_prompt_length = max( - max_prompt_length, - len(unpadded_example), - ) + max_prompt_length = max(max_prompt_length, len(unpadded_example)) - len_tokenized_answer = len( - self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids']) + tokenized_answer = self.tokenizer(example['canonical_solution'], add_special_tokens=False)['input_ids'] + assert isinstance(tokenized_answer, list) + len_tokenized_answer = len(tokenized_answer) max_answer_length = max(max_answer_length, len_tokenized_answer) + self.max_prompt_length = max_prompt_length self.max_answer_length = max_answer_length + _MAX_ANSWER_BUFFER_LENGTH @@ -1301,7 +1347,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - def build_icl_dataloader( icl_task_type: str, dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + tokenizer: transformers.PreTrainedTokenizerBase, batch_size: int, max_seq_len: int, pad_tok_id: int, @@ -1469,7 +1515,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo Dict[str, str]: Mapping of category names to partitioned dataset local files names. 
""" try: - from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] + from datasets import Dataset as HFDataset + from datasets import IterableDataset, load_dataset except ImportError as e: raise MissingConditionalImportError( extra_deps_group='nlp', @@ -1479,23 +1526,26 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo if dataset_uri.startswith('hf://'): dataset_uri = dataset_uri.replace('hf://', '') dataset = load_dataset(dataset_uri, **hf_loading_vars) + assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset) if hf_parsing_map: dataset_parsing_func = lambda example: { k: ' '.join([str(example[col]) for col in v]) for k, v in hf_parsing_map.items() } + assert hasattr(dataset, 'column_names') dataset = dataset.map(dataset_parsing_func, remove_columns=dataset.column_names) else: with dist.local_rank_zero_download_and_wait(destination_path): if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - # assert hasattr(dataset, - # 'features'), f"'features' not found in loaded dataset. Did you parse the HF Dataset correctly?" - # if 'category' not in dataset.features.keys(): - # raise Exception( - # f"Attempted to partition dataset by `category` but it doesn't have a `category` key. Got keys: {str(list(dataset.features.keys()))}" - # ) - categories = sorted(set(dataset['category'])) + assert isinstance(dataset, HFDataset) or isinstance(dataset, IterableDataset) + assert hasattr(dataset, 'features') + assert dataset.features is not None + if 'category' not in dataset.features.keys(): + raise Exception(f"""Attempted to partition dataset by `category` \ + but it doesn't have a `category` key. \ + Got keys: {str(list(dataset.features.keys()))}""") + categories = sorted(set(dataset['category'])) # pyright: ignore[reportGeneralTypeIssues] output_files = {} for cat in categories: path = destination_path.split('/') @@ -1590,7 +1640,7 @@ def get_icl_task_dataloader( keyword args in this fucntion (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig for more details) - # TODO: add early stopping doucmentation + TODO: add early stopping doucmentation Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. 
diff --git a/composer/datasets/utils.py b/composer/datasets/utils.py index 431a860900..b627ef8596 100644 --- a/composer/datasets/utils.py +++ b/composer/datasets/utils.py @@ -179,7 +179,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria): def __init__( self, stop_sequence: str, - tokenizer: transformers.PreTrainedTokenizer, + tokenizer: transformers.PreTrainedTokenizerBase, batch_size: int, ) -> None: self.done_tracker = [False] * batch_size @@ -196,7 +196,7 @@ def __init__( self.stop_sequence_id_len = len(self.stop_sequence_ids) + 2 self.tokenizer = tokenizer - def __call__(self, input_ids, scores: Optional[torch.FloatTensor] = None, **kwargs) -> bool: + def __call__(self, input_ids: torch.Tensor, scores: Optional[torch.FloatTensor] = None, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:] @@ -213,7 +213,7 @@ def __call__(self, input_ids, scores: Optional[torch.FloatTensor] = None, **kwar return False not in self.done_tracker def stop_sequences_criteria( - tokenizer: transformers.PreTrainedTokenizer, + tokenizer: transformers.PreTrainedTokenizerBase, stop_sequences: List[str], batch_size: int, ) -> transformers.StoppingCriteriaList: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 272200d0f4..69e1c12ba2 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -46,11 +46,9 @@ def test_tokenizer_needs_prefix_space_when_space_not_needed(tiny_gpt2_tokenizer) def test_tokenizer_needs_prefix_space_when_space_needed(): - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m', + use_fast=False) # type: ignore reportUnboundVariable assert _tokenizer_needs_prefix_space(tokenizer) @@ -313,11 +311,9 @@ def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): def test_fix_eos_on_preamble(tmp_path): - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False) # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m', + use_fast=False) # type: ignore reportUnboundVariable seqlen = 2048 num_fewshot = 0 prompt_string = '' @@ -421,11 +417,8 @@ def test_qa_set_cot_no_cot(tmp_path): pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/triviaqa_small.jsonl' - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) @@ -448,11 +441,8 @@ def test_qa_set_cot_has_cot(tmp_path): 
pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/gsm8k_small.jsonl' - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m') # type: ignore reportUnboundVariable tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) @@ -609,7 +599,7 @@ def test_code_adjust_padding(tiny_gpt2_tokenizer, tmp_path): generations_per_sample=10, ) - assert all(len(data['prompt']) == 148 for data in dl.dataset) + assert all(len(data['prompt']) == 148 for data in dl.dataset) # pyright: ignore [reportGeneralTypeIssues] def test_code_update_gen_kwargs(tiny_gpt2_tokenizer, tmp_path): @@ -937,14 +927,10 @@ def test_schema_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): @pytest.mark.parametrize('dataset_uri', ['winograd_small.jsonl']) def test_schema_task_dataloader_sentpiece_tokenizer(dataset_uri, tmp_path): pytest.importorskip('datasets') + transformers = pytest.importorskip('transformers') local_data = os.path.join(os.path.dirname(__file__), 'local_data') - - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained( + tokenizer = transformers.AutoTokenizer.from_pretrained( 'huggyllama/llama-7b', # type: ignore reportUnboundVariable use_fast=False) dataset_uri = f'{local_data}/{dataset_uri}' @@ -1377,11 +1363,9 @@ def test_code_eval_split_batch(dataset_uri, tmp_path): pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') dataset_uri = f'{local_data}/{dataset_uri}' - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained( + 'EleutherAI/gpt-neox-20b') # type: ignore reportUnboundVariable tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) @@ -1451,11 +1435,8 @@ def test_code_eval_sentpiece_dataloader(dataset_uri, tmp_path, num_fewshot, prom local_data = os.path.join(os.path.dirname(__file__), 'local_data') - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 2048 @@ -1521,11 +1502,8 @@ def test_code_eval_test_cases(dataset_uri, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = 
transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 512 @@ -1573,11 +1551,8 @@ def test_code_eval_pass_at_k_validity(dataset_uri, tmp_path): local_data = os.path.join(os.path.dirname(__file__), 'local_data') - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('huggyllama/llama-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 2 seqlen = 64 @@ -1608,11 +1583,8 @@ def test_code_eval_task_dataloader(dataset_uri, tmp_path, num_fewshot, prompt_st local_data = os.path.join(os.path.dirname(__file__), 'local_data') - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 2048 @@ -1682,11 +1654,8 @@ def test_eval_split_batch(tiny_opt_tokenizer, dataset_uri, num_fewshot, tmp_path pytest.importorskip('datasets') local_data = os.path.join(os.path.dirname(__file__), 'local_data') - try: - from transformers import AutoTokenizer - except ImportError: - pytest.importorskip('transformers') - tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable + transformers = pytest.importorskip('transformers') + tokenizer = transformers.AutoTokenizer.from_pretrained('mosaicml/mpt-7b') # type: ignore reportUnboundVariable dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 512 @@ -1758,10 +1727,7 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy']) - try: - import transformers - except ImportError: - pytest.importorskip('transformers') + transformers = pytest.importorskip('transformers') config = transformers.AutoConfig.from_pretrained('EleutherAI/gpt-neo-125M') model = transformers.AutoModelForCausalLM.from_config(config) model = HuggingFaceModel( From eae8a1cdd05b77756d63e183018672c89de7f629 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 24 Jan 2024 07:04:14 +0000 Subject: [PATCH 096/116] add early stopping and do_normalization documentation --- composer/datasets/in_context_learning_evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 4a0b4f7271..9ffe9c8b0d 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1639,8 +1639,9 @@ def get_icl_task_dataloader( generation_kwargs (Dict, default = None): A dictionary containing keyword arguments to be passed along to the model's generate function. 
Overwrites any previously specified generation keyword args in this fucntion (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig for more details) - - TODO: add early stopping doucmentation + early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time. + Used in QA tasks with CoT + do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningQAAccuracy. Only used in QA tasks. Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. From 5be0cc9d31c0719885b1be8475720986d4e99ed5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 24 Jan 2024 23:45:30 +0000 Subject: [PATCH 097/116] fix linting --- composer/datasets/in_context_learning_evaluation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 9ffe9c8b0d..d6d7937785 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -20,7 +20,7 @@ if TYPE_CHECKING: import transformers - from datasets import Dataset as HFDataset + from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] # Allow models to have slightly more tokens than were used in the most verbose CoT in the dataset _MAX_ANSWER_BUFFER_LENGTH = 10 @@ -376,8 +376,8 @@ def _read_dataset(self, Returns: dataset: a loaded HF dataset """ - from datasets import Dataset as HFDataset - from datasets import load_dataset + from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] + from datasets import load_dataset # pyright: ignore[reportGeneralTypeIssues] if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') if hf_loading_vars is None: @@ -1515,8 +1515,8 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo Dict[str, str]: Mapping of category names to partitioned dataset local files names. 
""" try: - from datasets import Dataset as HFDataset - from datasets import IterableDataset, load_dataset + from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] + from datasets import IterableDataset, load_dataset # pyright: ignore[reportGeneralTypeIssues] except ImportError as e: raise MissingConditionalImportError( extra_deps_group='nlp', From 1fd12fc401d4cd13494955558c36f5385d436c66 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 00:03:24 +0000 Subject: [PATCH 098/116] fix linting --- composer/datasets/in_context_learning_evaluation.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index d6d7937785..cb80456ab3 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1154,8 +1154,8 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], assert isinstance(preamble, list) preamble = self._fix_eos_on_preamble(preamble) encoded_contexts = [ - preamble + # pyright: ignore[reportGeneralTypeIssues] - self.tokenizer(c, add_special_tokens=False)['input_ids'] # pyright: ignore[reportGeneralTypeIssues] + preamble + # pyright: ignore[reportOperatorIssue, reportGeneralTypeIssues] + self.tokenizer(c, add_special_tokens=False)['input_ids'] # pyright: ignore[reportOperatorIssue, ] for c in context_options ] continuation = example['continuation'] @@ -1359,7 +1359,7 @@ def build_icl_dataloader( hf_parsing_map: Dict, destination_path: str, prelimiter: str, # e.g. 'Question: ' - cot_delimiter: str, + cot_delimiter: str, # e.g. ' ### ' fewshot_random_seed: int, pass_at_k: int, generations_per_sample: int, @@ -1545,7 +1545,7 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo raise Exception(f"""Attempted to partition dataset by `category` \ but it doesn't have a `category` key. 
\ Got keys: {str(list(dataset.features.keys()))}""") - categories = sorted(set(dataset['category'])) # pyright: ignore[reportGeneralTypeIssues] + categories = sorted(set(dataset['category'])) # pyright: ignore[reportIndexIssue, reportGeneralTypeIssues] output_files = {} for cat in categories: path = destination_path.split('/') @@ -1553,7 +1553,9 @@ def partition_dataset_by_category(dataset_uri: str, destination_path: str, hf_lo tmp_path_to_broadcast = str(os.path.abspath(cat_dest)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) if dist.get_local_rank() == 0: - subset = [l for l in dataset if l['category'] == cat] # pyright: ignore[reportGeneralTypeIssues] + subset = [ + l for l in dataset if l['category'] == cat # pyright: ignore[reportGeneralTypeIssues] + ] # pyright: ignore[reportArgumentType, reportCallIssue] with open(gathered_paths[0], 'w', encoding='utf8') as f: for l in subset: f.write(json.dumps(l, ensure_ascii=False) + '\n') From 5f12dc5e291d728f1ecf04312ba88a8b2d570a5d Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 01:12:03 +0000 Subject: [PATCH 099/116] fix final dist test issue --- .../test_in_context_learning_datasets.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 69e1c12ba2..e47809bc59 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -1789,9 +1789,9 @@ def test_schema_task_evaluation(num_fewshot, dataset_uri, tiny_gpt2_tokenizer, t @pytest.mark.parametrize('dataset_uri', ['mmlu_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) -@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') @device('gpu') @world_size(1, 2) +@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_fewshot, tiny_gpt2_model, tiny_gpt2_tokenizer, tmp_path): pytest.importorskip('datasets') @@ -1800,13 +1800,15 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer batch_size = 8 + max_seq_len = 64 tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + reproducibility.seed_all(1234) dls = get_icl_task_dataloader('multiple_choice', dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, + max_seq_len=max_seq_len, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, prompt_string='', @@ -1840,6 +1842,7 @@ def test_mc_task_evaluation_subcategories(device, world_size, dataset_uri, num_f @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 5]) +@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') @device('gpu') @world_size(1, 2) def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path, @@ -1850,6 +1853,8 @@ def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g dataset_uri = f'{local_data}/{dataset_uri}' tokenizer = tiny_gpt2_tokenizer batch_size = 8 + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) # seed because the fewshot selection is currently unseeded 
reproducibility.seed_all(1234) @@ -1858,13 +1863,13 @@ def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, + max_seq_len=64, pad_tok_id=tokenizer.eos_token_id, num_fewshot=num_fewshot, prompt_string='', example_delimiter='\n', continuation_delimiter=': ', - destination_path=str(tmp_path / 'icl.jsonl'), + destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), ) evaluator = Evaluator(label='mc', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy']) @@ -1884,12 +1889,15 @@ def test_mc_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_g with open(dataset_uri) as f: for _ in f: num_samples += 1 - assert trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total == num_samples + total = trainer.state.eval_metrics['mc']['InContextLearningMultipleChoiceAccuracy'].total + dist.all_reduce(total) # type: ignore + assert total.item() == num_samples # type: ignore @pytest.mark.parametrize('num_fewshot', [0, 5]) @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') +@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') @device('gpu') @world_size(1, 2) def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot, @@ -1937,6 +1945,7 @@ def test_qa_task_evaluation_opt_tokenizer(device, world_size, tiny_opt_tokenizer @device('gpu') @world_size(1, 2) @pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') +@pytest.mark.filterwarnings(r'ignore:Cannot split .* of length.*:UserWarning') def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_tokenizer, tiny_opt_model, num_fewshot, dataset_uri, tmp_path): pytest.importorskip('datasets') @@ -1982,6 +1991,7 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer(device, world_size, tiny_opt_ @pytest.mark.parametrize('num_fewshot', [0, 5]) @device('gpu') @world_size(1, 2) +@pytest.mark.filterwarnings(r'ignore:.*The dataloader_len \(2\) is greater than the length.*:UserWarning') def test_qa_task_evaluation(device, world_size, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tiny_gpt2_model, tmp_path): pytest.importorskip('datasets') From f531dfce2a94c5d3ab1fed95903d504f7605a238 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 01:19:19 +0000 Subject: [PATCH 100/116] fix isort --- composer/datasets/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/composer/datasets/__init__.py b/composer/datasets/__init__.py index 895b469583..a456ec3239 100644 --- a/composer/datasets/__init__.py +++ b/composer/datasets/__init__.py @@ -11,11 +11,15 @@ build_streaming_cifar10_dataloader, build_synthetic_cifar10_dataloader) from composer.datasets.imagenet import (build_ffcv_imagenet_dataloader, build_imagenet_dataloader, build_streaming_imagenet1k_dataloader, build_synthetic_imagenet_dataloader) +from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, + InContextLearningDataset, InContextLearningLMTaskDataset, + InContextLearningMultipleChoiceTaskDataset, + InContextLearningQATaskDataset, + InContextLearningSchemaTaskDataset) from composer.datasets.lm_dataset import build_lm_dataloader from composer.datasets.mnist import build_mnist_dataloader, 
build_synthetic_mnist_dataloader from composer.datasets.synthetic import (SyntheticBatchPairDataset, SyntheticDataLabelType, SyntheticDataType, SyntheticPILDataset) -from composer.datasets.in_context_learning_evaluation import InContextLearningDataset, InContextLearningQATaskDataset, InContextLearningLMTaskDataset, InContextLearningCodeEvalDataset, InContextLearningMultipleChoiceTaskDataset, InContextLearningSchemaTaskDataset __all__ = [ 'ADE20k', @@ -25,11 +29,11 @@ 'SyntheticDataLabelType', 'SyntheticDataType', 'SyntheticPILDataset', - 'InContextLearningDataset', - 'InContextLearningQATaskDataset', - 'InContextLearningLMTaskDataset', - 'InContextLearningCodeEvalDataset', - 'InContextLearningMultipleChoiceTaskDataset', + 'InContextLearningDataset', + 'InContextLearningQATaskDataset', + 'InContextLearningLMTaskDataset', + 'InContextLearningCodeEvalDataset', + 'InContextLearningMultipleChoiceTaskDataset', 'InContextLearningSchemaTaskDataset', 'build_ade20k_dataloader', 'build_streaming_ade20k_dataloader', From 3e71cb3ac82766c04f7201c0b4f1a360aae2aa2c Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 01:43:01 +0000 Subject: [PATCH 101/116] fix linting --- .../test_in_context_learning_datasets.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index e47809bc59..170eb88453 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -13,14 +13,23 @@ from composer import Evaluator from composer.core import DataSpec -from composer.datasets.in_context_learning_evaluation import (InContextLearningCodeEvalDataset, - InContextLearningDataset, - InContextLearningMultipleChoiceTaskDataset, - InContextLearningQATaskDataset, - InContextLearningSchemaTaskDataset, - _get_continuation_span, _get_fewshot_sample_idxs, - _make_padded_input, _tokenizer_needs_prefix_space, - _trim_context, get_icl_task_dataloader, strip_data) + +# isort: off +from composer.datasets.in_context_learning_evaluation import ( + InContextLearningCodeEvalDataset, + InContextLearningDataset, + InContextLearningMultipleChoiceTaskDataset, + InContextLearningQATaskDataset, + InContextLearningSchemaTaskDataset, + _get_continuation_span, + _get_fewshot_sample_idxs, + _make_padded_input, + _tokenizer_needs_prefix_space, + _trim_context, + get_icl_task_dataloader, + strip_data, +) +# isort: on from composer.datasets.utils import MultiTokenEOSCriteria from composer.loggers import InMemoryLogger from composer.metrics import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, @@ -1132,7 +1141,7 @@ def test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path): tokenizer = tiny_opt_tokenizer tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) # for dist dl = get_icl_task_dataloader( icl_task_type='question_answering', dataset_uri=dataset_uri, From e487934a62e9ee9db3ffd3946b1bb645cc967728 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 03:51:30 +0000 Subject: [PATCH 102/116] fix docstrings --- .../in_context_learning_evaluation.py | 115 ++++++++++-------- 1 file changed, 67 insertions(+), 48 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index cb80456ab3..4a0f11471f 100644 --- 
a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -212,19 +212,19 @@ class InContextLearningDataset(Dataset): 'answer' refers to the desired output of the model. When creating a new ICL Dataset, it is likely that you will need to reimplemente the following methods: - - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. - - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. - - _tokenize_example(): tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. - - _read_dataset(): loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) + - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. + - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. + - _tokenize_example(): tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. + - _read_dataset(): loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) Additionally, base_batch and batch_mapping must be defined. - - base_batch (Dict): the base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, - and empty lists for values that will need to be accumulated from each example. - NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables - like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() - after setting self.base_batch. - - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. - collate_fn will use this mapping to create batches from self.dataset. + - base_batch (Dict): the base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, + and empty lists for values that will need to be accumulated from each example. + NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables + like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() + after setting self.base_batch. + - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. + collate_fn will use this mapping to create batches from self.dataset. Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``. @@ -254,6 +254,7 @@ class InContextLearningDataset(Dataset): Column contents will be concatenated with ' ' seperating them. If not included, will load the columns already present in the HF dataset. tokenize_labels (bool): Whether or not the labels should be tokenized. Generally determined by which metric a dataset uses. 
generation_kwargs (Dict): A dictionary containing keyword arguments to be passed along to the model's generate function. + """ def __init__( @@ -828,12 +829,12 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): - gold: index of the correct choice under 'choices' - choices: a list of strings, each being one of the potential choices - Each batch then consists of batch_size // N distinct questions and has the following the structure. - - input_ids: Input tensor batch x seqlen x # tokens - - continuation_indices: List of |batch| consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) + Each batch then consists of ``|batch_size // N|`` distinct questions and has the following the structure. + - input_ids: Input tensor ``|batch x seqlen x # tokens|`` + - continuation_indices: List of ``|batch|`` consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - labels: Identical to the input, used by the model to calculate loss/metrics - - gold_indices: List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) + - gold_indices: List of length ``|batch_size // N|`` indicating for each question, which of the answers is correct (via an integer [0, N-1]) - choice_groupings: Indicates which indices of the batch correspond to which questions Additional Args: @@ -1025,8 +1026,7 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskDataset): - """ - A dataset that constructs batches for in-context learning schema evaluation. + """A dataset that constructs batches for in-context learning schema evaluation. A schema task involves sentences with a fill-in-the-blank where the user needs to choose the correct word to fill in from a set of N options. We use the partial evaluation technique from https://arxiv.org/abs/1806.02847 to determine the model's choice of fill-in word. 
@@ -1036,12 +1036,12 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData - gold: index of the correct context from 'context_options' - continuation: the finishing continuation - Each batch then consists of batch_size // N distinct tasks and has the following the structure - - input_ids: Input tensor batch x seqlen x # tokens - - continuation_indices: List of |batch| consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) + Each batch then consists of ``batch_size // N`` distinct tasks and has the following the structure + - input_ids: Input tensor ``batch x seqlen x # of tokens`` + - continuation_indices: List of ``batch`` consisting of tensors indicating which indices in the sequence correspond to the question answer (aka continuation) - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - labels: Identical to the input, used by the model to calculate loss/metrics - - gold_indices: List of length |batch_size // N| indicating for each question, which of the answers is correct (via an integer [0, N-1]) + - gold_indices: List of length ``batch_size // N`` indicating for each question, which of the answers is correct (via an integer [0, N-1]) - choice_groupings: Indicates which indices of the batch correspond to which questions """ @@ -1187,6 +1187,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): A dataset that constructs batches for in-context learning code evaluation. The input format is expected to be a jsonl file with the following fields: + - task_id: label of given task - prompt: the code snippet that must be completed - entry_point: the entry to the function/code snippet to generate @@ -1197,6 +1198,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - language: the language of the code snippet Each batch then consists of the following the structure + - input_ids: Input tensor batch x seqlen x num tokens - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - mode: always set to 'generate' @@ -1211,6 +1213,7 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig for more details): + - pad_token_id: ID for padding token, derived automatically - num_beams: how many beams to search for generations, set to 1 - num_return_sequences: value passed for 'generations_per_sample', how many generations per prompt @@ -1586,33 +1589,49 @@ def get_icl_task_dataloader( generation_kwargs: Optional[Dict] = None, early_stopping_criteria: Optional[List[str]] = None, do_normalization: bool = True) -> Union[DataSpec, Dict[str, DataSpec]]: - """ - This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: - - >>> dl = get_icl_task_dataloader( - ... 'language_modeling', - ... dataset_uri, - ... tokenizer, - ... batch_size=2, - ... max_seq_len=2048, - ... pad_tok_id=tokenizer.pad_token_id, - ... num_fewshot=10, - ... prompt_string='translate english to french', - ... example_delimiter='\n', - ... 
continuation_delimiter='' - ) - >>> eval_evaluator = Evaluator( - ... label="lambada", - ... dataloader=dl, - ... metric_names=['InContextLearningLMAccuracy'] - ... ) - >>> trainer = Trainer( - ... model=model, - ... train_dataloader=train_dataloader, - ... eval_dataloader=eval_evaluator, - ... optimizers=optimizer, - ... max_duration="1ep", - ... ) + """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: + + .. testsetup:: + + import transformers + from composer.models import HuggingFaceModel + from composer.trainer import Trainer + dataset_uri = "/tmp/dataset_uri.jsonl" + dataset = RandomTextClassificationDataset(size=16, use_keys=True) + train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8) + hf_model, tokenizer = HuggingFaceModel.hf_from_composer_checkpoint('composer-hf-checkpoint.pt') + # At this point, hf_model is randomly initialized + composer_model = HuggingFaceModel(hf_model, hf_tokenizer) + + Example: + + .. testcode:: + + + dl = get_icl_task_dataloader( + 'language_modeling', + dataset_uri, + tokenizer, + batch_size=2, + max_seq_len=2048, + pad_tok_id=tokenizer.pad_token_id, + num_fewshot=10, + prompt_string='translate english to french', + example_delimiter='\n', + continuation_delimiter='' + ) + eval_evaluator = Evaluator( + label="lambada", + dataloader=dl, + metric_names=['InContextLearningLMAccuracy'] + ) + trainer = Trainer( + model=model, + train_dataloader=train_dataloader, + eval_dataloader=eval_evaluator, + optimizers=optimizer, + max_duration="1ep", + ) Args: icl_task_type (str): Name of icl_task type. One of ['multiple_choice', 'schema', 'language_modeling', 'question_answering', 'code_evaluation'] From c5ca3f8aa6a80128faf3fbd316d68a391ceec965 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 17:48:20 +0000 Subject: [PATCH 103/116] fix docstrings --- .../in_context_learning_evaluation.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 4a0f11471f..1c7175e2d3 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -212,19 +212,21 @@ class InContextLearningDataset(Dataset): 'answer' refers to the desired output of the model. When creating a new ICL Dataset, it is likely that you will need to reimplemente the following methods: + - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. - _tokenize_example(): tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. - _read_dataset(): loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) Additionally, base_batch and batch_mapping must be defined. + - base_batch (Dict): the base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, - and empty lists for values that will need to be accumulated from each example. - NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. 
in order to use class variables - like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() - after setting self.base_batch. + and empty lists for values that will need to be accumulated from each example. + NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables + like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() + after setting self.base_batch. - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. - collate_fn will use this mapping to create batches from self.dataset. + collate_fn will use this mapping to create batches from self.dataset. Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``. @@ -237,8 +239,8 @@ class InContextLearningDataset(Dataset): num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples. fewshot_random_seed (int): Random seed to use for fewshot sampling. prompt_string (str): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.'). - example_delimiter (str): Separator inserted before (context, answer) pairs (e.g. '\n') for fewshot sampling and prompting. - continuation_delimiter: (str): Separator inserted between context and answer in each example (e.g. '\nA: '). + example_delimiter (str): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting. + continuation_delimiter: (str): Separator inserted between context and answer in each example (e.g. '\\nA: '). destination_path (str): Temporary path to store downloaded datasets. prelimiter (str): Text to be prepended before each context, including few shot examples (e.g. "Question: "). context_key (str): The key in the loaded dataset that contains the context. @@ -1211,8 +1213,8 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): - pass_at_k: passed value for pass_at_k - generation_length: derrived maximum generation length - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten - by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig - for more details): + by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig + for more details): - pad_token_id: ID for padding token, derived automatically - num_beams: how many beams to search for generations, set to 1 @@ -1617,7 +1619,7 @@ def get_icl_task_dataloader( pad_tok_id=tokenizer.pad_token_id, num_fewshot=10, prompt_string='translate english to french', - example_delimiter='\n', + example_delimiter='\\n', continuation_delimiter='' ) eval_evaluator = Evaluator( @@ -1645,8 +1647,8 @@ def get_icl_task_dataloader( pad_tok_id (int): The special token used for padding batches. num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples. prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.'). 
- example_delimiter (str, default = '\n'): Separator inserted before (context, answer) pairs (e.g. '\n') for fewshot sampling and prompting. - continuation_delimiter: (str, default = ' '): Separator inserted between context and answer in each example (e.g. '\nA: '). + example_delimiter (str, default = '\\n'): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting. + continuation_delimiter: (str, default = ' '): Separator inserted between context and answer in each example (e.g. '\\nA: '). destination_path: (str, default = ''): This is the local file where remote datasets will be saved. question_prelimiter: (str, default = ''): Text to be prepended before each context, including few shot examples (e.g. "Question: "). fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling From 712a33dec745a14262dfab34ec48348d374aa641 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 18:21:09 +0000 Subject: [PATCH 104/116] add warning filters --- tests/datasets/test_in_context_learning_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 170eb88453..e1af7dbf37 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -2295,6 +2295,8 @@ def test_lm_spacing_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path): 'name': 'juggernaut', }]) @pytest.mark.parametrize('hf_parsing_map', [None, {'context': ['context'], 'continuation': ['continuation']}]) +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, hf_parsing_map): pytest.importorskip('datasets') @@ -2344,6 +2346,8 @@ def test_hf_dataloading_lm_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path 'name': 'invoker', }]) @pytest.mark.parametrize('hf_parsing_map', [{'context': ['quas', 'wex', 'exort'], 'answer': ['spell']}]) +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_hf_dataloading_custom_parsing(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string, hf_loading_vars, hf_parsing_map): pytest.importorskip('datasets') From b305a4b7e808c3f338e5588a6d60803407328731 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 18:35:20 +0000 Subject: [PATCH 105/116] fix warnings --- .../datasets/test_in_context_learning_datasets.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index e1af7dbf37..065c90cc37 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -182,6 +182,8 @@ def test_fewshot_sample_idxs_randomness(): assert rng_1_sample_2 != rng_3_sample_2 +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_update_generation_kwargs(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 @@ -227,6 +229,8 @@ def test_stop_sequences_criteria(tiny_gpt2_tokenizer): assert eos_criteria(input_ids, None) +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset 
contains custom code which must*:FutureWarning') def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 @@ -254,6 +258,8 @@ def test_update_generation_kwargs_no_kwargs(tiny_gpt2_tokenizer, tmp_path): assert not dl.base_batch['generation_kwargs'] +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_construct_context(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 @@ -291,6 +297,8 @@ def test_construct_context(tiny_gpt2_tokenizer, tmp_path): assert constructed_context == '\nOrbs: quas quas exort\nSpell: ice wall' +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 @@ -319,6 +327,8 @@ def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): assert answer == ' alacrity' +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_fix_eos_on_preamble(tmp_path): transformers = pytest.importorskip('transformers') tokenizer = transformers.AutoTokenizer.from_pretrained('facebook/opt-125m', @@ -353,6 +363,8 @@ def test_fix_eos_on_preamble(tmp_path): assert fixed_preamble[-1] != tokenizer.eos_token_id +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 @@ -388,6 +400,8 @@ def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path): assert 'continuation_indices' in tokenized_example +@pytest.mark.filterwarnings( + r'ignore:The repository for mosaicml/test_dataset contains custom code which must*:FutureWarning') def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer, tmp_path): tokenizer = tiny_gpt2_tokenizer seqlen = 2048 From 9ed99fdf456000b966941f6c56ec2e9faaa16a49 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 12:18:59 -0800 Subject: [PATCH 106/116] Update composer/datasets/in_context_learning_evaluation.py fix spelling Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 1c7175e2d3..db8d8bc2ee 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -211,7 +211,7 @@ class InContextLearningDataset(Dataset): 'example' refers to an loaded dictionary, generally containing a context, an answer, and any other information needed to run the task. 'answer' refers to the desired output of the model. - When creating a new ICL Dataset, it is likely that you will need to reimplemente the following methods: + When creating a new ICL Dataset, it is likely that you will need to reimplement the following methods: - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. 
From 37d5f9b958b29f962a1d2009257475e8e7db7265 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 12:19:14 -0800 Subject: [PATCH 107/116] Update composer/datasets/in_context_learning_evaluation.py fix spelling Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index db8d8bc2ee..761ca0ab00 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -739,7 +739,7 @@ def _get_answer_from_example(self, example: Dict, in_context=False) -> str: def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ - Runs text through the tokenizer and handle special cases. + Run text through the tokenizer and handle special cases. Args: prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context ctx (str): the specific example's derrived context From 5ff9a30314c91a0063807ce3cc10ef34e93f5222 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 12:19:24 -0800 Subject: [PATCH 108/116] Update composer/datasets/in_context_learning_evaluation.py fix spelling Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 761ca0ab00..03962a5145 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -680,7 +680,7 @@ def __init__(self, tensor_keys=tensor_keys, *args, **kwargs) - # NOTE: set these after init call bcus they take class vars + # NOTE: set these after init call because they take class vars self.early_stopping_criteria = early_stopping_criteria self.base_batch = { 'input_ids': [], From c44c76395957bfa26b0c70bdd2f8fc0cf48449d9 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 12:20:22 -0800 Subject: [PATCH 109/116] Update composer/datasets/in_context_learning_evaluation.py fix spelling Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 03962a5145..b6c221b4e1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -91,7 +91,7 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) - def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.Tensor: """ - Gets the list of indices of the continutaion tokens for language modeling or generaiton tasks. + Gets the list of indices of the continuation tokens for language modeling or generation tasks. 
Args: context_enc (list): list of context tokens From 7e070847ee0e05fcbd051c53de0b5f94de216c32 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 12:20:32 -0800 Subject: [PATCH 110/116] Update composer/datasets/in_context_learning_evaluation.py fix spelling Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index b6c221b4e1..5302346782 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -208,7 +208,7 @@ class InContextLearningDataset(Dataset): The dataset format is expected to be a local jsonl file, a cloud link to a jsonl file, or a Hugging Face dataset link. 'context' refers to the input a model will recieve before generating an output. For example, the question in question answering tasks, the preceding text in a language modeling task, or the document and question regarding the document in a document understanding task. - 'example' refers to an loaded dictionary, generally containing a context, an answer, and any other information needed to run the task. + 'example' refers to a loaded dictionary, generally containing a context, an answer, and any other information needed to run the task. 'answer' refers to the desired output of the model. When creating a new ICL Dataset, it is likely that you will need to reimplement the following methods: From b8cae18db11319e2d081a0d21bb4d05e707e0a41 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 12:20:49 -0800 Subject: [PATCH 111/116] Update composer/datasets/in_context_learning_evaluation.py fix spelling Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 5302346782..ea839802be 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -231,7 +231,7 @@ class InContextLearningDataset(Dataset): Args: dataset_uri (str): A local path, a remote path beginning with ``s3://`` or another backend, or a HuggingFace dataset uri prepended with ``hf://``. Alternate backends must be supported by :meth:`composer.utils.maybe_create_object_store_from_uri`. - A local dataset must consist of rows of JSON data points with task dependant fields. + A local dataset must consist of rows of JSON data points with task dependent fields. The default keys expected are "context" and "answer". tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids. max_seq_len (int): The maximum sequence length supported by the model. 
From a840abbe6f3ccc3dd43e9bbb6e26abf36988d91e Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 21:45:11 +0000 Subject: [PATCH 112/116] add capitalization --- .../in_context_learning_evaluation.py | 240 +++++++++--------- 1 file changed, 120 insertions(+), 120 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index ea839802be..5a4d8777b6 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -39,10 +39,10 @@ def strip_data(example: Dict) -> Dict: Remove white space from the begging and end of string values in a dictionary Args: - example: dictionary to be stripped + example: Dictionary to be stripped Returns: - dict: the same dictionary with .strip() applied to any value in the dict that is a string + dict: The same dictionary with .strip() applied to any value in the dict that is a string """ return {k: v.strip() if isinstance(v, str) else v for k, v in example.items()} @@ -56,7 +56,7 @@ def _tokenizer_needs_prefix_space(tokenizer: transformers.PreTrainedTokenizerBas tokenizer: Tokenizer to test Returns: - bool: whether or not the tokenizer needs a prefix space + bool: Whether or not the tokenizer needs a prefix space """ test_tokens = tokenizer(' a', add_special_tokens=False)['input_ids'] assert isinstance(test_tokens, list) @@ -70,12 +70,12 @@ def _trim_context(context_enc: List, continuation_enc: List, max_seq_len: int) - of the context will be removed. Args: - context_enc (list): list of tokens in the context - continuation_enc (lsit): list of tokens in the continuation - max_seq_len (int): maximum length the model can ingest + context_enc (list): List of tokens in the context + continuation_enc (lsit): List of tokens in the continuation + max_seq_len (int): Maximum length the model can ingest Returns: - list: the encoded context trimmed from the left + list: The encoded context trimmed from the left """ if len(continuation_enc) + len(context_enc) > max_seq_len: context_max_subseq_len = max_seq_len - len(continuation_enc) @@ -94,11 +94,11 @@ def _get_continuation_span(context_enc: List, continuation_enc: List) -> torch.T Gets the list of indices of the continuation tokens for language modeling or generation tasks. Args: - context_enc (list): list of context tokens - continuation_enc (list): list of continuation tokens + context_enc (list): List of context tokens + continuation_enc (list): List of continuation tokens Returns: - torch.tensor: a tensor containing indices corresponding to continuation tokens + torch.tensor: A tensor containing indices corresponding to continuation tokens """ return torch.tensor(range(len(context_enc), len(context_enc) + len(continuation_enc))) @@ -113,15 +113,15 @@ def _make_padded_input(context_enc: List, Adds the padding token to the specified side. Args: - context_enc (List): the encoded input to the model - continuation_enc (List): the encoded desired output for the example - max_seq_list (int): maximum length sequences can be - pad_tok_id (int): the token id we pad with - padding_side (str): which side to pad the context on. Can be 'right' or 'left + context_enc (List): The encoded input to the model + continuation_enc (List): The encoded desired output for the example + max_seq_list (int): Maximum length sequences can be + pad_tok_id (int): The token id we pad with + padding_side (str): Which side to pad the context on. 
Can be 'right' or 'left Returns: - input (torch.tensor): the padded and encoded context - continuation_span (torch.tensor): the _inclusive_ range of indices corresponding to the continuation + input (torch.tensor): The padded and encoded context + continuation_span (torch.tensor): The _inclusive_ range of indices corresponding to the continuation """ inp = torch.tensor( @@ -161,11 +161,11 @@ def convert_tokens_to_tensors(batch: Dict, tokenize_labels: bool) -> Dict[str, A Here, we convert those lists of tokens back into tensors in order to feed them into the model. Args: - batch (dict): a dictionary of batched inputs - tokenize_labels (bool): whether or not the labels are tokenized (and need to be stacked) + batch (dict): A dictionary of batched inputs + tokenize_labels (bool): Whether or not the labels are tokenized (and need to be stacked) Returns: - dict: the batch with torch tensors in the corresponding keys instead of lists of lists + dict: The batch with torch tensors in the corresponding keys instead of lists of lists """ batch['input_ids'] = torch.stack(list(map(torch.tensor, batch['input_ids']))) if tokenize_labels: @@ -179,13 +179,13 @@ def _get_fewshot_sample_idxs(dataset_size: int, num_fewshot: int, example_idx: i Samples indices without replacement. If num_fewshot exceeds the number of unique examples in the dataset, then we will have fewer than num_fewshot examples in context. Args: - dataset_size (int): length of the dataset - num_fewshot (int): number of examples to prepend - example_idx (int): current example's index (excluded from fewshot choices) - rng (random.Random): rng for repeatable sample selection + dataset_size (int): Length of the dataset + num_fewshot (int): Number of examples to prepend + example_idx (int): Current example's index (excluded from fewshot choices) + rng (random.Random): RNG for repeatable sample selection Returns: - list: indices of the examples chosen for fewshot selection + list: Indices of the examples chosen for fewshot selection """ num_fewshot = min(dataset_size - 1, num_fewshot) fewshot_idxs = set(rng.sample(range(0, dataset_size), num_fewshot)) @@ -213,14 +213,14 @@ class InContextLearningDataset(Dataset): When creating a new ICL Dataset, it is likely that you will need to reimplement the following methods: - - _construct_context(): takes a single example dictionary and formulates the context as a string for that eval question. - - _get_answer_from_example(): takes a single example dictionary and formulates the correct, ground truth answer as a string. - - _tokenize_example(): tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. - - _read_dataset(): loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) + - _construct_context(): Takes a single example dictionary and formulates the context as a string for that eval question. + - _get_answer_from_example(): Takes a single example dictionary and formulates the correct, ground truth answer as a string. + - _tokenize_example(): Tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. + - _read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) Additionally, base_batch and batch_mapping must be defined. 
- - base_batch (Dict): the base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, + - base_batch (Dict): The base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, and empty lists for values that will need to be accumulated from each example. NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() after setting self.base_batch. - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. collate_fn will use this mapping to create batches from self.dataset. Args: @@ -354,7 +354,7 @@ def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: likely because base_batch needs a class variable like self.pad_tok_id or self.max_answer_length). Args: - dict: keyword arguments that be written into base_batch['generation_kwargs'] + dict: Keyword arguments that will be written into base_batch['generation_kwargs'] """ if 'generation_kwargs' not in self.base_batch: self.base_batch['generation_kwargs'] = {} @@ -377,7 +377,7 @@ def _read_dataset(self, hf_parsing_map (Dict): Dictionary in the form of {icl_key: [hf_col1, hf_col2]} that will map one or more hf columns, in order, to ICL dataset columns Returns: - dataset: a loaded HF dataset + dataset: A loaded HF dataset """ from datasets import Dataset as HFDataset # pyright: ignore[reportGeneralTypeIssues] from datasets import load_dataset # pyright: ignore[reportGeneralTypeIssues] if 'hf://' in dataset_uri: dataset_uri = dataset_uri.replace('hf://', '') if hf_loading_vars is None: @@ -417,13 +417,13 @@ def _generate_few_shot_prompt( Returns the formatted prompt_string + concatenated list of formatted few shot examples as a string. Args: - num_fewshot (int): number of examples to prepend - example_idx (int): current example idx - preamble (str): text to occur at the beginning of the task. Generally instructions or a prompt. - fewshot_rng (random.Random): seeded sampler to chose samples with + num_fewshot (int): Number of examples to prepend + example_idx (int): Current example idx + preamble (str): Text to occur at the beginning of the task. Generally instructions or a prompt. + fewshot_rng (random.Random): Seeded sampler to choose samples with Returns: - str: the original preamble with num_fewshot examples appended + str: The original preamble with num_fewshot examples appended """ few_shot_text = preamble @@ -450,9 +450,9 @@ def _construct_context(self, example: Dict, preceding_text: str = '', add_answer Optionally adds the correct answer (for fewshot examples) and handles example delimiters Args: - example (Dict): the example from which to construct the context - preceding_text (str): any preceding text, used as a check for prepending self.example_delimiter - add_answer (bool): bool for whether or not to add the answer on the end of the context (e.g. for fewshot examples) + example (Dict): The example from which to construct the context + preceding_text (str): Any preceding text, used as a check for prepending self.example_delimiter + add_answer (bool): Bool for whether or not to add the answer on the end of the context (e.g. for fewshot examples) Returns: str: The constructed context. The default output context is
Args: - example (Dict): the example from which to retrieve the answer + example (Dict): The example from which to retrieve the answer Returns: - str: the answer in the example + str: The answer in the example """ cont = example[self.answer_key] if self.prefix_space and not cont.startswith(' ') and not in_context: @@ -490,10 +490,10 @@ def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: as the specific eval question's prompt will follow the input_ids. Args: - input_ids (List): the tokenized input + input_ids (List): The tokenized input Returns: - input_ids: the tokenized input conditionally edited + input_ids: The tokenized input conditionally edited """ if (self.tokenizer.eos_token_id is not None and len(input_ids) > 1 and input_ids[-1] == self.tokenizer.eos_token_id): @@ -505,12 +505,12 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - Runs text through the tokenizer and handle special cases. Args: - prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctxt (str): the specific example's derrived context - example (Dict): the example as a dictionary. Used for additional processing in inherited classes. + prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context + ctxt (str): The specific example's derived context + example (Dict): The example as a dictionary. Used for additional processing in inherited classes. Returns: - Dict: dictionary with the tokenized data + Dict: Dictionary with the tokenized data """ tokenized_example = {} # Always add special tokens to preamble @@ -573,13 +573,13 @@ def _prep_example( Args: example (Dict): A Dictionary from the hf dataset - example_idx (int): the index of example + example_idx (int): The index of example num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair prompt_string (str): The prompt to prepend to all inputs fewshot_rng (random.Random): Random number generator to use for fewshot sampling Returns: - Dict: contains a dictionary with the tokenized data + Dict: Contains a dictionary with the tokenized data """ prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self._construct_context(example, prompt_and_fewshot, add_answer=False) @@ -591,10 +591,10 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: The function that the dataloader uses to accumulate data into batches. Args: - data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): List of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - Dict: dictionary for a single batch + Dict: Dictionary for a single batch """ batch = copy.deepcopy(self.base_batch) for data_pair in data: @@ -612,11 +612,11 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: Handling for certain specialty columns that must be split into batches in different formats. Args: - batch (Dict): batch of data - microbatch_size (int): size of microbatches + batch (Dict): Batch of data + microbatch_size (int): Size of microbatches Returns: - List: list of chunked batches + List: List of chunked batches """ # Don't split kwargs that don't change # Normally split torch tensors @@ -647,9 +647,9 @@ class InContextLearningQATaskDataset(InContextLearningDataset): QA tasks evaluate a model's ability to answer questions using a consistent format.
The input format is expected to be a jsonl file with the following fields: - - context: the question - - answer: the preferred answer to the question - - aliases: a list of aliases for the answer + - context: The question + - answer: The preferred answer to the question + - aliases: A list of aliases for the answer See InContextLearningDataset for more details. @@ -727,10 +727,10 @@ def _get_answer_from_example(self, example: Dict, in_context=False) -> str: """ Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true. Args: - example (Dict): the example from which to retrieve the answer + example (Dict): The example from which to retrieve the answer Returns: - str: the answer in from the example with chain of thought and delimiter if needed + str: The answer in from the example with chain of thought and delimiter if needed """ if self.has_cot: return f'{example["chain_of_thought"]}{self.cot_delimiter}{example[self.answer_key]}' @@ -741,12 +741,12 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - """ Run text through the tokenizer and handle special cases. Args: - prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (Dict): the example as a dictionary. + prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): The specific example's derrived context + example (Dict): The example as a dictionary. Returns: - Dict: dictionary with the tokenized data + Dict: Dictionary with the tokenized data """ tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['aliases'] = list(example.get('aliases', [])) @@ -757,7 +757,7 @@ def _get_max_answer_length(self, dataset) -> int: Loops over the dataset and finds the longest answer length. Returns: - int: the maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present + int: The maximum answer length with an additional buffer of {_MAX_ANSWER_BUFFER_LENGTH} if chain of thought is present """ max_answer_length = 0 for example in dataset: @@ -793,8 +793,8 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): Language modeling tasks test a model's ability to properly predict tokens based on preceding tokens. The input format is expected to be a jsonl file with the following fields: - - context: preceding text - - continuation: the expected continuation + - context: Preceding text + - continuation: The expected continuation See InContextLearningDataset for more details. """ @@ -827,9 +827,9 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): inputs per question can stored in the same batch. The default input format is a jsonl file with the following fields: - - query: the preceding text, question, or document relevant to the choices - - gold: index of the correct choice under 'choices' - - choices: a list of strings, each being one of the potential choices + - query: The preceding text, question, or document relevant to the choices + - gold: Index of the correct choice under 'choices' + - choices: A list of strings, each being one of the potential choices Each batch then consists of ``|batch_size // N|`` distinct questions and has the following the structure. 
- input_ids: Input tensor ``|batch x seqlen x # tokens|`` @@ -840,7 +840,7 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): - choice_groupings: Indicates which indices of the batch correspond to which questions Additional Args: - choices_key (str): the key under which the choices are stored in the saved dataset. Defaults to 'choices'. + choices_key (str): The key under which the choices are stored in the saved dataset. Defaults to 'choices'. """ def __init__(self, @@ -881,10 +881,10 @@ def _get_answer_from_example(self, example: Dict, in_context=False) -> str: """ Returns the correct answer from the example's choices. Args: - example (Dict): the example from which to retrieve the answer + example (Dict): The example from which to retrieve the answer Returns: - str: the full string of the correct answer based on the 'gold' key + str: The full string of the correct answer based on the 'gold' key """ choices = example[self.choices_key] gold_idx = example['gold'] @@ -894,12 +894,12 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - """ Runs text through the tokenizer and handle special cases. Args: - prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (Dict): the example as a dictionary. + prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): The specific example's derrived context + example (Dict): The example as a dictionary. Returns: - Dict: dictionary with the tokenized data + Dict: Dictionary with the tokenized data """ # NOTE: some of this is repeated from super class but for loop makes things considerably different tokenized_example = {} @@ -956,10 +956,10 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: which contiguous sequences of elements in the batch correspond to which question gold_indices indicates which of the [0, N-1] choices is the correct one for each question. Args: - data (List): list of tokenized datapoints (dicts returned by self._tokenize_example) + data (List): List of tokenized datapoints (dicts returned by self._tokenize_example) Returns: - Dict: dictionary for a single batch + Dict: Dictionary for a single batch """ batch = copy.deepcopy(self.base_batch) for data_pair in data: @@ -991,11 +991,11 @@ def split_batch(self, batch: Any, microbatch_size: int) -> List[Dict[str, Any]]: microbatch_size are tracked in logical example, we split logical attributes by microbatch_size and real attributes by microbatch_size * num_choices. Args: - batch (Dict): batch of data - microbatch_size (int): size of microbatches + batch (Dict): Batch of data + microbatch_size (int): Size of microbatches Returns: - list: list of chunked batches + list: List of chunked batches """ chunked = {} for k, v in batch.items(): @@ -1034,9 +1034,9 @@ class InContextLearningSchemaTaskDataset(InContextLearningMultipleChoiceTaskData to determine the model's choice of fill-in word. 
The default input format is a jsonl file with the following fields: - - context_options: list of strings corresponding to possible preceding context options for the continuation - - gold: index of the correct context from 'context_options' - - continuation: the finishing continuation + - context_options: List of strings corresponding to possible preceding context options for the continuation + - gold: Index of the correct context from 'context_options' + - continuation: The finishing continuation Each batch then consists of ``batch_size // N`` distinct tasks and has the following the structure - input_ids: Input tensor ``batch x seqlen x # of tokens`` @@ -1073,12 +1073,12 @@ def _construct_context(self, example, preceding_text: str = '', add_answer: bool Takes a example and constructs a context with the correct context for the example's continuation. Args: - example (Dict): the example from which to construct the context - preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning - add_answer (bool): this will always be true when calling this function for SchemaTaskDataset + example (Dict): The example from which to construct the context + preceding_text (str): Any preceding text, needed to if self.example_delimiter is needed at the beginning + add_answer (bool): This will always be true when calling this function for SchemaTaskDataset Returns: - str: the single correct context for a given continuation + str: The single correct context for a given continuation """ context_options = example[self.choices_key] gold_idx = example['gold'] @@ -1095,11 +1095,11 @@ def _construct_multiple_contexts(self, example: Dict, preceding_text: str = '') prompt or fewshot examples). Args: - example (Dict): the example from which to construct the context - preceding_text (str): any preceding text, needed to if self.example_delimiter is needed at the beginning + example (Dict): The example from which to construct the context + preceding_text (str): Any preceding text, needed to if self.example_delimiter is needed at the beginning Returns: - list: all context options for the selected example with formatting + list: All context options for the selected example with formatting """ context_options = example[self.choices_key] if len(preceding_text) > 0: @@ -1126,13 +1126,13 @@ def _prep_example( Args: example (Dict): A dictionary from the hf dataset - example_idx (int): the index of example + example_idx (int): The index of example num_fewshot (int): Number of examples context/continuation pairs to prepend to the test pair prompt_string (str): The prompt to prepend to all inputs fewshot_rng (random.Random): Random number generator to use for fewshot sampling Returns: - Dict: contains a dictionary with the tokenized data + Dict: Contains a dictionary with the tokenized data """ prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot) @@ -1144,12 +1144,12 @@ def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], Runs text through the tokenizer and handle special cases. Args: - prompt_and_fewshot (str): the collection of the prompt and fewshot examples that belongs before the example's context - ctx (str): the specific example's derrived context - example (Dict): the example as a dictionary. 
+ prompt_and_fewshot (str): The collection of the prompt and fewshot examples that belongs before the example's context + ctx (str): The specific example's derrived context + example (Dict): The example as a dictionary. Returns: - Dict: dictionary with the tokenized data + Dict: Dictionary with the tokenized data """ tokenized_example = {} preamble = self.tokenizer(prompt_and_fewshot)['input_ids'] @@ -1190,36 +1190,36 @@ class InContextLearningCodeEvalDataset(InContextLearningDataset): The input format is expected to be a jsonl file with the following fields: - - task_id: label of given task - - prompt: the code snippet that must be completed - - entry_point: the entry to the function/code snippet to generate - - canonical_solution: working solution - - test: the checker code that will run to completion if the code generation is valid and otherwise throw assertion - - test_inputs: list of test inputs - - test_outputs: list of test outputs - - language: the language of the code snippet + - task_id: Label of given task + - prompt: The code snippet that must be completed + - entry_point: The entry to the function/code snippet to generate + - canonical_solution: Working solution + - test: The checker code that will run to completion if the code generation is valid and otherwise throw assertion + - test_inputs: List of test inputs + - test_outputs: List of test outputs + - language: The language of the code snippet Each batch then consists of the following the structure - input_ids: Input tensor batch x seqlen x num tokens - mode: Indicates to the model that this is an ICL task and may rely on a custom code path to properly update metrics - - mode: always set to 'generate' - - labels: exact solution for the coding problem - - prompts: prompt for the task - - entry_points: list of entry points - - test_inputs: list of test inputs - - test_outputs: list of test outputs - - languages: list of languages - - pass_at_k: passed value for pass_at_k - - generation_length: derrived maximum generation length + - mode: Always set to 'generate' + - labels: Exact solution for the coding problem + - prompts: Prompt for the task + - entry_points: List of entry points + - test_inputs: List of test inputs + - test_outputs: List of test outputs + - languages: List of languages + - pass_at_k: Passed value for pass_at_k + - generation_length: Derrived maximum generation length - generation_kwargs: Dictionary of kwargs neeeded for generation. Includes the following, which will be individually overwritten by keys in generaiton_kwargs if set (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig for more details): - pad_token_id: ID for padding token, derived automatically - - num_beams: how many beams to search for generations, set to 1 - - num_return_sequences: value passed for 'generations_per_sample', how many generations per prompt - - do_sample: determines whether model is sampling or greedily decoding. Always set to True + - num_beams: How many beams to search for generations, set to 1 + - num_return_sequences: Value passed for 'generations_per_sample', how many generations per prompt + - do_sample: Determines whether model is sampling or greedily decoding. Always set to True - use_cache: Whether or not to use past key values to speed up sampling. Always set to True Additional Args: @@ -1321,7 +1321,7 @@ def _trim_padding(self, example: Dict): prompt length until after we've tokenized it. 
Returns: - dataset: a HuggingFace Dataset with different padding lengths for example[self.context_key] + dataset: A HuggingFace Dataset with different padding lengths for example[self.context_key] """ # Remove padding tokens applied during tokenization unpadded_prompt = [token for token in example[self.context_key] if token != self.pad_tok_id] @@ -1653,7 +1653,7 @@ def get_icl_task_dataloader( question_prelimiter: (str, default = ''): Text to be prepended before each context, including few shot examples (e.g. "Question: "). fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling pass_at_k (int): k for how many chances the model gets to write passing code. - generations_per_sample (int): how many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict. + generations_per_sample (int): How many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict. cot_delimiter (str): Delimiter to place between chain of thoughts and continuations. has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. From c65aab852de8307c339aa743742b9c98f4ce4de0 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 21:50:17 +0000 Subject: [PATCH 113/116] revert default changes --- composer/datasets/in_context_learning_evaluation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 5a4d8777b6..7dab605ba9 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1576,14 +1576,14 @@ def get_icl_task_dataloader( max_seq_len: int, pad_tok_id: int, num_fewshot: int, - prompt_string: str = '', # e.g. 'translate english to french:' - example_delimiter: str = '\n', # e.g. '\n' - continuation_delimiter: str = ' ', + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + continuation_delimiter: str = '', destination_path: str = '', question_prelimiter: str = '', # e.g. 
'Question: ' fewshot_random_seed: int = 1234, pass_at_k: int = 1, - generations_per_sample: int = 20, + generations_per_sample: int = 1, cot_delimiter: str = '', has_categories: bool = False, hf_loading_vars: Optional[Dict] = None, From 3d5c700f1ae9ea4ae830408e98ca842febed1b58 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 22:19:01 +0000 Subject: [PATCH 114/116] change update_generate_kwargs to public --- .../in_context_learning_evaluation.py | 72 +++++++++---------- .../test_in_context_learning_datasets.py | 42 +++++------ 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 7dab605ba9..f53aa62288 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -213,17 +213,17 @@ class InContextLearningDataset(Dataset): When creating a new ICL Dataset, it is likely that you will need to reimplement the following methods: - - _construct_context(): Takes a single example dictionary and formulates the context as a string for that eval question. - - _get_answer_from_example(): Takes a single example dictionary and formulates the correct, ground truth answer as a string. - - _tokenize_example(): Tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. - - _read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset._read_dataset()) + - construct_context(): Takes a single example dictionary and formulates the context as a string for that eval question. + - get_answer_from_example(): Takes a single example dictionary and formulates the correct, ground truth answer as a string. + - tokenize_example(): Tokenizes the example and adds any extra content from the original dictionary that needs to be passed downstream. + - read_dataset(): Loads the dataset and does basic parsing. If additional parsing must be done, this is a good place to do so (See InContextLearningQATaskDataset.read_dataset()) Additionally, base_batch and batch_mapping must be defined. - base_batch (Dict): The base dictionary that the dataset will use to construct a batch. This should contain static values, like generation_kwargs or mode, and empty lists for values that will need to be accumulated from each example. NOTE: Sometimes you will need to set base_batch directly after the init call, e.g. in order to use class variables - like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self._update_generation_kwargs() + like self.pad_tok_id or self.max_answer_length. If you manually set generation_kwargs this way, you'll need to call self.update_generation_kwargs() after setting self.base_batch. - batch_mapping (Dict): A mapping with keys that are keys in the batch and values that are columns in the loaded dataset. collate_fn will use this mapping to create batches from self.dataset. 
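[Editor's note - illustrative sketch, not part of the patch.] As a quick illustration of the subclassing contract described in the docstring above, a minimal task dataset built on the newly public hooks could look roughly like the following. The class name, base_batch fields, and column choices are hypothetical; only the attribute and method names (pad_tok_id, context_key, answer_key, prefix_space, update_generation_kwargs) come from this file.

from composer.datasets.in_context_learning_evaluation import InContextLearningDataset


class MyGenerationTaskDataset(InContextLearningDataset):
    """Sketch only: shows where base_batch, batch_mapping, and update_generation_kwargs fit."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # base_batch references class variables like self.pad_tok_id, so it is set after
        # __init__ and any user-supplied generation_kwargs are re-applied afterwards.
        self.base_batch = {
            'input_ids': [],
            'labels': [],
            'mode': 'generate',
            'generation_kwargs': {'pad_token_id': self.pad_tok_id},
        }
        self.batch_mapping = {'input_ids': self.context_key, 'labels': self.answer_key}
        self.update_generation_kwargs(kwargs.get('generation_kwargs', {}))

    def get_answer_from_example(self, example, in_context=False):
        answer = example[self.answer_key]
        # Some tokenizers need an explicit leading space on the target text.
        if self.prefix_space and not answer.startswith(' ') and not in_context:
            answer = f' {answer}'
        return answer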
@@ -315,14 +315,14 @@ def __init__( self.tokenize_labels = tokenize_labels self.batch_mapping = batch_mapping or {} self.base_batch = base_batch or {} - self._update_generation_kwargs(generation_kwargs or {}) + self.update_generation_kwargs(generation_kwargs or {}) self.static_keys = static_keys self.list_keys = list_keys self.tensor_keys = tensor_keys hf_loading_vars = hf_loading_vars or {} - self.dataset: HFDataset = self._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) + self.dataset: HFDataset = self.read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.strip_data = strip_dataset if self.strip_data: self.dataset = self.dataset.map(strip_data) @@ -347,7 +347,7 @@ def __len__(self) -> int: def get_num_samples_in_batch(self, batch: Dict) -> int: return batch['input_ids'].shape[0] - def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: + def update_generation_kwargs(self, generation_kwargs: Dict) -> None: """ Updates self.base_batch with the passed in generation_kwargs. This must be run after self.base_batch is set (for example, if self.base_batch is set after __init__() is run, @@ -361,11 +361,11 @@ def _update_generation_kwargs(self, generation_kwargs: Dict) -> None: if generation_kwargs: self.base_batch['generation_kwargs'].update(generation_kwargs) - def _read_dataset(self, - dataset_uri: str, - destination_path: str, - hf_loading_vars: Optional[Dict[str, Any]] = None, - hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'HFDataset': + def read_dataset(self, + dataset_uri: str, + destination_path: str, + hf_loading_vars: Optional[Dict[str, Any]] = None, + hf_parsing_map: Optional[Dict[str, Any]] = None) -> 'HFDataset': """ Reads a dataset and handles parsing it from HuggingFace. @@ -435,7 +435,7 @@ def _generate_few_shot_prompt( fewshot_rng, ) for fewshot_idx in fewshot_idxs: - ctxt = self._construct_context( + ctxt = self.construct_context( self.dataset[fewshot_idx], few_shot_text, add_answer=True, @@ -444,7 +444,7 @@ def _generate_few_shot_prompt( return few_shot_text - def _construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: + def construct_context(self, example: Dict, preceding_text: str = '', add_answer: bool = False) -> str: """ Takes an example and constructs a context, i.e. the input the model reads for this example. Optionally adds the correct answer (for fewshot examples) and handles example delimiters @@ -464,10 +464,10 @@ def _construct_context(self, example: Dict, preceding_text: str = '', add_answer ctxt = f'{self.example_delimiter}{ctxt}' ctxt = f'{ctxt}{self.continuation_delimiter}' if add_answer: - ctxt = f'{ctxt}{self._get_answer_from_example(example, in_context=add_answer)}' + ctxt = f'{ctxt}{self.get_answer_from_example(example, in_context=add_answer)}' return ctxt - def _get_answer_from_example(self, example: Dict[str, Any], in_context: bool = False) -> str: + def get_answer_from_example(self, example: Dict[str, Any], in_context: bool = False) -> str: """ Returns the answer from the example. @@ -500,7 +500,7 @@ def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: input_ids = input_ids[:-1] return input_ids - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handle special cases. 
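[Editor's note - illustrative sketch, not part of the patch.] For orientation, the default string assembly performed by the now-public construct_context / get_answer_from_example pair reduces to roughly the standalone sketch below. The delimiter values are example choices, not defaults asserted by this patch.

def build_context_sketch(example: dict,
                         preceding_text: str = '',
                         add_answer: bool = False,
                         prelimiter: str = 'Question: ',
                         example_delimiter: str = '\n',
                         continuation_delimiter: str = '\nAnswer: ') -> str:
    # Mirrors the documented behaviour: prelimiter + context, an example delimiter
    # when there is preceding prompt/fewshot text, then the continuation delimiter,
    # and (for fewshot examples only) the gold answer appended at the end.
    ctxt = f"{prelimiter}{example['context']}"
    if len(preceding_text) > 0:
        ctxt = f'{example_delimiter}{ctxt}'
    ctxt = f'{ctxt}{continuation_delimiter}'
    if add_answer:
        ctxt = f"{ctxt}{example['answer']}"
    return ctxt


# Usage sketch:
# build_context_sketch({'context': 'quas quas exort', 'answer': 'ice wall'},
#                      preceding_text='prompt', add_answer=True)
# -> '\nQuestion: quas quas exort\nAnswer: ice wall'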
@@ -529,7 +529,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - if self.tokenize_labels: # Never add special tokens to answer - tokenized_answer = self.tokenizer(self._get_answer_from_example(example), + tokenized_answer = self.tokenizer(self.get_answer_from_example(example), add_special_tokens=False)['input_ids'] assert isinstance(tokenized_answer, list) trimmed_context = _trim_context(tokenized_context, tokenized_answer, self.padding_size) @@ -553,7 +553,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - self.padding_side) tokenized_example[self.context_key] = padded_context - tokenized_example[self.answer_key] = self._get_answer_from_example(example) + tokenized_example[self.answer_key] = self.get_answer_from_example(example) return tokenized_example @@ -582,8 +582,8 @@ def _prep_example( Dict: Contains a dictionary with the tokenized data """ prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng) - ctxt = self._construct_context(example, prompt_and_fewshot, add_answer=False) - tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) + ctxt = self.construct_context(example, prompt_and_fewshot, add_answer=False) + tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: @@ -700,16 +700,16 @@ def __init__(self, 'input_ids': self.context_key, 'labels': 'aliases', } - self._update_generation_kwargs(kwargs.get('generation_kwargs', {})) + self.update_generation_kwargs(kwargs.get('generation_kwargs', {})) - def _read_dataset( + def read_dataset( self, dataset_uri: str, destination_path: str, hf_loading_vars: Dict, hf_parsing_map: Dict, ) -> 'HFDataset': - dataset = super()._read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) + dataset = super().read_dataset(dataset_uri, destination_path, hf_loading_vars, hf_parsing_map) self.has_cot = 'chain_of_thought' in dataset.features dataset = dataset.map( lambda examples: { @@ -723,7 +723,7 @@ def _read_dataset( self.padding_size = self.max_seq_len - self.max_answer_length return dataset - def _get_answer_from_example(self, example: Dict, in_context=False) -> str: + def get_answer_from_example(self, example: Dict, in_context=False) -> str: """ Returns the answer from the example. Applies chain of thought if self.has_cot is marked as true. Args: @@ -737,7 +737,7 @@ def _get_answer_from_example(self, example: Dict, in_context=False) -> str: else: return example[self.answer_key] - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Run text through the tokenizer and handle special cases. 
Args: @@ -748,7 +748,7 @@ def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) - Returns: Dict: Dictionary with the tokenized data """ - tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['aliases'] = list(example.get('aliases', [])) return tokenized_example @@ -877,7 +877,7 @@ def __init__(self, self.batch_mapping_per_choice = {'input_ids': 'context', 'labels': 'context'} self.batch_map_per_example = {'gold_indices': 'gold'} - def _get_answer_from_example(self, example: Dict, in_context=False) -> str: + def get_answer_from_example(self, example: Dict, in_context=False) -> str: """ Returns the correct answer from the example's choices. Args: @@ -890,7 +890,7 @@ def _get_answer_from_example(self, example: Dict, in_context=False) -> str: gold_idx = example['gold'] return choices[gold_idx] - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handle special cases. Args: @@ -1068,7 +1068,7 @@ def __init__(self, choices_key='context_options', *args, **kwargs): 'choice_groupings': [], } - def _construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str: + def construct_context(self, example, preceding_text: str = '', add_answer: bool = False) -> str: """ Takes a example and constructs a context with the correct context for the example's continuation. @@ -1136,10 +1136,10 @@ def _prep_example( """ prompt_and_fewshot = self._generate_few_shot_prompt(num_fewshot, example_idx, prompt_string, fewshot_rng) ctxt = self._construct_multiple_contexts(example, prompt_and_fewshot) - tokenized_example = self._tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example = self.tokenize_example(prompt_and_fewshot, ctxt, example) return tokenized_example - def _tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]: + def tokenize_example(self, prompt_and_fewshot: str, context_options: List[str], example: Dict) -> Dict[str, Any]: """ Runs text through the tokenizer and handle special cases. @@ -1290,7 +1290,7 @@ def __init__( 'eos_token_id': self.tokenizer.eos_token_id } } - self._update_generation_kwargs(kwargs.get('generation_kwargs', {})) + self.update_generation_kwargs(kwargs.get('generation_kwargs', {})) def _set_max_prompt_and_answer_lengths(self): """ @@ -1332,12 +1332,12 @@ def _trim_padding(self, example: Dict): example[self.context_key] = padded_context return example - def _tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: + def tokenize_example(self, prompt_and_fewshot: str, ctxt: str, example: Dict) -> Dict[str, Any]: """ Adds extra code task details to the example dictionary. 
See InContextLearningDataset for more details """ - tokenized_example = super()._tokenize_example(prompt_and_fewshot, ctxt, example) + tokenized_example = super().tokenize_example(prompt_and_fewshot, ctxt, example) tokenized_example['prompt_text'] = example['prompt'] tokenized_example['task_id'] = example['task_id'] tokenized_example['canonical_solution'] = example['canonical_solution'] diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 065c90cc37..9a98e2b174 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -284,16 +284,16 @@ def test_construct_context(tiny_gpt2_tokenizer, tmp_path): destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map) - constructed_context = dl._construct_context({'context': 'quas quas exort', 'answer': 'ice wall'}) + constructed_context = dl.construct_context({'context': 'quas quas exort', 'answer': 'ice wall'}) assert constructed_context == 'Orbs: quas quas exort\nSpell: ' - constructed_context = dl._construct_context({'context': 'quas quas exort', 'answer': 'ice wall'}, add_answer=True) + constructed_context = dl.construct_context({'context': 'quas quas exort', 'answer': 'ice wall'}, add_answer=True) assert constructed_context == 'Orbs: quas quas exort\nSpell: ice wall' - constructed_context = dl._construct_context({ + constructed_context = dl.construct_context({ 'context': 'quas quas exort', 'answer': 'ice wall' }, - preceding_text='The harsh White Waste beckons!', - add_answer=True) + preceding_text='The harsh White Waste beckons!', + add_answer=True) assert constructed_context == '\nOrbs: quas quas exort\nSpell: ice wall' @@ -323,7 +323,7 @@ def test_get_answer_from_example(tiny_gpt2_tokenizer, tmp_path): destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map) - answer = dl._get_answer_from_example({'context': 'wex exort exort', 'answer': 'alacrity'}) + answer = dl.get_answer_from_example({'context': 'wex exort exort', 'answer': 'alacrity'}) assert answer == ' alacrity' @@ -390,8 +390,8 @@ def test_tokenize_example_with_tokenize_labels(tiny_gpt2_tokenizer, tmp_path): hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, tokenize_labels=True) - tokenized_example = dl._tokenize_example('What spell does this invoke? ', 'exort exort wex\nSpell: ', - {'answer': ' Meatball'}) + tokenized_example = dl.tokenize_example('What spell does this invoke? ', 'exort exort wex\nSpell: ', + {'answer': ' Meatball'}) tokenized_input = [2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198, 31221, 25, 19145, 1894] assert tokenized_example['context'][:len(tokenized_input)].tolist() == tokenized_input assert tokenized_example['context'][-1] == tokenizer.eos_token_id @@ -427,8 +427,8 @@ def test_tokenize_example_with_no_tokenize_labels(tiny_gpt2_tokenizer, tmp_path) hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, tokenize_labels=False) - tokenized_example = dl._tokenize_example('What spell does this invoke? ', 'exort exort wex\nSpell: ', - {'answer': ' Meatball'}) + tokenized_example = dl.tokenize_example('What spell does this invoke? 
', 'exort exort wex\nSpell: ', + {'answer': ' Meatball'}) tokenized_input = [2061, 4822, 857, 428, 26342, 30, 220, 1069, 419, 409, 419, 356, 87, 198, 31221, 25] assert tokenized_example['context'][:len(tokenized_input)].tolist() == tokenized_input assert tokenized_example['context'][-1] == tokenizer.eos_token_id @@ -528,7 +528,7 @@ def test_qa_get_answer_from_example_with_no_cot(tmp_path, tiny_gpt2_tokenizer): cot_delimiter=' ### ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), ) - answer = dl._get_answer_from_example({ + answer = dl.get_answer_from_example({ 'context': 'empty', 'answer': 'this is the correct answer', 'chain_of_thought': "Let's think step by step. " @@ -557,7 +557,7 @@ def test_qa_get_answer_from_example_with_cot(tmp_path, tiny_gpt2_tokenizer): destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), ) dl.has_cot = True - answer = dl._get_answer_from_example({ + answer = dl.get_answer_from_example({ 'context': 'empty', 'answer': 'this is the correct answer', 'chain_of_thought': "Let's think step by step. " @@ -586,7 +586,7 @@ def test_qa_tokenize_example(tiny_gpt2_tokenizer, tmp_path): destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), ) dl.has_cot = True - tokenized_example = dl._tokenize_example( + tokenized_example = dl.tokenize_example( 'starting prompt', 'a context', { 'context': 'empty', 'answer': 'this is the correct answer', @@ -680,9 +680,9 @@ def test_mc_tokenize_example(tiny_gpt2_tokenizer, tmp_path): 'choices': ['A', 'B', 'C', 'D'], 'gold': 2 } - tokenized_example = dl._tokenize_example(prompt_and_fewshot='Answer the following: ', - ctxt=example['context'], - example=example) + tokenized_example = dl.tokenize_example(prompt_and_fewshot='Answer the following: ', + ctxt=example['context'], + example=example) unpadded_queries = [context[context != tokenizer.eos_token_id] for context in tokenized_example['query']] untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_queries] correct_output = [ @@ -714,9 +714,9 @@ def test_schema_construct_context(tiny_gpt2_tokenizer, tmp_path): destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), ) example = {'context_options': ['cont one', 'cont two'], 'gold': 0, 'continuation': 'this is a continuation'} - constructed_context = dl._construct_context(example) + constructed_context = dl.construct_context(example) assert constructed_context == 'cont one ### this is a continuation' - constructed_context = dl._construct_context(example, preceding_text='text') + constructed_context = dl.construct_context(example, preceding_text='text') assert constructed_context == '\ncont one ### this is a continuation' @@ -768,9 +768,9 @@ def test_schema_tokenize_example(tiny_gpt2_tokenizer, tmp_path): destination_path=str(tmp_path / 'test_human_eval_small.jsonl'), ) example = {'context_options': ['context one', 'context two'], 'gold': 0, 'continuation': 'this is a continuation'} - tokenized_example = dl._tokenize_example(prompt_and_fewshot='prompt ', - context_options=example['context_options'], - example=example) + tokenized_example = dl.tokenize_example(prompt_and_fewshot='prompt ', + context_options=example['context_options'], + example=example) assert all(tiny_gpt2_tokenizer.decode(cont) == ' this is a continuation' for cont in tokenized_example['answer']) unpadded_inputs = [context[context != tokenizer.eos_token_id] for context in tokenized_example['context_options']] untokenized_inputs = [tokenizer.decode(unpadded_input) for unpadded_input in unpadded_inputs] From 
5e11203512c2210f0aed3c57fb6326562a0cee77 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 22:31:13 +0000 Subject: [PATCH 115/116] fix type --- composer/datasets/in_context_learning_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index f53aa62288..4e22de70ba 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -302,7 +302,7 @@ def __init__( self.max_seq_len = max_seq_len if not isinstance(pad_tok_id, int): - raise ValueError(f'`InContextLearningDataset` must be an integer. Found {pad_tok_id} instead') + raise ValueError(f'`InContextLearningDataset` must be an integer. Found {type(pad_tok_id)} instead') self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot self.padding_side = padding_side From d48a3a99fa9126603885c9f8b90abc78bf8d0a93 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 25 Jan 2024 23:22:50 +0000 Subject: [PATCH 116/116] move pad_tok_id error --- composer/datasets/in_context_learning_evaluation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 4e22de70ba..4e0e30f1ff 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -130,6 +130,10 @@ def _make_padded_input(context_enc: List, ) (inp_len,) = inp.shape + # Sometimes tokenizers that have neither a pad_tok_id or eos_tok_id will pass None in as the padding + # token and cause errors + if not isinstance(pad_tok_id, int): + raise ValueError(f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead') # pad length from seq to padding_length if padding_side == 'right': inp = torch.cat( @@ -301,8 +305,6 @@ def __init__( self.prefix_space = _tokenizer_needs_prefix_space(self.tokenizer) self.max_seq_len = max_seq_len - if not isinstance(pad_tok_id, int): - raise ValueError(f'`InContextLearningDataset` must be an integer. Found {type(pad_tok_id)} instead') self.pad_tok_id = pad_tok_id self.num_fewshot = num_fewshot self.padding_side = padding_side
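[Editor's note - illustrative sketch, not part of the patch.] To make the effect of the final two commits concrete, the relocated pad_tok_id check can be pictured with this simplified stand-in for the padding helper. It is a sketch of the behaviour visible in the hunk above (and assumes the input was already trimmed to fit the padding window), not the verbatim library code.

import torch


def make_padded_input_sketch(inp: torch.Tensor,
                             padding_length: int,
                             pad_tok_id,
                             padding_side: str = 'left') -> torch.Tensor:
    # Tokenizers that define neither pad_token_id nor eos_token_id can hand in None,
    # which is why the type check now lives inside the padding helper itself.
    if not isinstance(pad_tok_id, int):
        raise ValueError(f'`pad_tok_id` must be an integer. Found {type(pad_tok_id)} instead')
    (inp_len,) = inp.shape
    # Assumes inp_len <= padding_length, i.e. the context was trimmed beforehand.
    pad = torch.full((padding_length - inp_len,), pad_tok_id, dtype=inp.dtype)
    # Left padding keeps the prompt flush against the generation boundary.
    return torch.cat([pad, inp]) if padding_side == 'left' else torch.cat([inp, pad])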