add response_field in data_processing (#68)
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
fabianlim authored Aug 23, 2024
1 parent 48426a1 · commit f4710e7
Showing 2 changed files with 29 additions and 1 deletion.
scripts/benchmarks/benchmark.py: 2 additions & 0 deletions
@@ -166,6 +166,7 @@ def __init__(
        dataset_text_field: str = "output",
        chat_template: str = None,
        response_template: str = None,
        response_field: str = None,
        additional_dataset_kwargs: Dict = {},
    ) -> None:

@@ -180,6 +181,7 @@ def __init__(
"tokenize": tokenize,
"input_field": input_field,
"dataset_text_field": dataset_text_field,
"response_field": response_field,
"chat_template": chat_template,
}
self.training_paths = {} # cache to store the training paths
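
For context, the new response_field kwarg targets datasets where the assistant response lives in its own column rather than being recovered from the formatted text via a response_template. A minimal sketch of such a record (the field names here are illustrative, not taken from the benchmark configs):

example = {
    "input": "What is the capital of France?",    # rendered by the chat template
    "output": "The capital of France is Paris.",  # the column named by response_field
}

With tokenize=True, the chat template is expected to render only the prompt side; the response column is then tokenized separately, which is what the data_processing.py changes below implement.
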
scripts/benchmarks/data_processing.py: 27 additions & 1 deletion
@@ -1,5 +1,6 @@
# Standard
from typing import Callable, Dict, List
import warnings

# Third Party
from transformers import PreTrainedTokenizer
@@ -16,6 +17,7 @@ def build_data_formatting_func(
    dataset_text_field: str = "output",
    features: List = None,
    response_template: str = None,
    response_field: str = None,
    chat_template: str = None,
):
    if tokenizer is None or chat_template is None:
@@ -36,6 +38,7 @@ def build_data_formatting_func(
        dataset_text_field,
        features,
        response_template,
        response_field,
    )


@@ -47,19 +50,42 @@ def _build_data_formatting_func(
    dataset_text_field: str = "output",
    features: List = None,
    response_template: str = None,
    response_field: str = None,
    ignore_index: int = -100,
):

    tokenizer.chat_template = chat_template

    loss_masking = None
    if tokenize and response_template is not None:
        loss_masking = instruction_mask_loss(tokenizer, response_template)
    elif tokenize and response_template is None:
        assert response_field is not None, \
            "response_field must be specified if tokenize=True and response_template=None."

    def _format(example):
        formatted_and_maybe_tokenized = tokenizer.apply_chat_template(
            [example], tokenize=tokenize
        )
        key = "input_ids" if tokenize else dataset_text_field

        if tokenize and response_template is None and response_field:
            # in this case we need to use the response field to tokenize
            warnings.warn(
                "chat_template passed in with tokenize=True and "
                "response_template was None. To ensure loss masking is "
                f"correct, please do not put response_field '{response_field}' "
                "in the chat template."
            )
            # NOTE: in this case the attention mask is not handled
            response = tokenizer(example[response_field])['input_ids']
            return {
                key: formatted_and_maybe_tokenized + response,
                'labels': [ignore_index] * len(formatted_and_maybe_tokenized) + response,
            }

        loss_masking = instruction_mask_loss(tokenizer, response_template)

        if not loss_masking:
            return {key: formatted_and_maybe_tokenized}
        return loss_masking(formatted_and_maybe_tokenized)
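
To make the new branch concrete, here is a small illustrative sketch (the token id values below are made up) of what _format returns when tokenize=True, response_template=None, and response_field is set:

# prompt side, rendered and tokenized by apply_chat_template
formatted = [101, 7592, 2088, 102]
# response text taken from example[response_field] and tokenized separately
response = [2023, 2003, 1996, 3437]

ignore_index = -100
out = {
    "input_ids": formatted + response,
    # the prompt positions are masked out, so loss is computed only on the response tokens
    "labels": [ignore_index] * len(formatted) + response,
}
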
@@ -193,4 +219,4 @@ def collate_example(example):
        # flatten the additional dim
        return {k: v.view(-1) for k, v in collated_example.items()}

    return collate_example
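
As a side note on that final flatten, a minimal sketch (the [1, seq_len] shape is an assumption about how the upstream collator wraps a single example) of what the .view(-1) does:

import torch

# a collator applied to a single wrapped example typically yields [1, seq_len] tensors
collated_example = {
    "input_ids": torch.tensor([[101, 7592, 102]]),
    "labels": torch.tensor([[-100, 7592, 102]]),
}

# flatten the additional dim, as in collate_example above
flat = {k: v.view(-1) for k, v in collated_example.items()}
# flat["input_ids"] is now a 1-D tensor of length 3
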
