add response_field in data_processing (#68)
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
fabianlim authored Aug 23, 2024
1 parent 48426a1 · commit f4710e7
Showing 2 changed files with 29 additions and 1 deletion.
scripts/benchmarks/benchmark.py: 2 additions & 0 deletions
@@ -166,6 +166,7 @@ def __init__(
        dataset_text_field: str = "output",
        chat_template: str = None,
        response_template: str = None,
        response_field: str = None,
        additional_dataset_kwargs: Dict = {},
    ) -> None:

@@ -180,6 +181,7 @@ def __init__(
"tokenize": tokenize,
"input_field": input_field,
"dataset_text_field": dataset_text_field,
"response_field": response_field,
"chat_template": chat_template,
}
self.training_paths = {} # cache to store the training paths
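
For context, the new response_field kwarg targets datasets where the assistant response lives in its own column rather than being recovered from the formatted text via a response_template. A minimal sketch of such a record (the field names here are illustrative, not taken from the benchmark configs):

example = {
    "input": "What is the capital of France?",    # rendered by the chat template
    "output": "The capital of France is Paris.",  # the column named by response_field
}

With tokenize=True, the chat template is expected to render only the prompt side; the response column is then tokenized separately, which is what the data_processing.py changes below implement.
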
scripts/benchmarks/data_processing.py: 27 additions & 1 deletion
@@ -1,5 +1,6 @@
# Standard
from typing import Callable, Dict, List
import warnings

# Third Party
from transformers import PreTrainedTokenizer
@@ -16,6 +17,7 @@ def build_data_formatting_func(
    dataset_text_field: str = "output",
    features: List = None,
    response_template: str = None,
    response_field: str = None,
    chat_template: str = None,
):
    if tokenizer is None or chat_template is None:
@@ -36,6 +38,7 @@ def build_data_formatting_func(
        dataset_text_field,
        features,
        response_template,
        response_field,
    )


@@ -47,19 +50,42 @@ def _build_data_formatting_func(
    dataset_text_field: str = "output",
    features: List = None,
    response_template: str = None,
    response_field: str = None,
    ignore_index: int = -100,
):

    tokenizer.chat_template = chat_template

    loss_masking = None
    if tokenize and response_template is not None:
        loss_masking = instruction_mask_loss(tokenizer, response_template)
    elif tokenize and response_template is None:
        assert response_field is not None, \
            "response_field must be specified if tokenize=True and response_template=None."

    def _format(example):
        formatted_and_maybe_tokenized = tokenizer.apply_chat_template(
            [example], tokenize=tokenize
        )
        key = "input_ids" if tokenize else dataset_text_field

        if tokenize and response_template is None and response_field:
            # in this case we need to use the response field to tokenize
            warnings.warn(
                "chat_template passed in with tokenize=True and "
                "response_template was None. To ensure loss masking is "
                f"correct, please do not put response_field '{response_field}' "
                "in the chat template."
            )
            # NOTE: in this case the attention mask is not handled
            response = tokenizer(example[response_field])['input_ids']
            return {
                key: formatted_and_maybe_tokenized + response,
                'labels': [ignore_index] * len(formatted_and_maybe_tokenized) + response,
            }

        loss_masking = instruction_mask_loss(tokenizer, response_template)

        if not loss_masking:
            return {key: formatted_and_maybe_tokenized}
        return loss_masking(formatted_and_maybe_tokenized)
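
To make the new branch concrete, here is a small illustrative sketch (the token id values below are made up) of what _format returns when tokenize=True, response_template=None, and response_field is set:

# prompt side, rendered and tokenized by apply_chat_template
formatted = [101, 7592, 2088, 102]
# response text taken from example[response_field] and tokenized separately
response = [2023, 2003, 1996, 3437]

ignore_index = -100
out = {
    "input_ids": formatted + response,
    # the prompt positions are masked out, so loss is computed only on the response tokens
    "labels": [ignore_index] * len(formatted) + response,
}
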
@@ -193,4 +219,4 @@ def collate_example(example):
        # flatten the additional dim
        return {k: v.view(-1) for k, v in collated_example.items()}

    return collate_example
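
As a side note on that final flatten, a minimal sketch (the [1, seq_len] shape is an assumption about how the upstream collator wraps a single example) of what the .view(-1) does:

import torch

# a collator applied to a single wrapped example typically yields [1, seq_len] tensors
collated_example = {
    "input_ids": torch.tensor([[101, 7592, 102]]),
    "labels": torch.tensor([[-100, 7592, 102]]),
}

# flatten the additional dim, as in collate_example above
flat = {k: v.view(-1) for k, v in collated_example.items()}
# flat["input_ids"] is now a 1-D tensor of length 3
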
