From 754fe7fc8a1bd4858fe023ceabfb21f7672aa42d Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 27 Dec 2023 11:04:05 -0800 Subject: [PATCH 1/6] default to using tokenizer eos and bos --- scripts/data_prep/convert_text_to_mds.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index dc7c514d75..dd36c60658 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -8,7 +8,7 @@ from argparse import ArgumentParser, Namespace from concurrent.futures import ProcessPoolExecutor from glob import glob -from typing import Iterable, List, Tuple, cast +from typing import Iterable, List, Optional, Tuple, cast import psutil from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, @@ -109,11 +109,6 @@ def parse_args() -> Namespace: parser.error( 'When setting --concat_tokens, you must specify a --tokenizer') - # now that we have validated them, change BOS/EOS to strings - if parsed.bos_text is None: - parsed.bos_text = '' - if parsed.eos_text is None: - parsed.eos_text = '' return parsed @@ -328,13 +323,13 @@ def convert_text_to_mds( output_folder: str, input_folder: str, concat_tokens: int, - eos_text: str, - bos_text: str, no_wrap: bool, compression: str, processes: int, args_str: str, reprocess: bool, + bos_text: Optional[str] = None, + eos_text: Optional[str] = None, ): """Convert a folder of text files to MDS format. @@ -343,14 +338,20 @@ def convert_text_to_mds( output_folder (str): Folder to write MDS shards to input_folder (str): Folder of text files to process concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples no_wrap: (bool): Whether to let text examples wrap across multiple training examples compression (str): The compression algorithm to use for MDS writing processes (int): The number of processes to use. args_str (str): String representation of the arguments reprocess (bool): Whether to always reprocess the given folder of text files + bos_text (Optional[str]): Text to prepend to each example to separate concatenated samples + If None, default to using the tokenizer's specified bos_text. + eos_text (Optional[str]): Text end to append to each example to separate concatenated samples + If None, default to using the tokenizer's specified eos_text. """ + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + bos_text = tokenizer.bos_token if bos_text is None else bos_text + eos_text = tokenizer.eos_token if eos_text is None else eos_text + is_remote_output = is_remote_path(output_folder) object_names = get_object_names(input_folder) From 50588ac00d43ee87798c41959b379502a86263da Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 27 Dec 2023 16:44:25 -0800 Subject: [PATCH 2/6] pyright fixes --- scripts/data_prep/convert_text_to_mds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index dd36c60658..d3679c309d 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -351,6 +351,7 @@ def convert_text_to_mds( tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) bos_text = tokenizer.bos_token if bos_text is None else bos_text eos_text = tokenizer.eos_token if eos_text is None else eos_text + assert bos_text is not None and eos_text is not None # for pyright is_remote_output = is_remote_path(output_folder) From 0fb487fc4697467aea47faae0a301bf26ad88b4a Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 28 Dec 2023 15:50:46 -0800 Subject: [PATCH 3/6] Take into account tokenizers that automatically add bos token --- scripts/data_prep/convert_text_to_mds.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index d3679c309d..e954590e6b 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -221,7 +221,8 @@ def download_and_convert( downloading_iter = DownloadingIterable(object_names=file_names, output_folder=tmp_dir, object_store=object_store) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, + add_bos_token=False) tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up @@ -344,14 +345,25 @@ def convert_text_to_mds( args_str (str): String representation of the arguments reprocess (bool): Whether to always reprocess the given folder of text files bos_text (Optional[str]): Text to prepend to each example to separate concatenated samples - If None, default to using the tokenizer's specified bos_text. + If None, use the tokenizer's bos_token if tokenizer.add_bos_token is True, otherwise use an empty string. eos_text (Optional[str]): Text end to append to each example to separate concatenated samples - If None, default to using the tokenizer's specified eos_text. + If None, use the tokenizer's eos_token. """ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - bos_text = tokenizer.bos_token if bos_text is None else bos_text - eos_text = tokenizer.eos_token if eos_text is None else eos_text - assert bos_text is not None and eos_text is not None # for pyright + + if bos_text is None: + if hasattr(tokenizer, 'add_bos_token'): + if tokenizer.add_bos_token: + tokenizer_bos = tokenizer.bos_token + bos_text = tokenizer_bos if tokenizer_bos is not None else '' + else: + bos_text = '' + + if eos_text is None: + tokenizer_eos = tokenizer.eos_token + eos_text = tokenizer_eos if tokenizer_eos is not None else '' + + assert bos_text is not None and eos_text is not None # for pyright is_remote_output = is_remote_path(output_folder) From b121d746adf547a8f065440c5b1a5d2f7002cd7f Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 28 Dec 2023 15:53:11 -0800 Subject: [PATCH 4/6] set add_eos_token to false --- scripts/data_prep/convert_text_to_mds.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index e954590e6b..bacffa5b03 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -222,7 +222,8 @@ def download_and_convert( output_folder=tmp_dir, object_store=object_store) tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, - add_bos_token=False) + add_bos_token=False, + add_eos_token=False) tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up @@ -363,7 +364,7 @@ def convert_text_to_mds( tokenizer_eos = tokenizer.eos_token eos_text = tokenizer_eos if tokenizer_eos is not None else '' - assert bos_text is not None and eos_text is not None # for pyright + assert bos_text is not None and eos_text is not None # for pyright is_remote_output = is_remote_path(output_folder) From 4496f989ac4e117494bcd5c26ee58b2fdd97bf61 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 28 Dec 2023 16:06:14 -0800 Subject: [PATCH 5/6] small logic fix --- scripts/data_prep/convert_text_to_mds.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index bacffa5b03..8710315f9d 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -353,10 +353,9 @@ def convert_text_to_mds( tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) if bos_text is None: - if hasattr(tokenizer, 'add_bos_token'): - if tokenizer.add_bos_token: - tokenizer_bos = tokenizer.bos_token - bos_text = tokenizer_bos if tokenizer_bos is not None else '' + if hasattr(tokenizer, 'add_bos_token') and tokenizer.add_bos_token: + tokenizer_bos = tokenizer.bos_token + bos_text = tokenizer_bos if tokenizer_bos is not None else '' else: bos_text = '' From 655536c9e0914c3dd0199824e1734b2ef885e76d Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 28 Dec 2023 16:08:42 -0800 Subject: [PATCH 6/6] fix comment --- scripts/data_prep/convert_text_to_mds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 8710315f9d..2b3c8cbfbf 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -346,7 +346,7 @@ def convert_text_to_mds( args_str (str): String representation of the arguments reprocess (bool): Whether to always reprocess the given folder of text files bos_text (Optional[str]): Text to prepend to each example to separate concatenated samples - If None, use the tokenizer's bos_token if tokenizer.add_bos_token is True, otherwise use an empty string. + If None and tokenizer.add_bos_token is True, use the tokenizer's bos_token, otherwise use an empty string. eos_text (Optional[str]): Text end to append to each example to separate concatenated samples If None, use the tokenizer's eos_token. """