From 754fe7fc8a1bd4858fe023ceabfb21f7672aa42d Mon Sep 17 00:00:00 2001
From: Irene Dea <deaairene@gmail.com>
Date: Wed, 27 Dec 2023 11:04:05 -0800
Subject: [PATCH 1/6] default to using tokenizer eos and bos

---
 scripts/data_prep/convert_text_to_mds.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index dc7c514d75..dd36c60658 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -8,7 +8,7 @@
 from argparse import ArgumentParser, Namespace
 from concurrent.futures import ProcessPoolExecutor
 from glob import glob
-from typing import Iterable, List, Tuple, cast
+from typing import Iterable, List, Optional, Tuple, cast
 
 import psutil
 from composer.utils import (ObjectStore, maybe_create_object_store_from_uri,
@@ -109,11 +109,6 @@ def parse_args() -> Namespace:
         parser.error(
             'When setting --concat_tokens, you must specify a --tokenizer')
 
-    # now that we have validated them, change BOS/EOS to strings
-    if parsed.bos_text is None:
-        parsed.bos_text = ''
-    if parsed.eos_text is None:
-        parsed.eos_text = ''
     return parsed
 
 
@@ -328,13 +323,13 @@ def convert_text_to_mds(
     output_folder: str,
     input_folder: str,
     concat_tokens: int,
-    eos_text: str,
-    bos_text: str,
     no_wrap: bool,
     compression: str,
     processes: int,
     args_str: str,
     reprocess: bool,
+    bos_text: Optional[str] = None,
+    eos_text: Optional[str] = None,
 ):
     """Convert a folder of text files to MDS format.
 
@@ -343,14 +338,20 @@ def convert_text_to_mds(
         output_folder (str): Folder to write MDS shards to
         input_folder (str): Folder of text files to process
         concat_tokens (int): Concantenate up to this many tokens
-        eos_text (str): Textend to append to each example to separate concatenated samples
-        bos_text (str): Text to prepend to each example to separate concatenated samples
         no_wrap: (bool): Whether to let text examples wrap across multiple training examples
         compression (str): The compression algorithm to use for MDS writing
         processes (int): The number of processes to use.
         args_str (str): String representation of the arguments
         reprocess (bool): Whether to always reprocess the given folder of text files
+        bos_text (Optional[str]): Text to prepend to each example to separate concatenated samples
+            If None, default to using the tokenizer's specified bos_text.
+        eos_text (Optional[str]): Text end to append to each example to separate concatenated samples
+            If None, default to using the tokenizer's specified eos_text.
     """
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+    bos_text = tokenizer.bos_token if bos_text is None else bos_text
+    eos_text = tokenizer.eos_token if eos_text is None else eos_text
+
     is_remote_output = is_remote_path(output_folder)
 
     object_names = get_object_names(input_folder)

From 50588ac00d43ee87798c41959b379502a86263da Mon Sep 17 00:00:00 2001
From: Irene Dea <deaairene@gmail.com>
Date: Wed, 27 Dec 2023 16:44:25 -0800
Subject: [PATCH 2/6] pyright fixes

---
 scripts/data_prep/convert_text_to_mds.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index dd36c60658..d3679c309d 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -351,6 +351,7 @@ def convert_text_to_mds(
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
     bos_text = tokenizer.bos_token if bos_text is None else bos_text
     eos_text = tokenizer.eos_token if eos_text is None else eos_text
+    assert bos_text is not None and eos_text is not None  # for pyright
 
     is_remote_output = is_remote_path(output_folder)
 

From 0fb487fc4697467aea47faae0a301bf26ad88b4a Mon Sep 17 00:00:00 2001
From: Irene Dea <deaairene@gmail.com>
Date: Thu, 28 Dec 2023 15:50:46 -0800
Subject: [PATCH 3/6] Take into account tokenizers that automatically add bos
 token

---
 scripts/data_prep/convert_text_to_mds.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index d3679c309d..e954590e6b 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -221,7 +221,8 @@ def download_and_convert(
         downloading_iter = DownloadingIterable(object_names=file_names,
                                                output_folder=tmp_dir,
                                                object_store=object_store)
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
+                                                  add_bos_token=False)
         tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
 
         # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
@@ -344,14 +345,25 @@ def convert_text_to_mds(
         args_str (str): String representation of the arguments
         reprocess (bool): Whether to always reprocess the given folder of text files
         bos_text (Optional[str]): Text to prepend to each example to separate concatenated samples
-            If None, default to using the tokenizer's specified bos_text.
+            If None, use the tokenizer's bos_token if tokenizer.add_bos_token is True, otherwise use an empty string.
         eos_text (Optional[str]): Text end to append to each example to separate concatenated samples
-            If None, default to using the tokenizer's specified eos_text.
+            If None, use the tokenizer's eos_token.
     """
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    bos_text = tokenizer.bos_token if bos_text is None else bos_text
-    eos_text = tokenizer.eos_token if eos_text is None else eos_text
-    assert bos_text is not None and eos_text is not None  # for pyright
+
+    if bos_text is None:
+        if hasattr(tokenizer, 'add_bos_token'):
+            if tokenizer.add_bos_token:
+                tokenizer_bos = tokenizer.bos_token
+                bos_text = tokenizer_bos if tokenizer_bos is not None else ''
+        else:
+            bos_text = ''
+
+    if eos_text is None:
+        tokenizer_eos = tokenizer.eos_token
+        eos_text = tokenizer_eos if tokenizer_eos is not None else ''
+
+    assert bos_text is not None and eos_text is not None # for pyright
 
     is_remote_output = is_remote_path(output_folder)
 

From b121d746adf547a8f065440c5b1a5d2f7002cd7f Mon Sep 17 00:00:00 2001
From: Irene Dea <deaairene@gmail.com>
Date: Thu, 28 Dec 2023 15:53:11 -0800
Subject: [PATCH 4/6] set add_eos_token to false

---
 scripts/data_prep/convert_text_to_mds.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index e954590e6b..bacffa5b03 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -222,7 +222,8 @@ def download_and_convert(
                                                output_folder=tmp_dir,
                                                object_store=object_store)
         tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
-                                                  add_bos_token=False)
+                                                  add_bos_token=False,
+                                                  add_eos_token=False)
         tokenizer.model_max_length = 5000000000  # Hack to prevent warnings from HuggingFace
 
         # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up
@@ -363,7 +364,7 @@ def convert_text_to_mds(
         tokenizer_eos = tokenizer.eos_token
         eos_text = tokenizer_eos if tokenizer_eos is not None else ''
 
-    assert bos_text is not None and eos_text is not None # for pyright
+    assert bos_text is not None and eos_text is not None  # for pyright
 
     is_remote_output = is_remote_path(output_folder)
 

From 4496f989ac4e117494bcd5c26ee58b2fdd97bf61 Mon Sep 17 00:00:00 2001
From: Irene Dea <deaairene@gmail.com>
Date: Thu, 28 Dec 2023 16:06:14 -0800
Subject: [PATCH 5/6] Fix bos_text staying None when add_bos_token is False

---
 scripts/data_prep/convert_text_to_mds.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index bacffa5b03..8710315f9d 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -353,10 +353,9 @@ def convert_text_to_mds(
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 
     if bos_text is None:
-        if hasattr(tokenizer, 'add_bos_token'):
-            if tokenizer.add_bos_token:
-                tokenizer_bos = tokenizer.bos_token
-                bos_text = tokenizer_bos if tokenizer_bos is not None else ''
+        if hasattr(tokenizer, 'add_bos_token') and tokenizer.add_bos_token:
+            tokenizer_bos = tokenizer.bos_token
+            bos_text = tokenizer_bos if tokenizer_bos is not None else ''
         else:
             bos_text = ''
 

From 655536c9e0914c3dd0199824e1734b2ef885e76d Mon Sep 17 00:00:00 2001
From: Irene Dea <deaairene@gmail.com>
Date: Thu, 28 Dec 2023 16:08:42 -0800
Subject: [PATCH 6/6] fix comment

---
 scripts/data_prep/convert_text_to_mds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py
index 8710315f9d..2b3c8cbfbf 100644
--- a/scripts/data_prep/convert_text_to_mds.py
+++ b/scripts/data_prep/convert_text_to_mds.py
@@ -346,7 +346,7 @@ def convert_text_to_mds(
         args_str (str): String representation of the arguments
         reprocess (bool): Whether to always reprocess the given folder of text files
         bos_text (Optional[str]): Text to prepend to each example to separate concatenated samples
-            If None, use the tokenizer's bos_token if tokenizer.add_bos_token is True, otherwise use an empty string.
+            If None, use the tokenizer's bos_token when tokenizer.add_bos_token is True; otherwise use an empty string.
         eos_text (Optional[str]): Text end to append to each example to separate concatenated samples
             If None, use the tokenizer's eos_token.
     """