From 4651be7de1aebc9fcfaefb1034a5c71920c9e708 Mon Sep 17 00:00:00 2001
From: xiaohanzhan-db
Date: Mon, 8 Jan 2024 05:49:17 +0000
Subject: [PATCH] Add questions

---
 scripts/data_prep/validate_and_tokenize_data.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py
index 30ba2e4456..dfa47b946b 100644
--- a/scripts/data_prep/validate_and_tokenize_data.py
+++ b/scripts/data_prep/validate_and_tokenize_data.py
@@ -57,6 +57,8 @@
 # MAGIC future: Literal[False] = False,
 # MAGIC }
 # MAGIC - What null checkings do we want to have?
+# MAGIC - How do we map each model to its expected eos_text / bos_text format? [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF)
+# MAGIC - How do we automate tokenization for CPT? It is quite standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), then concatenate the sequences. [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF)
 # MAGIC ```

 # COMMAND ----------
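
Note (not part of the patch): the CPT tokenization scheme named in the second added question can be sketched as below. This is a minimal illustration assuming a Hugging Face tokenizer; the function name `tokenize_for_cpt`, the default tokenizer, and the sequence length are hypothetical placeholders, not the script's actual implementation.

```python
# Hypothetical sketch of the CPT scheme described above:
# concat(tok(BOS), tok(sequence), tok(EOS)) per sequence, then
# concatenate all sequences and chunk into fixed-length samples.
from typing import Iterable, List

from transformers import AutoTokenizer


def tokenize_for_cpt(sequences: Iterable[str],
                     tokenizer_name: str = 'mosaicml/mpt-7b',
                     max_length: int = 2048) -> List[List[int]]:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Some tokenizers (e.g. GPT-style) define no BOS; fall back to nothing.
    bos = [tokenizer.bos_token_id] if tokenizer.bos_token_id is not None else []
    eos = [tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else []

    # tok(BOS) + tok(sequence) + tok(EOS) for each sequence, concatenated
    # into one flat token stream.
    token_stream: List[int] = []
    for seq in sequences:
        ids = tokenizer(seq, add_special_tokens=False)['input_ids']
        token_stream.extend(bos + ids + eos)

    # Split the stream into fixed-length training samples, dropping the
    # trailing remainder shorter than max_length.
    return [token_stream[i:i + max_length]
            for i in range(0, len(token_stream) - max_length + 1, max_length)]
```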