From a6d54b779a4738b83151f92794f9fe957dbb66d3 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 25 May 2024 01:32:58 +0000 Subject: [PATCH] Finish up tokens --- scripts/data_prep/convert_text_to_mds.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 6c94798682..0696137b48 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -94,6 +94,12 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]: # Add the EOS token to the buffer to separate files. buffer += self.eos_tokens + # Finish up the last of the tokens. + while len(buffer) >= self.max_length: + concat_sample = buffer[:self.max_length] + buffer = buffer[self.max_length:] if self.should_wrap else [] + yield {'tokens': np.asarray(concat_sample).tobytes()} + def parse_args() -> Namespace: """Parse commandline arguments."""