Skip to content

Commit

Permalink
Finish up tokens
Browse files: browse the repository at this point in the history
  • Loading branch information
irenedea committed May 25, 2024
1 parent 4ada4b5 commit a6d54b7
Showing 1 changed file with 6 additions and 0 deletions.
6 changes: 6 additions & 0 deletions scripts/data_prep/convert_text_to_mds.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -94,6 +94,12 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]:
# Add the EOS token to the buffer to separate files.
buffer += self.eos_tokens

# Finish up the last of the tokens.
while len(buffer) >= self.max_length:
concat_sample = buffer[:self.max_length]
buffer = buffer[self.max_length:] if self.should_wrap else []
yield {'tokens': np.asarray(concat_sample).tobytes()}


def parse_args() -> Namespace:
"""Parse commandline arguments."""
Expand Down

0 comments on commit a6d54b7

Please sign in to comment.