
Commit

Merge pull request #455 from ngc92/fineweb
update fine-web preprocessing: faster, and much less RAM consumption
karpathy authored May 24, 2024
2 parents dee4e42 + b66eb66 commit b0f065f
Showing 2 changed files with 41 additions and 28 deletions.
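
Most of the RAM saving comes from accumulating shard tokens in a flat np.uint16 buffer instead of a growing Python list of boxed ints. A rough, illustrative comparison (not part of the commit; numbers are approximate and machine-dependent):

import sys
import numpy as np

n = 1_000_000
as_list = list(range(n))                 # Python list: 8-byte pointers to boxed int objects
as_np = np.arange(n, dtype=np.uint16)    # flat buffer: 2 bytes per token

list_bytes = sys.getsizeof(as_list) + sum(sys.getsizeof(t) for t in as_list)
print(f"list of ints : ~{list_bytes / 1e6:.0f} MB")   # roughly 30-40 MB
print(f"uint16 array : ~{as_np.nbytes / 1e6:.0f} MB") # 2 MB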
13 changes: 8 additions & 5 deletions dev/data/data_common.py
@@ -35,11 +35,14 @@ def write_datafile(filename, toks):
     header[0] = 20240520 # magic
     header[1] = 1 # version
     header[2] = len(toks) # number of tokens after the 256*4 bytes of header (each 2 bytes as uint16)
-    # validate that no token exceeds a uint16
-    maxtok = 2**16
-    assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16"
-    # construct the tokens numpy array
-    toks_np = np.array(toks, dtype=np.uint16)
+    # construct the tokens numpy array, if not already
+    if not isinstance(toks, np.ndarray) or not toks.dtype == np.uint16:
+        # validate that no token exceeds a uint16
+        maxtok = 2**16
+        assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16"
+        toks_np = np.array(toks, dtype=np.uint16)
+    else:
+        toks_np = toks
     # write to file
     print(f"writing {len(toks):,} tokens to {filename}")
     with open(filename, "wb") as f:
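In effect, write_datafile now accepts either a plain list of ints (validated and converted as before) or a ready-made np.uint16 array (written as-is), which is what the updated fineweb.py sends. A self-contained sketch of just that dispatch, reconstructed from the hunk above (the helper name to_uint16_tokens and the example values are illustrative, not part of the commit):

import numpy as np

def to_uint16_tokens(toks):
    # mirrors the new branch in write_datafile: convert only if needed
    if not isinstance(toks, np.ndarray) or not toks.dtype == np.uint16:
        # slow path: validate every token, then copy into a uint16 array
        maxtok = 2**16
        assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16"
        toks_np = np.array(toks, dtype=np.uint16)
    else:
        # fast path: the array is used directly, no per-token check and no copy
        toks_np = toks
    return toks_np

print(to_uint16_tokens([1, 2, 3, 50256]).dtype)    # uint16 (list went through validation)
arr = np.array([1, 2, 3, 50256], dtype=np.uint16)
print(to_uint16_tokens(arr) is arr)                # True (array passed through untouched)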
56 changes: 33 additions & 23 deletions dev/data/fineweb.py
@@ -52,39 +52,49 @@

 # helper functions
 def tokenize(doc):
-    return enc.encode_ordinary(doc["text"])
+    # validate tokens in individual threads
+    tokens = np.array([eot] + enc.encode_ordinary(doc["text"]))
+    assert (0 <= tokens).all() and (tokens < 2**16).all(), "token dictionary too large for uint16"
+    return tokens.astype(np.uint16)
 
+# don't hog the entire system
+nprocs = max(1, os.cpu_count() - 2)
+
 # main loop write files
-with mp.Pool() as pool:
+with mp.Pool(nprocs) as pool:
     shard_index = 0
-    all_tokens = []
+    # preallocate buffer to hold current shard
+    all_tokens_np = np.empty((args.shard_size,), dtype=np.uint16)
+    token_count = 0
     progress_bar = None
-    for tokens in pool.imap(tokenize, fw):
-
-        # record the tokens and make sure to separate documents
-        all_tokens.append(eot)
-        all_tokens.extend(tokens)
-
-        # update progress bar
-        if progress_bar is None:
-            progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}")
-        progress_bar.update(len(tokens))
+    for tokens in pool.imap(tokenize, fw, chunksize=16):
+        # enough space to add this document fully?
+        if token_count+len(tokens) < args.shard_size:
+            all_tokens_np[token_count:token_count+len(tokens)] = tokens
+            token_count += len(tokens)
 
-        # if we reach shard_size tokens, write shard to disk
-        if len(all_tokens) >= args.shard_size:
+            # update progress bar
+            if progress_bar is None:
+                progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}")
+            progress_bar.update(len(tokens))
+        else:
             split = "val" if shard_index == 0 else "train"
             filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin")
-            write_tokens = all_tokens[:args.shard_size]
-            rest_tokens = all_tokens[args.shard_size:]
-            write_datafile(filename, write_tokens)
+
+            # split the last document
+            remainder = args.shard_size - token_count
+            progress_bar.update(remainder)
+            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
+            write_datafile(filename, all_tokens_np)
             shard_index += 1
             progress_bar = None
-            # note: create a copy so Python can free the all_tokens memory above
-            # the list rest_tokens is expected to be very small
-            all_tokens = [t for t in rest_tokens]
+
+            # populate the next shard with the leftovers of the current doc
+            all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
+            token_count = len(tokens)-remainder
 
     # write any remaining tokens as the last shard
-    if len(all_tokens) > 0:
+    if token_count != 0:
         split = "val" if shard_index == 0 else "train"
         filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin")
-        write_datafile(filename, all_tokens)
+        write_datafile(filename, all_tokens_np[:token_count])
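
As a sanity check on the new sharding logic, here is a toy-scale sketch of the same preallocated-buffer bookkeeping (shard_size, the fake documents, and the list standing in for write_datafile are all made up for illustration): a document that straddles a shard boundary is split, its first part completes the current shard, and the leftover seeds the next one.

import numpy as np

shard_size = 10                                   # toy value; the script uses args.shard_size
docs = [np.arange(4, dtype=np.uint16),
        np.arange(7, dtype=np.uint16),            # 4 + 7 > 10: straddles the first shard boundary
        np.arange(5, dtype=np.uint16)]

shards = []                                       # stand-in for write_datafile
buf = np.empty((shard_size,), dtype=np.uint16)    # preallocated shard buffer
count = 0
for tokens in docs:
    if count + len(tokens) < shard_size:
        buf[count:count + len(tokens)] = tokens   # document fits entirely
        count += len(tokens)
    else:
        remainder = shard_size - count            # how much of this document still fits
        buf[count:count + remainder] = tokens[:remainder]
        shards.append(buf.copy())                 # "write" the full shard
        buf[0:len(tokens) - remainder] = tokens[remainder:]  # leftovers seed the next shard
        count = len(tokens) - remainder
if count != 0:
    shards.append(buf[:count].copy())             # final partial shard

print([len(s) for s in shards])                   # [10, 6]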
