From 52c40f2b948a09ced616737781c57f6aef84d149 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 30 Oct 2023 22:51:16 -0700 Subject: [PATCH] pr nit --- llmfoundry/data/finetuning/tasks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index ed9191a776..42a9c0da02 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -340,17 +340,18 @@ def dataset_mapper(example: Dict): return _tokenize_formatted_example(example, tokenizer) detected_cpu_count = os.cpu_count() or 1 + num_cpus_to_use = max(1, detected_cpu_count - 4) columns_to_remove = list(dataset[0].keys()) tokenized_dataset = dataset.map( dataset_mapper, batched=False, remove_columns=columns_to_remove, - num_proc=max(1, detected_cpu_count - 4), + num_proc=num_cpus_to_use, ) prompt_length_filtered_dataset = tokenized_dataset.filter( lambda example: len(example['input_ids']) < max_seq_len, - num_proc=max(1, detected_cpu_count - 4), + num_proc=num_cpus_to_use, ) examples_removed = len(tokenized_dataset) - len(