Skip to content

Commit

Permalink
pr nit
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg committed Oct 31, 2023
1 parent 95ff757 commit 52c40f2
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,17 +340,18 @@ def dataset_mapper(example: Dict):
return _tokenize_formatted_example(example, tokenizer)

detected_cpu_count = os.cpu_count() or 1
+ num_cpus_to_use = max(1, detected_cpu_count - 4)

columns_to_remove = list(dataset[0].keys())
tokenized_dataset = dataset.map(
dataset_mapper,
batched=False,
remove_columns=columns_to_remove,
- num_proc=max(1, detected_cpu_count - 4),
+ num_proc=num_cpus_to_use),
)
prompt_length_filtered_dataset = tokenized_dataset.filter(
lambda example: len(example['input_ids']) < max_seq_len,
- num_proc=max(1, detected_cpu_count - 4),
+ num_proc=num_cpus_to_use,
)

examples_removed = len(tokenized_dataset) - len(
Expand Down

0 comments on commit 52c40f2

Please sign in to comment.