Skip to content

Commit

Permalink
Updated config file with GPT2 tokenized datasets in RCP
Browse files Browse the repository at this point in the history
  • Loading branch information
TJ-Solergibert committed Jul 16, 2024
1 parent 25ad39b commit d91f9e1
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions examples/config_multilingual_nanoset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ checkpoints:
data_stages:
- data:
dataset:
- dataset_folder: datasets/c4-es/tokenized
+ dataset_folder: /mloscratch/homes/solergib/nanotrove/nanotron/datasets/c4-es/tokenized
dataset_tokens:
- 15
num_loading_workers: 1
Expand All @@ -17,8 +17,8 @@ data_stages:
- data:
dataset:
dataset_folder:
- - datasets/SlimPajama-6B/tokenized
- - datasets/c4-es/tokenized
+ - /mloscratch/homes/solergib/nanotrove/nanotron/datasets/SlimPajama-6B/tokenized
+ - /mloscratch/homes/solergib/nanotrove/nanotron/datasets/c4-es/tokenized
dataset_tokens:
- 16
- 15
Expand All @@ -29,8 +29,8 @@ data_stages:
- data:
dataset:
dataset_folder:
- datasets/SlimPajama-6B/tokenized: 0.8
- datasets/c4-es/tokenized: 0.2
+ /mloscratch/homes/solergib/nanotrove/nanotron/datasets/SlimPajama-6B/tokenized: 0.8
+ /mloscratch/homes/solergib/nanotrove/nanotron/datasets/c4-es/tokenized: 0.2
dataset_tokens:
- 16
- 15
Expand Down Expand Up @@ -65,7 +65,7 @@ model:
initializer_range: 0.02
intermediate_size: 11008
is_llama_config: true
- max_position_embeddings: 4096
+ max_position_embeddings: 1024
num_hidden_layers: 32
num_attention_heads: 32
num_key_value_heads: 8
Expand Down Expand Up @@ -108,7 +108,7 @@ parallelism:
profiler: null
tokenizer:
tokenizer_max_length: null
- tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B
+ tokenizer_name_or_path: gpt2
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 1
Expand Down

0 comments on commit d91f9e1

Please sign in to comment.