# mcli-pretokenize-oci-upload.yaml
name: c4-2k-pre-tokenized
image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
compute:
  gpus: 8  # Number of GPUs to use
  ## These configurations are optional
  # cluster: TODO # Name of the cluster to use for this run
  # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments

integrations:
- integration_type: pip_packages
  # Install the OCI CLI, used for the bulk upload below
  packages:
    - oci-cli==3.23.2
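  # Note (an assumption, not part of the original file): the oci-cli also needs
  # OCI credentials in the run environment (e.g. an MCLI OCI secret or a mounted
  # ~/.oci/config) so the bulk-upload step below can authenticate.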
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  git_branch: v0.4.0
  # git_commit:  # OR use your commit hash
  pip_install: '.'
  ssh_clone: false  # Should be true if using a private repo

command: |
  cd llm-foundry/scripts/data_prep/

  # Run the dataset conversion
  python convert_dataset_hf.py \
    --dataset c4 --data_subset en \
    --out_root ./my-copy-c4 \
    --splits val_small val train_small train \
    --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
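
  # Optional sanity check (not part of the original script; paths assume the
  # --out_root above): list the MDS shards written by the conversion so a
  # failed or partial run is caught before spending time on the upload.
  ls -lh my-copy-c4/
  find my-copy-c4/ -name 'index.json'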

  # Upload the dataset to OCI
  oci os object bulk-upload \
    -bn YOUR_BUCKET_HERE --region BUCKET_REGION \
    --object-prefix c4/base/pretok-neox-2k/ \
    --src-dir my-copy-c4/
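
  # Optional follow-up (an assumption, not part of the original script): confirm
  # the objects landed under the expected prefix. YOUR_BUCKET_HERE and
  # BUCKET_REGION are placeholders for your own bucket and region.
  oci os object list \
    -bn YOUR_BUCKET_HERE --region BUCKET_REGION \
    --prefix c4/base/pretok-neox-2k/ --limit 10
  # A training config would then typically point at this data with a remote path
  # such as oci://YOUR_BUCKET_HERE/c4/base/pretok-neox-2k/train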