diff --git a/scripts/train/benchmarking/README.md b/scripts/train/benchmarking/README.md index 7164e93bd8..1bbf399e88 100644 --- a/scripts/train/benchmarking/README.md +++ b/scripts/train/benchmarking/README.md @@ -69,176 +69,218 @@ Our microbatching engine enables microbatch sizes that do not divde Global Batch [comment]: # (TODO: Update tables with torch 2.0 after next Composer release) +## H100 80GB BF16 +| Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | Model TFLOP | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 70b | 2048 | 64 | h100_80gb | 42.57 | 56.76 | 421 | 8 | 4 | 2048 | 32 | 66523 | 1039 | 4194304 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 64862437376 | +| 70b | 2048 | 32 | h100_80gb | 36.15 | 48.2 | 357 | 2 | 16 | 1024 | 13 | 28242 | 882 | 2097152 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 64862437376 | +| 30b | 8192 | 8 | h100_80gb | 29.92 | 39.9 | 296 | 1 | 21 | 168 | 1 | 11072 | 1384 | 1376256 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 30019254272 | +| 30b | 4096 | 8 | h100_80gb | 35.86 | 47.81 | 354 | 1 | 21 | 168 | 3 | 14419 | 1802 | 688128 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29989894144 | +| 30b | 2048 | 32 | h100_80gb | 43.92 | 58.57 | 434 | 14 | 3 | 1344 | 36 | 73860 | 2308 | 2752512 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29975214080 | +| 30b | 2048 | 16 | h100_80gb | 43.07 | 57.42 | 426 | 10 | 3 | 480 | 17 | 36209 | 2263 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29975214080 | +| 30b | 2048 | 8 | h100_80gb | 38.11 | 50.82 | 377 | 3 | 21 | 504 | 7 | 16022 | 2002 | 1032192 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29975214080 | +| 30b | 1024 | 8 | h100_80gb | 
38.76 | 51.68 | 383 | 6 | 21 | 1008 | 16 | 16672 | 2084 | 1032192 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29967874048 | +| 13b | 32768 | 8 | h100_80gb | 31.68 | 42.24 | 313 | 1 | 3 | 24 | 0 | 15812 | 1976 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 13011240960 | +| 13b | 16384 | 8 | h100_80gb | 35.55 | 47.4 | 351 | 3 | 3 | 72 | 1 | 23881 | 2985 | 1179648 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 12927354880 | +| 13b | 4096 | 8 | h100_80gb | 41.6 | 55.47 | 411 | 10 | 3 | 240 | 9 | 37740 | 4717 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 12864440320 | +| 13b | 2048 | 64 | h100_80gb | 39.86 | 39.86 | 394 | 2 | 1 | 128 | 150 | 307209 | 4800 | 262144 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 12853954560 | +| 13b | 2048 | 32 | h100_80gb | 39.95 | 39.95 | 395 | 2 | 1 | 64 | 75 | 153960 | 4811 | 131072 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 12853954560 | +| 13b | 2048 | 16 | h100_80gb | 39.58 | 39.58 | 391 | 2 | 1 | 32 | 37 | 76280 | 4767 | 65536 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 12853954560 | +| 13b | 2048 | 8 | h100_80gb | 39.79 | 39.79 | 393 | 2 | 1 | 16 | 18 | 38336 | 4792 | 32768 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 12853954560 | +| 13b | 1024 | 8 | h100_80gb | 44.27 | 59.03 | 438 | 40 | 3 | 960 | 42 | 44019 | 5502 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 12848711680 | +| 7b | 65536 | 8 | h100_80gb | 28.59 | 38.13 | 282 | 1 | 2 | 16 | 0 | 15654 | 1956 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6918905856 | +| 7b | 32768 | 8 | h100_80gb | 30.94 | 41.25 | 306 | 2 | 2 | 32 | 0 | 26550 | 3318 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6784688128 | +| 7b | 8192 | 8 | h100_80gb | 37.14 | 49.52 | 367 | 8 | 2 | 128 | 6 | 55481 | 6935 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6684024832 | +| 7b | 4096 | 8 | h100_80gb | 40.42 | 53.9 | 399 | 16 | 2 | 256 | 16 | 68893 | 8611 | 1048576 | amp_bf16 | 
DEFAULT | FULL_SHARD | True | False | 6667247616 | +| 7b | 2048 | 8 | h100_80gb | 46.44 | 46.44 | 459 | 6 | 1 | 48 | 41 | 85144 | 10643 | 98304 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 6658859008 | +| 7b | 1024 | 8 | h100_80gb | 42.83 | 57.11 | 423 | 64 | 2 | 1024 | 79 | 81628 | 10203 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6654664704 | +| 3b | 65536 | 8 | h100_80gb | 26.81 | 35.74 | 265 | 1 | 2 | 16 | 0 | 26099 | 3262 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 2814366720 | +| 3b | 32768 | 8 | h100_80gb | 28.84 | 38.46 | 285 | 3 | 6 | 144 | 1 | 46984 | 5873 | 4718592 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 2730480640 | +| 3b | 16384 | 8 | h100_80gb | 36.34 | 36.34 | 359 | 1 | 6 | 48 | 5 | 89223 | 11152 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2688537600 | +| 3b | 8192 | 8 | h100_80gb | 40.31 | 40.31 | 398 | 3 | 6 | 144 | 16 | 132626 | 16578 | 1179648 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2667566080 | +| 3b | 4096 | 8 | h100_80gb | 42.31 | 42.31 | 418 | 5 | 6 | 240 | 40 | 167712 | 20964 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2657080320 | +| 3b | 2048 | 64 | h100_80gb | 40.8 | 40.8 | 403 | 6 | 3 | 1152 | 703 | 1441663 | 22525 | 2359296 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 32 | h100_80gb | 41.7 | 41.7 | 412 | 6 | 3 | 576 | 359 | 736701 | 23021 | 1179648 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 16 | h100_80gb | 43.73 | 43.73 | 432 | 10 | 3 | 480 | 188 | 386285 | 24142 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2651837440 | +| 3b | 1024 | 8 | h100_80gb | 46.2 | 46.2 | 457 | 20 | 6 | 960 | 211 | 216369 | 27046 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2649216000 | +| 3b | 512 | 8 | h100_80gb | 46.32 | 46.32 | 458 | 40 | 6 | 1920 | 436 | 223721 | 27965 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2647905280 | +| 1b | 65536 | 
8 | h100_80gb | 26.34 | 35.12 | 260 | 1 | 2 | 16 | 0 | 44050 | 5506 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 1445974016 | +| 1b | 32768 | 8 | h100_80gb | 33.54 | 33.54 | 331 | 1 | 4 | 32 | 2 | 96203 | 12025 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1378865152 | +| 1b | 16384 | 8 | h100_80gb | 35.22 | 35.22 | 348 | 2 | 4 | 64 | 9 | 157194 | 19649 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1345310720 | +| 1b | 8192 | 8 | h100_80gb | 37.73 | 37.73 | 373 | 3 | 4 | 96 | 28 | 233256 | 29157 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1328533504 | +| 1b | 4096 | 8 | h100_80gb | 40.26 | 40.26 | 398 | 7 | 4 | 224 | 75 | 308282 | 38535 | 917504 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1320144896 | +| 1b | 2048 | 64 | h100_80gb | 40.85 | 40.85 | 404 | 20 | 1 | 1280 | 1387 | 2841754 | 44402 | 2621440 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1315950592 | +| 1b | 2048 | 32 | h100_80gb | 41.52 | 41.52 | 410 | 20 | 1 | 640 | 705 | 1444183 | 45130 | 1310720 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1315950592 | +| 1b | 2048 | 16 | h100_80gb | 42.36 | 42.36 | 419 | 20 | 1 | 320 | 359 | 736596 | 46037 | 655360 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1315950592 | +| 1b | 2048 | 8 | h100_80gb | 41.82 | 41.82 | 413 | 14 | 1 | 112 | 177 | 363645 | 45455 | 229376 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1315950592 | +| 1b | 1024 | 8 | h100_80gb | 41.95 | 41.95 | 415 | 18 | 4 | 576 | 382 | 391287 | 48910 | 589824 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1313853440 | +| 1b | 512 | 8 | h100_80gb | 43.21 | 43.21 | 427 | 56 | 4 | 1792 | 816 | 418201 | 52275 | 917504 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1312804864 | +| 760m | 32768 | 8 | h100_80gb | 31.84 | 31.84 | 315 | 1 | 2 | 16 | 3 | 130333 | 16291 | 524288 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 807656448 | +| 760m | 16384 | 8 | h100_80gb | 33.57 | 33.57 | 332 | 3 | 2 | 48 
| 13 | 222521 | 27815 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 782490624 | +| 760m | 8192 | 8 | h100_80gb | 34.84 | 34.84 | 344 | 6 | 2 | 96 | 40 | 334602 | 41825 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 769907712 | +| 760m | 4096 | 8 | h100_80gb | 35.83 | 35.83 | 354 | 12 | 2 | 192 | 108 | 443674 | 55459 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 763616256 | +| 760m | 2048 | 32 | h100_80gb | 37.57 | 37.57 | 371 | 24 | 1 | 768 | 1062 | 2175091 | 67971 | 1572864 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 760470528 | +| 760m | 2048 | 16 | h100_80gb | 37.89 | 37.89 | 374 | 24 | 1 | 384 | 535 | 1096819 | 68551 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 760470528 | +| 760m | 2048 | 8 | h100_80gb | 34.9 | 34.9 | 345 | 24 | 2 | 384 | 246 | 505177 | 63147 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 760470528 | +| 760m | 1024 | 8 | h100_80gb | 39.76 | 39.76 | 393 | 48 | 2 | 768 | 613 | 628648 | 78581 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 758897664 | +| 760m | 512 | 8 | h100_80gb | 40.42 | 40.42 | 399 | 96 | 2 | 1536 | 1308 | 669998 | 83749 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 758111232 | + +## H100 80GB FP8 +| Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | Model TFLOP | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 3b | 32768 | 8 | h100_80gb | 14.38 | 19.18 | 284 | 3 | 6 | 144 | 1 | 46853 | 5856 | 4718592 | amp_fp8 | DEFAULT | FULL_SHARD | True | False | 2730480640 | +| 3b | 8192 | 8 | h100_80gb | 23.28 | 23.28 | 460 | 3 | 6 | 144 | 18 | 153174 | 19146 | 1179648 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 
2667566080 | +| 3b | 2048 | 8 | h100_80gb | 27.7 | 27.7 | 548 | 10 | 6 | 480 | 119 | 244692 | 30586 | 983040 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2651837440 | +| 3b | 512 | 8 | h100_80gb | 30.25 | 30.25 | 598 | 40 | 6 | 1920 | 570 | 292217 | 36527 | 983040 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 2647905280 | +| 1b | 32768 | 8 | h100_80gb | 17.55 | 17.55 | 347 | 1 | 4 | 32 | 3 | 100643 | 12580 | 1048576 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1378865152 | +| 1b | 8192 | 8 | h100_80gb | 20.71 | 20.71 | 409 | 2 | 4 | 64 | 31 | 256087 | 32010 | 524288 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1328533504 | +| 1b | 512 | 8 | h100_80gb | 29.06 | 29.06 | 575 | 56 | 4 | 1792 | 1098 | 562523 | 70315 | 917504 | amp_fp8 | DEFAULT | FULL_SHARD | False | False | 1312804864 | + ## A100 80GB with 1600 Gbps node-node interconnect (RoCE) -| Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | +| Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | Model TFLOP | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 70b | 2048 | 64 | a100_80gb | 53.33 | 71.1 | 8 | 4 | 2048 | 12 | 26274 | 410 | 4194304 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | -| 70b | 2048 | 32 | a100_80gb | 48.56 | 64.75 | 2 | 16 | 1024 | 5 | 11962 | 373 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | -| 30b | 8192 | 8 | a100_80gb | 42.66 | 56.89 | 1 | 21 | 168 | 0 | 4977 | 622 | 1376256 | bf16 | PURE | FULL_SHARD | True | 
False | 30019254272 | -| 30b | 4096 | 8 | a100_80gb | 49.12 | 65.49 | 1 | 21 | 168 | 1 | 6227 | 778 | 688128 | bf16 | PURE | FULL_SHARD | True | False | 29989894144 | -| 30b | 2048 | 64 | a100_80gb | 52.93 | 70.57 | 16 | 3 | 3072 | 27 | 56126 | 876 | 6291456 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | -| 30b | 2048 | 32 | a100_80gb | 53.48 | 71.3 | 14 | 3 | 1344 | 13 | 28353 | 886 | 2752512 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | -| 30b | 2048 | 16 | a100_80gb | 53.4 | 71.2 | 10 | 3 | 480 | 6 | 14157 | 884 | 983040 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | -| 30b | 2048 | 8 | a100_80gb | 47.57 | 63.43 | 3 | 21 | 504 | 3 | 6305 | 788 | 1032192 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | -| 30b | 1024 | 8 | a100_80gb | 51.69 | 68.92 | 6 | 21 | 1008 | 6 | 7010 | 876 | 1032192 | bf16 | PURE | FULL_SHARD | True | False | 29967874048 | -| 30b | 512 | 8 | a100_80gb | 49.23 | 65.63 | 12 | 21 | 2016 | 13 | 6754 | 844 | 1032192 | bf16 | PURE | FULL_SHARD | True | False | 29964204032 | -| 13b | 32768 | 8 | a100_80gb | 49.53 | 66.04 | 1 | 3 | 24 | 0 | 7795 | 974 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 13011240960 | -| 13b | 16384 | 8 | a100_80gb | 51.71 | 68.94 | 3 | 3 | 72 | 0 | 10953 | 1369 | 1179648 | bf16 | PURE | FULL_SHARD | True | False | 12927354880 | -| 13b | 8192 | 8 | a100_80gb | 52.83 | 70.44 | 5 | 3 | 120 | 1 | 13531 | 1691 | 983040 | bf16 | PURE | FULL_SHARD | True | False | 12885411840 | -| 13b | 4096 | 8 | a100_80gb | 53.62 | 71.5 | 10 | 3 | 240 | 3 | 15339 | 1917 | 983040 | bf16 | PURE | FULL_SHARD | True | False | 12864440320 | -| 13b | 2048 | 64 | a100_80gb | 52.51 | 70.01 | 32 | 1 | 2048 | 62 | 127624 | 1994 | 4194304 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 2048 | 32 | a100_80gb | 52.86 | 70.48 | 32 | 1 | 1024 | 31 | 64241 | 2007 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 2048 | 16 | a100_80gb | 53.14 | 70.86 | 24 | 1 | 384 
| 15 | 32291 | 2018 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 2048 | 8 | a100_80gb | 54.38 | 72.51 | 20 | 3 | 480 | 8 | 16522 | 2065 | 983040 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 1024 | 8 | a100_80gb | 55.23 | 73.63 | 40 | 3 | 960 | 16 | 17315 | 2164 | 983040 | bf16 | PURE | FULL_SHARD | True | False | 12848711680 | -| 13b | 512 | 8 | a100_80gb | 54.99 | 73.32 | 80 | 3 | 1920 | 34 | 17521 | 2190 | 983040 | bf16 | PURE | FULL_SHARD | True | False | 12846090240 | -| 7b | 65536 | 8 | a100_80gb | 42.61 | 56.82 | 1 | 2 | 16 | 0 | 7355 | 919 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6918905856 | -| 7b | 32768 | 8 | a100_80gb | 48.18 | 64.24 | 2 | 2 | 32 | 0 | 13035 | 1629 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6784688128 | -| 7b | 16384 | 8 | a100_80gb | 49.5 | 66.0 | 4 | 2 | 64 | 1 | 18698 | 2337 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6717579264 | -| 7b | 8192 | 8 | a100_80gb | 50.71 | 67.62 | 8 | 2 | 128 | 2 | 23887 | 2985 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6684024832 | -| 7b | 4096 | 8 | a100_80gb | 52.05 | 69.4 | 16 | 2 | 256 | 6 | 27973 | 3496 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6667247616 | -| 7b | 2048 | 64 | a100_80gb | 50.8 | 67.73 | 32 | 1 | 2048 | 114 | 234932 | 3670 | 4194304 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 2048 | 32 | a100_80gb | 51.16 | 68.22 | 32 | 1 | 1024 | 57 | 118310 | 3697 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 2048 | 16 | a100_80gb | 51.59 | 68.79 | 32 | 1 | 512 | 29 | 59653 | 3728 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 2048 | 8 | a100_80gb | 52.92 | 70.56 | 32 | 2 | 512 | 14 | 30596 | 3824 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 1024 | 8 | a100_80gb | 53.66 | 71.55 | 64 | 2 | 1024 | 31 | 32243 | 4030 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6654664704 | -| 7b 
| 512 | 8 | a100_80gb | 53.5 | 71.34 | 128 | 2 | 2048 | 64 | 32794 | 4099 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 6652567552 | -| 3b | 65536 | 8 | a100_80gb | 46.17 | 61.57 | 1 | 2 | 16 | 0 | 14174 | 1771 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 2814366720 | -| 3b | 32768 | 8 | a100_80gb | 46.73 | 62.31 | 3 | 6 | 144 | 0 | 24003 | 3000 | 4718592 | bf16 | PURE | FULL_SHARD | True | False | 2730480640 | -| 3b | 16384 | 8 | a100_80gb | 57.29 | 57.29 | 1 | 6 | 48 | 2 | 44356 | 5544 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 2688537600 | -| 3b | 8192 | 8 | a100_80gb | 58.68 | 58.68 | 3 | 6 | 144 | 7 | 60883 | 7610 | 1179648 | bf16 | PURE | FULL_SHARD | False | False | 2667566080 | -| 3b | 4096 | 8 | a100_80gb | 59.51 | 59.51 | 5 | 6 | 240 | 18 | 74388 | 9298 | 983040 | bf16 | PURE | FULL_SHARD | False | False | 2657080320 | -| 3b | 2048 | 64 | a100_80gb | 58.36 | 58.36 | 12 | 3 | 2304 | 317 | 650175 | 10158 | 4718592 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 2048 | 32 | a100_80gb | 59.22 | 59.22 | 12 | 3 | 1152 | 161 | 329856 | 10308 | 2359296 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 2048 | 16 | a100_80gb | 59.08 | 59.08 | 10 | 3 | 480 | 80 | 164543 | 10283 | 983040 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 2048 | 8 | a100_80gb | 59.77 | 59.77 | 10 | 6 | 480 | 40 | 83230 | 10403 | 983040 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 1024 | 8 | a100_80gb | 61.56 | 61.56 | 20 | 6 | 960 | 88 | 90906 | 11363 | 983040 | bf16 | PURE | FULL_SHARD | False | False | 2649216000 | -| 3b | 512 | 8 | a100_80gb | 62.09 | 62.09 | 40 | 6 | 1920 | 184 | 94553 | 11819 | 983040 | bf16 | PURE | FULL_SHARD | False | False | 2647905280 | -| 1b | 65536 | 8 | a100_80gb | 45.29 | 60.39 | 1 | 2 | 16 | 0 | 23885 | 2985 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 1445974016 | -| 1b | 32768 | 8 | a100_80gb | 56.02 | 56.02 | 1 | 4 | 32 | 1 | 50657 | 6332 | 
1048576 | bf16 | PURE | FULL_SHARD | False | False | 1378865152 | -| 1b | 16384 | 8 | a100_80gb | 55.84 | 55.84 | 2 | 4 | 64 | 4 | 78591 | 9823 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1345310720 | -| 1b | 8192 | 8 | a100_80gb | 56.38 | 56.38 | 3 | 4 | 96 | 13 | 109915 | 13739 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 1328533504 | -| 1b | 4096 | 8 | a100_80gb | 58.3 | 58.3 | 7 | 4 | 224 | 34 | 140767 | 17595 | 917504 | bf16 | PURE | FULL_SHARD | False | False | 1320144896 | -| 1b | 2048 | 64 | a100_80gb | 56.67 | 56.67 | 20 | 1 | 1280 | 606 | 1243103 | 19423 | 2621440 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 32 | a100_80gb | 56.74 | 56.74 | 20 | 1 | 640 | 303 | 622285 | 19446 | 1310720 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 16 | a100_80gb | 57.47 | 57.47 | 20 | 1 | 320 | 153 | 315117 | 19694 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 8 | a100_80gb | 59.16 | 59.16 | 14 | 4 | 448 | 79 | 162214 | 20276 | 917504 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 1024 | 8 | a100_80gb | 58.98 | 58.98 | 18 | 4 | 576 | 169 | 173458 | 21682 | 589824 | bf16 | PURE | FULL_SHARD | False | False | 1313853440 | -| 1b | 512 | 8 | a100_80gb | 60.38 | 60.38 | 56 | 4 | 1792 | 359 | 184268 | 23033 | 917504 | bf16 | PURE | FULL_SHARD | False | False | 1312804864 | -| 760m | 65536 | 8 | a100_80gb | 45.48 | 60.64 | 1 | 2 | 16 | 0 | 33252 | 4156 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 857988096 | -| 760m | 32768 | 8 | a100_80gb | 54.48 | 54.48 | 1 | 2 | 16 | 2 | 70305 | 8788 | 524288 | bf16 | PURE | FULL_SHARD | False | False | 807656448 | -| 760m | 16384 | 8 | a100_80gb | 55.21 | 55.21 | 3 | 2 | 48 | 7 | 115383 | 14422 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 782490624 | -| 760m | 8192 | 8 | a100_80gb | 55.13 | 55.13 | 6 | 2 | 96 | 20 | 166928 | 20866 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 769907712 | 
-| 760m | 4096 | 8 | a100_80gb | 55.2 | 55.2 | 12 | 2 | 192 | 52 | 215501 | 26937 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 763616256 | -| 760m | 2048 | 64 | a100_80gb | 51.82 | 51.82 | 24 | 1 | 1536 | 923 | 1892166 | 29565 | 3145728 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 32 | a100_80gb | 53.27 | 53.27 | 24 | 1 | 768 | 474 | 972497 | 30390 | 1572864 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 16 | a100_80gb | 53.56 | 53.56 | 24 | 1 | 384 | 238 | 488871 | 30554 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 8 | a100_80gb | 55.67 | 55.67 | 24 | 2 | 384 | 124 | 254104 | 31763 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 1024 | 8 | a100_80gb | 55.98 | 55.98 | 48 | 2 | 768 | 272 | 279108 | 34888 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 758897664 | -| 760m | 512 | 8 | a100_80gb | 56.2 | 56.2 | 96 | 2 | 1536 | 573 | 293755 | 36719 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 758111232 | -| 350m | 65536 | 8 | a100_80gb | 52.39 | 52.39 | 1 | 2 | 16 | 0 | 59835 | 7479 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 420997120 | -| 350m | 32768 | 8 | a100_80gb | 47.45 | 47.45 | 2 | 2 | 32 | 3 | 98793 | 12349 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 387442688 | -| 350m | 16384 | 8 | a100_80gb | 53.01 | 53.01 | 4 | 2 | 64 | 11 | 187535 | 23441 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 370665472 | -| 350m | 8192 | 8 | a100_80gb | 53.21 | 53.21 | 8 | 2 | 128 | 35 | 289398 | 36174 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 362276864 | -| 350m | 4096 | 8 | a100_80gb | 52.46 | 52.46 | 16 | 2 | 256 | 95 | 390131 | 48766 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 358082560 | -| 350m | 2048 | 64 | a100_80gb | 47.76 | 47.76 | 32 | 1 | 2048 | 1699 | 3480601 | 54384 | 4194304 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 32 | a100_80gb | 
48.58 | 48.58 | 32 | 1 | 1024 | 864 | 1770287 | 55321 | 2097152 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 16 | a100_80gb | 50.53 | 50.53 | 32 | 1 | 512 | 449 | 920605 | 57537 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 8 | a100_80gb | 51.73 | 51.73 | 32 | 2 | 512 | 230 | 471290 | 58911 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 1024 | 8 | a100_80gb | 51.28 | 51.28 | 64 | 2 | 1024 | 514 | 526393 | 65799 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 354936832 | -| 350m | 512 | 8 | a100_80gb | 51.18 | 51.18 | 128 | 2 | 2048 | 1095 | 560858 | 70107 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 354412544 | -| 125m | 65536 | 8 | a100_80gb | 54.31 | 54.31 | 1 | 2 | 16 | 2 | 163472 | 20434 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 174070272 | -| 125m | 32768 | 8 | a100_80gb | 53.15 | 53.15 | 2 | 2 | 32 | 8 | 293685 | 36710 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 148904448 | -| 125m | 16384 | 8 | a100_80gb | 51.58 | 51.58 | 4 | 2 | 64 | 29 | 489578 | 61197 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 136321536 | -| 125m | 8192 | 8 | a100_80gb | 49.18 | 49.18 | 8 | 2 | 128 | 88 | 727986 | 90998 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 130030080 | -| 125m | 4096 | 8 | a100_80gb | 46.62 | 46.62 | 16 | 2 | 256 | 233 | 958343 | 119792 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 126884352 | -| 125m | 2048 | 64 | a100_80gb | 40.77 | 40.77 | 32 | 1 | 2048 | 4063 | 8321727 | 130026 | 4194304 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 32 | a100_80gb | 41.22 | 41.22 | 32 | 1 | 1024 | 2053 | 4206041 | 131438 | 2097152 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 16 | a100_80gb | 41.92 | 41.92 | 32 | 1 | 512 | 1044 | 2139036 | 133689 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_80gb | 44.04 | 
44.04 | 32 | 2 | 512 | 548 | 1123506 | 140438 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 1024 | 8 | a100_80gb | 43.25 | 43.25 | 64 | 2 | 1024 | 1225 | 1254561 | 156820 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 124525056 | -| 125m | 512 | 8 | a100_80gb | 42.54 | 42.54 | 128 | 2 | 2048 | 2587 | 1325030 | 165628 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 124131840 | +| 70b | 2048 | 64 | a100_80gb | 53.33 | 71.1 | 166 | 8 | 4 | 2048 | 12 | 26274 | 410 | 4194304 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | +| 70b | 2048 | 32 | a100_80gb | 48.56 | 64.75 | 151 | 2 | 16 | 1024 | 5 | 11962 | 373 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | +| 30b | 8192 | 8 | a100_80gb | 39.38 | 52.5 | 122 | 1 | 21 | 168 | 0 | 4594 | 574 | 1376256 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 30019254272 | +| 30b | 4096 | 8 | a100_80gb | 51.37 | 68.49 | 160 | 1 | 21 | 168 | 1 | 6513 | 814 | 688128 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29989894144 | +| 30b | 2048 | 8 | a100_80gb | 55.3 | 73.74 | 172 | 3 | 21 | 504 | 3 | 7330 | 916 | 1032192 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29975214080 | +| 30b | 1024 | 8 | a100_80gb | 55.82 | 74.43 | 174 | 6 | 21 | 1008 | 7 | 7571 | 946 | 1032192 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29967874048 | +| 30b | 512 | 8 | a100_80gb | 56.4 | 75.2 | 175 | 12 | 21 | 2016 | 15 | 7739 | 967 | 1032192 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 29964204032 | +| 13b | 32768 | 8 | a100_80gb | 51.69 | 68.92 | 161 | 1 | 3 | 24 | 0 | 8134 | 1016 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 13011240960 | +| 13b | 16384 | 8 | a100_80gb | 54.07 | 72.1 | 168 | 3 | 3 | 72 | 0 | 11454 | 1431 | 1179648 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 12927354880 | +| 13b | 8192 | 8 | a100_80gb | 56.07 | 74.76 | 174 | 5 | 3 | 120 | 1 | 14362 | 1795 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 
12885411840 | +| 13b | 4096 | 8 | a100_80gb | 57.62 | 76.82 | 179 | 10 | 3 | 240 | 4 | 16482 | 2060 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 12864440320 | +| 13b | 2048 | 8 | a100_80gb | 59.57 | 59.57 | 185 | 2 | 3 | 48 | 8 | 18097 | 2262 | 98304 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 12853954560 | +| 13b | 1024 | 8 | a100_80gb | 59.48 | 79.3 | 185 | 40 | 3 | 960 | 18 | 18647 | 2330 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 12848711680 | +| 7b | 65536 | 8 | a100_80gb | 46.97 | 62.63 | 146 | 1 | 2 | 16 | 0 | 8108 | 1013 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6918905856 | +| 7b | 32768 | 8 | a100_80gb | 49.46 | 65.94 | 154 | 2 | 2 | 32 | 0 | 13382 | 1672 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6784688128 | +| 7b | 16384 | 8 | a100_80gb | 51.96 | 69.28 | 162 | 4 | 2 | 64 | 1 | 19629 | 2453 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6717579264 | +| 7b | 8192 | 8 | a100_80gb | 54.47 | 72.62 | 169 | 8 | 2 | 128 | 3 | 25655 | 3206 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6684024832 | +| 7b | 4096 | 8 | a100_80gb | 54.84 | 73.12 | 171 | 16 | 2 | 256 | 7 | 29472 | 3684 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6667247616 | +| 7b | 2048 | 8 | a100_80gb | 64.23 | 64.23 | 200 | 6 | 2 | 96 | 18 | 37130 | 4641 | 196608 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 6658859008 | +| 7b | 1024 | 8 | a100_80gb | 58.01 | 77.35 | 180 | 64 | 2 | 1024 | 34 | 34857 | 4357 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 6654664704 | +| 3b | 65536 | 8 | a100_80gb | 46.05 | 61.41 | 143 | 1 | 2 | 16 | 0 | 14137 | 1767 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 2814366720 | +| 3b | 32768 | 8 | a100_80gb | 47.18 | 62.91 | 147 | 3 | 6 | 144 | 0 | 24235 | 3029 | 4718592 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 2730480640 | +| 3b | 16384 | 8 | a100_80gb | 57.13 | 57.13 | 178 | 1 | 6 | 48 | 2 | 44233 | 
5529 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2688537600 | +| 3b | 8192 | 8 | a100_80gb | 59.34 | 59.34 | 185 | 3 | 6 | 144 | 7 | 61567 | 7695 | 1179648 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2667566080 | +| 3b | 4096 | 8 | a100_80gb | 60.53 | 60.53 | 188 | 5 | 6 | 240 | 18 | 75658 | 9457 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2657080320 | +| 3b | 2048 | 8 | a100_80gb | 62.11 | 62.11 | 193 | 10 | 2 | 160 | 42 | 86491 | 10811 | 327680 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2651837440 | +| 3b | 1024 | 8 | a100_80gb | 62.73 | 62.73 | 195 | 20 | 6 | 960 | 90 | 92643 | 11580 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2649216000 | +| 3b | 512 | 8 | a100_80gb | 63.71 | 63.71 | 198 | 40 | 6 | 1920 | 189 | 97019 | 12127 | 983040 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 2647905280 | +| 1b | 65536 | 8 | a100_80gb | 46.18 | 61.57 | 144 | 1 | 2 | 16 | 0 | 24353 | 3044 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 1445974016 | +| 1b | 32768 | 8 | a100_80gb | 55.52 | 55.52 | 173 | 1 | 4 | 32 | 1 | 50207 | 6275 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1378865152 | +| 1b | 16384 | 8 | a100_80gb | 56.6 | 56.6 | 176 | 2 | 4 | 64 | 4 | 79650 | 9956 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1345310720 | +| 1b | 8192 | 8 | a100_80gb | 56.69 | 56.69 | 176 | 3 | 4 | 96 | 13 | 110516 | 13814 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1328533504 | +| 1b | 4096 | 8 | a100_80gb | 59.0 | 59.0 | 184 | 7 | 4 | 224 | 34 | 142457 | 17807 | 917504 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1320144896 | +| 1b | 2048 | 8 | a100_80gb | 59.86 | 59.86 | 186 | 14 | 4 | 448 | 80 | 164109 | 20513 | 917504 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1315950592 | +| 1b | 1024 | 8 | a100_80gb | 60.15 | 60.15 | 187 | 18 | 4 | 576 | 172 | 176898 | 22112 | 589824 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1313853440 | 
+| 1b | 512 | 8 | a100_80gb | 60.68 | 60.68 | 189 | 56 | 4 | 1792 | 361 | 185186 | 23148 | 917504 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 1312804864 | +| 760m | 65536 | 8 | a100_80gb | 45.34 | 60.45 | 141 | 1 | 2 | 16 | 0 | 33150 | 4143 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | True | False | 857988096 | +| 760m | 32768 | 8 | a100_80gb | 54.57 | 54.57 | 170 | 1 | 2 | 16 | 2 | 70417 | 8802 | 524288 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 807656448 | +| 760m | 16384 | 8 | a100_80gb | 54.64 | 54.64 | 170 | 3 | 2 | 48 | 6 | 114198 | 14274 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 782490624 | +| 760m | 8192 | 8 | a100_80gb | 55.31 | 55.31 | 172 | 6 | 2 | 96 | 20 | 167471 | 20933 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 769907712 | +| 760m | 4096 | 8 | a100_80gb | 56.05 | 56.05 | 174 | 12 | 2 | 192 | 53 | 218808 | 27351 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 763616256 | +| 760m | 2048 | 8 | a100_80gb | 56.85 | 56.85 | 177 | 24 | 2 | 384 | 126 | 259472 | 32434 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 760470528 | +| 760m | 1024 | 8 | a100_80gb | 47.76 | 47.76 | 149 | 48 | 2 | 768 | 232 | 238122 | 29765 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 758897664 | +| 760m | 512 | 8 | a100_80gb | 45.07 | 45.07 | 140 | 96 | 2 | 1536 | 460 | 235571 | 29446 | 786432 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 758111232 | +| 350m | 65536 | 8 | a100_80gb | 52.7 | 52.7 | 164 | 1 | 2 | 16 | 0 | 60195 | 7524 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 420997120 | +| 350m | 32768 | 8 | a100_80gb | 52.46 | 52.46 | 163 | 2 | 2 | 32 | 3 | 109222 | 13652 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 387442688 | +| 350m | 16384 | 8 | a100_80gb | 53.28 | 53.28 | 166 | 4 | 2 | 64 | 11 | 188478 | 23559 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 370665472 | +| 350m | 8192 | 8 | a100_80gb | 53.8 | 53.8 | 167 | 8 | 2 
| 128 | 35 | 292559 | 36569 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 362276864 | +| 350m | 4096 | 8 | a100_80gb | 53.31 | 53.31 | 166 | 16 | 2 | 256 | 96 | 396442 | 49555 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 358082560 | +| 350m | 2048 | 8 | a100_80gb | 51.62 | 51.62 | 161 | 32 | 2 | 512 | 229 | 470263 | 58782 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 355985408 | +| 350m | 1024 | 8 | a100_80gb | 50.51 | 50.51 | 157 | 64 | 2 | 1024 | 506 | 518504 | 64813 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 354936832 | +| 350m | 512 | 8 | a100_80gb | 50.61 | 50.61 | 157 | 128 | 2 | 2048 | 1083 | 554643 | 69330 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 354412544 | +| 125m | 65536 | 8 | a100_80gb | 54.13 | 54.13 | 168 | 1 | 2 | 16 | 2 | 162946 | 20368 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 174070272 | +| 125m | 32768 | 8 | a100_80gb | 52.71 | 52.71 | 164 | 2 | 2 | 32 | 8 | 291256 | 36407 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 148904448 | +| 125m | 16384 | 8 | a100_80gb | 50.61 | 50.61 | 157 | 4 | 2 | 64 | 29 | 480322 | 60040 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 136321536 | +| 125m | 8192 | 8 | a100_80gb | 48.85 | 48.85 | 152 | 8 | 2 | 128 | 88 | 723142 | 90392 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 130030080 | +| 125m | 4096 | 8 | a100_80gb | 46.08 | 46.08 | 143 | 16 | 2 | 256 | 231 | 947172 | 118396 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 126884352 | +| 125m | 2048 | 8 | a100_80gb | 44.79 | 44.79 | 139 | 40 | 2 | 640 | 557 | 1142641 | 142830 | 1310720 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 125311488 | +| 125m | 2048 | 8 | a100_80gb | 44.45 | 44.45 | 138 | 32 | 2 | 512 | 553 | 1133901 | 141737 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 125311488 | +| 125m | 1024 | 8 | a100_80gb | 43.15 | 43.15 | 134 | 64 | 2 | 1024 | 1222 | 1251751 
| 156468 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 124525056 | +| 125m | 512 | 8 | a100_80gb | 42.56 | 42.56 | 132 | 128 | 2 | 2048 | 2588 | 1325455 | 165681 | 1048576 | amp_bf16 | DEFAULT | FULL_SHARD | False | False | 124131840 | ## A100 40GB with 1600 Gbps node-node interconnect (RoCE) -| Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | +| Model | SeqLen (T) | # GPUs | GPU | MFU | HFU | Model TFLOP| MicroBatchSize | GradAccum | GlobalBatchSize | Throughput (S/s) | Throughput (T/s) | Throughput (T/s/GPU) | GlobalBatchSize (T) | Precision | MP Mode | Sharding Strategy | Activation Checkpointing | Activation CPUOffload | NumParams | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 70b | 2048 | 128 | a100_40gb | 48.91 | 65.21 | 4 | 1 | 512 | 23 | 48194 | 376 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | -| 70b | 2048 | 64 | a100_40gb | 35.87 | 47.82 | 2 | 1 | 128 | 8 | 17672 | 276 | 262144 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | -| 30b | 2048 | 128 | a100_40gb | 52.25 | 69.66 | 6 | 1 | 768 | 54 | 110803 | 865 | 1572864 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | -| 30b | 2048 | 32 | a100_40gb | 51.74 | 68.98 | 4 | 1 | 128 | 13 | 27431 | 857 | 262144 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | -| 13b | 8192 | 8 | a100_40gb | 43.95 | 58.6 | 1 | 16 | 128 | 1 | 11258 | 1407 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12885411840 | -| 13b | 4096 | 8 | a100_40gb | 44.85 | 59.8 | 2 | 16 | 256 | 3 | 12830 | 1603 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12864440320 | -| 13b | 2048 | 128 | a100_40gb | 51.93 | 69.24 | 16 | 1 | 2048 | 123 | 252444 | 1972 | 4194304 | bf16 
| PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 2048 | 64 | a100_40gb | 52.04 | 69.39 | 16 | 1 | 1024 | 61 | 126479 | 1976 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 2048 | 32 | a100_40gb | 52.62 | 70.16 | 14 | 1 | 448 | 31 | 63946 | 1998 | 917504 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 2048 | 16 | a100_40gb | 52.5 | 70.0 | 10 | 1 | 160 | 15 | 31900 | 1993 | 327680 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 2048 | 8 | a100_40gb | 43.94 | 58.58 | 4 | 16 | 512 | 6 | 13347 | 1668 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | -| 13b | 1024 | 8 | a100_40gb | 44.07 | 58.76 | 8 | 16 | 1024 | 13 | 13817 | 1727 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12848711680 | -| 13b | 512 | 8 | a100_40gb | 44.28 | 59.04 | 16 | 16 | 2048 | 27 | 14108 | 1763 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12846090240 | -| 7b | 16384 | 8 | a100_40gb | 47.65 | 63.53 | 1 | 4 | 32 | 1 | 17998 | 2249 | 524288 | bf16 | PURE | FULL_SHARD | True | False | 6717579264 | -| 7b | 8192 | 8 | a100_40gb | 49.04 | 65.38 | 3 | 4 | 96 | 2 | 23098 | 2887 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6684024832 | -| 7b | 4096 | 8 | a100_40gb | 50.11 | 66.82 | 6 | 4 | 192 | 6 | 26930 | 3366 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6667247616 | -| 7b | 2048 | 128 | a100_40gb | 50.14 | 66.85 | 18 | 1 | 2304 | 226 | 463749 | 3623 | 4718592 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 2048 | 64 | a100_40gb | 50.73 | 67.64 | 18 | 1 | 1152 | 114 | 234614 | 3665 | 2359296 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 2048 | 32 | a100_40gb | 51.55 | 68.73 | 18 | 1 | 576 | 58 | 119202 | 3725 | 1179648 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 2048 | 16 | a100_40gb | 50.44 | 67.26 | 16 | 1 | 256 | 28 | 58322 | 3645 | 524288 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 2048 | 8 | 
a100_40gb | 50.92 | 67.89 | 12 | 4 | 384 | 14 | 29436 | 3679 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | -| 7b | 1024 | 8 | a100_40gb | 51.31 | 68.42 | 24 | 4 | 768 | 30 | 30833 | 3854 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6654664704 | -| 7b | 512 | 8 | a100_40gb | 50.85 | 67.8 | 48 | 4 | 1536 | 60 | 31167 | 3895 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6652567552 | -| 3b | 32768 | 8 | a100_40gb | 46.03 | 61.37 | 1 | 4 | 32 | 0 | 23640 | 2955 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 2730480640 | -| 3b | 16384 | 8 | a100_40gb | 46.14 | 61.52 | 2 | 8 | 128 | 2 | 35726 | 4465 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 2688537600 | -| 3b | 8192 | 8 | a100_40gb | 55.13 | 55.13 | 1 | 8 | 64 | 6 | 57193 | 7149 | 524288 | bf16 | PURE | FULL_SHARD | False | False | 2667566080 | -| 3b | 4096 | 8 | a100_40gb | 56.18 | 56.18 | 2 | 8 | 128 | 17 | 70223 | 8777 | 524288 | bf16 | PURE | FULL_SHARD | False | False | 2657080320 | -| 3b | 2048 | 128 | a100_40gb | 54.8 | 54.8 | 6 | 1 | 768 | 596 | 1220885 | 9538 | 1572864 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 2048 | 64 | a100_40gb | 55.94 | 55.94 | 6 | 1 | 384 | 304 | 623167 | 9736 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 2048 | 32 | a100_40gb | 56.96 | 56.96 | 6 | 1 | 192 | 154 | 317261 | 9914 | 393216 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 2048 | 16 | a100_40gb | 56.02 | 56.02 | 5 | 1 | 80 | 76 | 156013 | 9750 | 163840 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 2048 | 8 | a100_40gb | 57.82 | 57.82 | 5 | 8 | 320 | 39 | 80520 | 10065 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | -| 3b | 1024 | 8 | a100_40gb | 58.14 | 58.14 | 10 | 8 | 640 | 83 | 85854 | 10731 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 2649216000 | -| 3b | 512 | 8 | a100_40gb | 59.49 | 59.49 | 20 | 8 | 1280 | 176 | 90596 | 11324 | 655360 | bf16 | PURE | 
FULL_SHARD | False | False | 2647905280 | -| 1b | 32768 | 8 | a100_40gb | 45.07 | 60.1 | 1 | 4 | 32 | 1 | 40762 | 5095 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 1378865152 | -| 1b | 16384 | 8 | a100_40gb | 55.23 | 55.23 | 1 | 8 | 64 | 4 | 77723 | 9715 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1345310720 | -| 1b | 8192 | 8 | a100_40gb | 55.29 | 55.29 | 2 | 8 | 128 | 13 | 107799 | 13474 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1328533504 | -| 1b | 4096 | 8 | a100_40gb | 55.85 | 55.85 | 4 | 8 | 256 | 32 | 134851 | 16856 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1320144896 | -| 1b | 2048 | 128 | a100_40gb | 54.41 | 54.41 | 10 | 1 | 1280 | 1165 | 2386897 | 18647 | 2621440 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 64 | a100_40gb | 55.44 | 55.44 | 10 | 1 | 640 | 593 | 1216104 | 19001 | 1310720 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 32 | a100_40gb | 45.39 | 45.39 | 10 | 1 | 320 | 243 | 497782 | 15555 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 16 | a100_40gb | 55.69 | 55.69 | 8 | 1 | 128 | 149 | 305372 | 19085 | 262144 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 2048 | 8 | a100_40gb | 56.23 | 56.23 | 8 | 8 | 512 | 75 | 154171 | 19271 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | -| 1b | 1024 | 8 | a100_40gb | 57.02 | 57.02 | 16 | 8 | 1024 | 163 | 167677 | 20959 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1313853440 | -| 1b | 512 | 8 | a100_40gb | 57.1 | 57.1 | 32 | 8 | 2048 | 340 | 174256 | 21782 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1312804864 | -| 760m | 32768 | 8 | a100_40gb | 44.53 | 59.37 | 1 | 4 | 32 | 1 | 57464 | 7183 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 807656448 | -| 760m | 16384 | 8 | a100_40gb | 53.26 | 53.26 | 1 | 4 | 32 | 6 | 111316 | 13914 | 524288 | bf16 | PURE | FULL_SHARD | False | False | 782490624 | -| 760m | 8192 
| 8 | a100_40gb | 53.12 | 53.12 | 3 | 4 | 96 | 19 | 160853 | 20106 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 769907712 | -| 760m | 4096 | 8 | a100_40gb | 53.0 | 53.0 | 6 | 4 | 192 | 50 | 206909 | 25863 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 763616256 | -| 760m | 2048 | 128 | a100_40gb | 50.73 | 50.73 | 12 | 1 | 1536 | 1808 | 3704382 | 28940 | 3145728 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 64 | a100_40gb | 51.44 | 51.44 | 12 | 1 | 768 | 917 | 1878030 | 29344 | 1572864 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 32 | a100_40gb | 51.97 | 51.97 | 12 | 1 | 384 | 463 | 948745 | 29648 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 16 | a100_40gb | 51.9 | 51.9 | 12 | 1 | 192 | 231 | 473723 | 29607 | 393216 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 2048 | 8 | a100_40gb | 52.89 | 52.89 | 12 | 4 | 384 | 117 | 241389 | 30173 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | -| 760m | 1024 | 8 | a100_40gb | 53.63 | 53.63 | 24 | 4 | 768 | 261 | 267376 | 33422 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 758897664 | -| 760m | 512 | 8 | a100_40gb | 53.47 | 53.47 | 48 | 4 | 1536 | 545 | 279504 | 34938 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 758111232 | -| 350m | 32768 | 8 | a100_40gb | 51.55 | 51.55 | 1 | 4 | 32 | 3 | 107329 | 13416 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 387442688 | -| 350m | 16384 | 8 | a100_40gb | 51.78 | 51.78 | 2 | 4 | 64 | 11 | 183175 | 22896 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 370665472 | -| 350m | 8192 | 8 | a100_40gb | 51.39 | 51.39 | 4 | 4 | 128 | 34 | 279466 | 34933 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 362276864 | -| 350m | 4096 | 8 | a100_40gb | 50.38 | 50.38 | 8 | 4 | 256 | 91 | 374670 | 46833 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 358082560 | -| 350m | 2048 | 128 | a100_40gb | 45.61 | 45.61 | 18 
| 1 | 2304 | 3245 | 6647647 | 51934 | 4718592 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 64 | a100_40gb | 46.27 | 46.27 | 18 | 1 | 1152 | 1646 | 3372118 | 52689 | 2359296 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 32 | a100_40gb | 47.26 | 47.26 | 18 | 1 | 576 | 840 | 1721978 | 53811 | 1179648 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 16 | a100_40gb | 48.66 | 48.66 | 18 | 1 | 288 | 432 | 886622 | 55413 | 589824 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 2048 | 8 | a100_40gb | 49.17 | 49.17 | 16 | 4 | 512 | 218 | 447963 | 55995 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | -| 350m | 1024 | 8 | a100_40gb | 48.73 | 48.73 | 32 | 4 | 1024 | 488 | 500184 | 62523 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 354936832 | -| 350m | 512 | 8 | a100_40gb | 48.39 | 48.39 | 64 | 4 | 2048 | 1035 | 530277 | 66284 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 354412544 | -| 125m | 32768 | 8 | a100_40gb | 47.27 | 47.27 | 1 | 4 | 32 | 7 | 261208 | 32651 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 148904448 | -| 125m | 16384 | 8 | a100_40gb | 46.77 | 46.77 | 2 | 3 | 48 | 27 | 443876 | 55484 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 136321536 | -| 125m | 8192 | 8 | a100_40gb | 46.94 | 46.94 | 5 | 3 | 120 | 84 | 694868 | 86858 | 983040 | bf16 | PURE | FULL_SHARD | False | False | 130030080 | -| 125m | 4096 | 8 | a100_40gb | 44.82 | 44.82 | 13 | 3 | 312 | 224 | 921297 | 115162 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 126884352 | -| 125m | 2048 | 128 | a100_40gb | 38.86 | 38.86 | 26 | 1 | 3328 | 7746 | 15863837 | 123936 | 6815744 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 64 | a100_40gb | 39.27 | 39.27 | 26 | 1 | 1664 | 3913 | 8015010 | 125234 | 3407872 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 32 | a100_40gb | 39.86 | 39.86 | 26 | 1 | 
832 | 1986 | 4067922 | 127122 | 1703936 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 16 | a100_40gb | 40.93 | 40.93 | 26 | 1 | 416 | 1019 | 2088560 | 130535 | 851968 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 2048 | 8 | a100_40gb | 42.75 | 42.75 | 26 | 3 | 624 | 532 | 1090678 | 136334 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | -| 125m | 1024 | 8 | a100_40gb | 40.89 | 40.89 | 52 | 3 | 1248 | 1158 | 1186314 | 148289 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 124525056 | -| 125m | 512 | 8 | a100_40gb | 40.26 | 40.26 | 104 | 3 | 2496 | 2448 | 1253886 | 156735 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 124131840 | +| 70b | 2048 | 128 | a100_40gb | 48.91 | 65.21 | 152 | 4 | 1 | 512 | 23 | 48194 | 376 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | +| 70b | 2048 | 64 | a100_40gb | 35.87 | 47.82 | 111 | 2 | 1 | 128 | 8 | 17672 | 276 | 262144 | bf16 | PURE | FULL_SHARD | True | False | 64862437376 | +| 30b | 2048 | 128 | a100_40gb | 52.25 | 69.66 | 163 | 6 | 1 | 768 | 54 | 110803 | 865 | 1572864 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 30b | 2048 | 32 | a100_40gb | 51.74 | 68.98 | 161 | 4 | 1 | 128 | 13 | 27431 | 857 | 262144 | bf16 | PURE | FULL_SHARD | True | False | 29975214080 | +| 13b | 8192 | 8 | a100_40gb | 43.95 | 58.6 | 137 | 1 | 16 | 128 | 1 | 11258 | 1407 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12885411840 | +| 13b | 4096 | 8 | a100_40gb | 44.85 | 59.8 | 139 | 2 | 16 | 256 | 3 | 12830 | 1603 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12864440320 | +| 13b | 2048 | 128 | a100_40gb | 51.93 | 69.24 | 162 | 16 | 1 | 2048 | 123 | 252444 | 1972 | 4194304 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 64 | a100_40gb | 52.04 | 69.39 | 162 | 16 | 1 | 1024 | 61 | 126479 | 1976 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 32 | a100_40gb | 52.62 | 
70.16 | 164 | 14 | 1 | 448 | 31 | 63946 | 1998 | 917504 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 16 | a100_40gb | 52.5 | 70.0 | 163 | 10 | 1 | 160 | 15 | 31900 | 1993 | 327680 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 2048 | 8 | a100_40gb | 43.94 | 58.58 | 137 | 4 | 16 | 512 | 6 | 13347 | 1668 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12853954560 | +| 13b | 1024 | 8 | a100_40gb | 44.07 | 58.76 | 137 | 8 | 16 | 1024 | 13 | 13817 | 1727 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12848711680 | +| 13b | 512 | 8 | a100_40gb | 44.28 | 59.04 | 138 | 16 | 16 | 2048 | 27 | 14108 | 1763 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 12846090240 | +| 7b | 16384 | 8 | a100_40gb | 47.65 | 63.53 | 148 | 1 | 4 | 32 | 1 | 17998 | 2249 | 524288 | bf16 | PURE | FULL_SHARD | True | False | 6717579264 | +| 7b | 8192 | 8 | a100_40gb | 49.04 | 65.38 | 153 | 3 | 4 | 96 | 2 | 23098 | 2887 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6684024832 | +| 7b | 4096 | 8 | a100_40gb | 50.11 | 66.82 | 156 | 6 | 4 | 192 | 6 | 26930 | 3366 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6667247616 | +| 7b | 2048 | 128 | a100_40gb | 50.14 | 66.85 | 156 | 18 | 1 | 2304 | 226 | 463749 | 3623 | 4718592 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 64 | a100_40gb | 50.73 | 67.64 | 158 | 18 | 1 | 1152 | 114 | 234614 | 3665 | 2359296 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 32 | a100_40gb | 51.55 | 68.73 | 160 | 18 | 1 | 576 | 58 | 119202 | 3725 | 1179648 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 16 | a100_40gb | 50.44 | 67.26 | 157 | 16 | 1 | 256 | 28 | 58322 | 3645 | 524288 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 2048 | 8 | a100_40gb | 50.92 | 67.89 | 158 | 12 | 4 | 384 | 14 | 29436 | 3679 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6658859008 | +| 7b | 1024 | 8 | a100_40gb | 51.31 | 
68.42 | 160 | 24 | 4 | 768 | 30 | 30833 | 3854 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6654664704 | +| 7b | 512 | 8 | a100_40gb | 50.85 | 67.8 | 158 | 48 | 4 | 1536 | 60 | 31167 | 3895 | 786432 | bf16 | PURE | FULL_SHARD | True | False | 6652567552 | +| 3b | 32768 | 8 | a100_40gb | 46.03 | 61.37 | 143 | 1 | 4 | 32 | 0 | 23640 | 2955 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 2730480640 | +| 3b | 16384 | 8 | a100_40gb | 46.14 | 61.52 | 143 | 2 | 8 | 128 | 2 | 35726 | 4465 | 2097152 | bf16 | PURE | FULL_SHARD | True | False | 2688537600 | +| 3b | 8192 | 8 | a100_40gb | 55.13 | 55.13 | 172 | 1 | 8 | 64 | 6 | 57193 | 7149 | 524288 | bf16 | PURE | FULL_SHARD | False | False | 2667566080 | +| 3b | 4096 | 8 | a100_40gb | 56.18 | 56.18 | 175 | 2 | 8 | 128 | 17 | 70223 | 8777 | 524288 | bf16 | PURE | FULL_SHARD | False | False | 2657080320 | +| 3b | 2048 | 128 | a100_40gb | 54.8 | 54.8 | 170 | 6 | 1 | 768 | 596 | 1220885 | 9538 | 1572864 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 64 | a100_40gb | 55.94 | 55.94 | 174 | 6 | 1 | 384 | 304 | 623167 | 9736 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 32 | a100_40gb | 56.96 | 56.96 | 177 | 6 | 1 | 192 | 154 | 317261 | 9914 | 393216 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 16 | a100_40gb | 56.02 | 56.02 | 174 | 5 | 1 | 80 | 76 | 156013 | 9750 | 163840 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 2048 | 8 | a100_40gb | 57.82 | 57.82 | 180 | 5 | 8 | 320 | 39 | 80520 | 10065 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 2651837440 | +| 3b | 1024 | 8 | a100_40gb | 58.14 | 58.14 | 181 | 10 | 8 | 640 | 83 | 85854 | 10731 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 2649216000 | +| 3b | 512 | 8 | a100_40gb | 59.49 | 59.49 | 185 | 20 | 8 | 1280 | 176 | 90596 | 11324 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 2647905280 | +| 1b | 32768 | 8 | a100_40gb | 45.07 | 60.1 | 
140 | 1 | 4 | 32 | 1 | 40762 | 5095 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 1378865152 | +| 1b | 16384 | 8 | a100_40gb | 55.23 | 55.23 | 172 | 1 | 8 | 64 | 4 | 77723 | 9715 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1345310720 | +| 1b | 8192 | 8 | a100_40gb | 55.29 | 55.29 | 172 | 2 | 8 | 128 | 13 | 107799 | 13474 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1328533504 | +| 1b | 4096 | 8 | a100_40gb | 55.85 | 55.85 | 174 | 4 | 8 | 256 | 32 | 134851 | 16856 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1320144896 | +| 1b | 2048 | 128 | a100_40gb | 54.41 | 54.41 | 169 | 10 | 1 | 1280 | 1165 | 2386897 | 18647 | 2621440 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | +| 1b | 2048 | 64 | a100_40gb | 55.44 | 55.44 | 172 | 10 | 1 | 640 | 593 | 1216104 | 19001 | 1310720 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | +| 1b | 2048 | 32 | a100_40gb | 45.39 | 45.39 | 141 | 10 | 1 | 320 | 243 | 497782 | 15555 | 655360 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | +| 1b | 2048 | 16 | a100_40gb | 55.69 | 55.69 | 173 | 8 | 1 | 128 | 149 | 305372 | 19085 | 262144 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | +| 1b | 2048 | 8 | a100_40gb | 56.23 | 56.23 | 175 | 8 | 8 | 512 | 75 | 154171 | 19271 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1315950592 | +| 1b | 1024 | 8 | a100_40gb | 57.02 | 57.02 | 177 | 16 | 8 | 1024 | 163 | 167677 | 20959 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1313853440 | +| 1b | 512 | 8 | a100_40gb | 57.1 | 57.1 | 178 | 32 | 8 | 2048 | 340 | 174256 | 21782 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 1312804864 | +| 760m | 32768 | 8 | a100_40gb | 44.53 | 59.37 | 138 | 1 | 4 | 32 | 1 | 57464 | 7183 | 1048576 | bf16 | PURE | FULL_SHARD | True | False | 807656448 | +| 760m | 16384 | 8 | a100_40gb | 53.26 | 53.26 | 166 | 1 | 4 | 32 | 6 | 111316 | 13914 | 524288 | bf16 | PURE | FULL_SHARD | False | False | 782490624 | +| 760m | 8192 | 8 | 
a100_40gb | 53.12 | 53.12 | 165 | 3 | 4 | 96 | 19 | 160853 | 20106 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 769907712 | +| 760m | 4096 | 8 | a100_40gb | 53.0 | 53.0 | 165 | 6 | 4 | 192 | 50 | 206909 | 25863 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 763616256 | +| 760m | 2048 | 128 | a100_40gb | 50.73 | 50.73 | 158 | 12 | 1 | 1536 | 1808 | 3704382 | 28940 | 3145728 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | +| 760m | 2048 | 64 | a100_40gb | 51.44 | 51.44 | 160 | 12 | 1 | 768 | 917 | 1878030 | 29344 | 1572864 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | +| 760m | 2048 | 32 | a100_40gb | 51.97 | 51.97 | 162 | 12 | 1 | 384 | 463 | 948745 | 29648 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | +| 760m | 2048 | 16 | a100_40gb | 51.9 | 51.9 | 161 | 12 | 1 | 192 | 231 | 473723 | 29607 | 393216 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | +| 760m | 2048 | 8 | a100_40gb | 52.89 | 52.89 | 165 | 12 | 4 | 384 | 117 | 241389 | 30173 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 760470528 | +| 760m | 1024 | 8 | a100_40gb | 53.63 | 53.63 | 167 | 24 | 4 | 768 | 261 | 267376 | 33422 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 758897664 | +| 760m | 512 | 8 | a100_40gb | 53.47 | 53.47 | 166 | 48 | 4 | 1536 | 545 | 279504 | 34938 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 758111232 | +| 350m | 32768 | 8 | a100_40gb | 51.55 | 51.55 | 160 | 1 | 4 | 32 | 3 | 107329 | 13416 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 387442688 | +| 350m | 16384 | 8 | a100_40gb | 51.78 | 51.78 | 161 | 2 | 4 | 64 | 11 | 183175 | 22896 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 370665472 | +| 350m | 8192 | 8 | a100_40gb | 51.39 | 51.39 | 160 | 4 | 4 | 128 | 34 | 279466 | 34933 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 362276864 | +| 350m | 4096 | 8 | a100_40gb | 50.38 | 50.38 | 157 | 8 | 4 | 256 | 91 | 374670 | 46833 | 1048576 | bf16 | PURE | FULL_SHARD | False | 
False | 358082560 | +| 350m | 2048 | 128 | a100_40gb | 45.61 | 45.61 | 142 | 18 | 1 | 2304 | 3245 | 6647647 | 51934 | 4718592 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | +| 350m | 2048 | 64 | a100_40gb | 46.27 | 46.27 | 144 | 18 | 1 | 1152 | 1646 | 3372118 | 52689 | 2359296 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | +| 350m | 2048 | 32 | a100_40gb | 47.26 | 47.26 | 147 | 18 | 1 | 576 | 840 | 1721978 | 53811 | 1179648 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | +| 350m | 2048 | 16 | a100_40gb | 48.66 | 48.66 | 151 | 18 | 1 | 288 | 432 | 886622 | 55413 | 589824 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | +| 350m | 2048 | 8 | a100_40gb | 49.17 | 49.17 | 153 | 16 | 4 | 512 | 218 | 447963 | 55995 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 355985408 | +| 350m | 1024 | 8 | a100_40gb | 48.73 | 48.73 | 152 | 32 | 4 | 1024 | 488 | 500184 | 62523 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 354936832 | +| 350m | 512 | 8 | a100_40gb | 48.39 | 48.39 | 150 | 64 | 4 | 2048 | 1035 | 530277 | 66284 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 354412544 | +| 125m | 32768 | 8 | a100_40gb | 47.27 | 47.27 | 147 | 1 | 4 | 32 | 7 | 261208 | 32651 | 1048576 | bf16 | PURE | FULL_SHARD | False | False | 148904448 | +| 125m | 16384 | 8 | a100_40gb | 46.77 | 46.77 | 145 | 2 | 3 | 48 | 27 | 443876 | 55484 | 786432 | bf16 | PURE | FULL_SHARD | False | False | 136321536 | +| 125m | 8192 | 8 | a100_40gb | 46.94 | 46.94 | 146 | 5 | 3 | 120 | 84 | 694868 | 86858 | 983040 | bf16 | PURE | FULL_SHARD | False | False | 130030080 | +| 125m | 4096 | 8 | a100_40gb | 44.82 | 44.82 | 139 | 13 | 3 | 312 | 224 | 921297 | 115162 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 126884352 | +| 125m | 2048 | 128 | a100_40gb | 38.86 | 38.86 | 121 | 26 | 1 | 3328 | 7746 | 15863837 | 123936 | 6815744 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | +| 125m | 2048 | 64 | a100_40gb | 39.27 | 39.27 | 122 | 26 | 1 | 1664 | 
3913 | 8015010 | 125234 | 3407872 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | +| 125m | 2048 | 32 | a100_40gb | 39.86 | 39.86 | 124 | 26 | 1 | 832 | 1986 | 4067922 | 127122 | 1703936 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | +| 125m | 2048 | 16 | a100_40gb | 40.93 | 40.93 | 127 | 26 | 1 | 416 | 1019 | 2088560 | 130535 | 851968 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | +| 125m | 2048 | 8 | a100_40gb | 42.75 | 42.75 | 133 | 26 | 3 | 624 | 532 | 1090678 | 136334 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 125311488 | +| 125m | 1024 | 8 | a100_40gb | 40.89 | 40.89 | 127 | 52 | 3 | 1248 | 1158 | 1186314 | 148289 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 124525056 | +| 125m | 512 | 8 | a100_40gb | 40.26 | 40.26 | 125 | 104 | 3 | 2496 | 2448 | 1253886 | 156735 | 1277952 | bf16 | PURE | FULL_SHARD | False | False | 124131840 | diff --git a/scripts/train/benchmarking/collect_results.py b/scripts/train/benchmarking/collect_results.py index 050390b743..d3691e951c 100644 --- a/scripts/train/benchmarking/collect_results.py +++ b/scripts/train/benchmarking/collect_results.py @@ -6,9 +6,10 @@ import math from typing import Any, Dict, List, Union -from mcli import sdk as msdk +from composer.callbacks.speed_monitor import \ + GPU_AVAILABLE_FLOPS as GPU_FLOP_DICT -GPU_AVAILABLE_FLOPS = 312_000_000_000_000 +from mcli import sdk as msdk def str_to_bool(value: Union[bool, str]): @@ -46,13 +47,19 @@ def parse_args(): def get_runs(args: argparse.Namespace): - runs = [r for r in msdk.get_runs() if args.project in r.name] + runs = [ + r for r in msdk.get_runs(include_details=True) + if args.project in r.name.split('-')[0] and + r.status == msdk.RunStatus('COMPLETED') + ] for filter in args.filters: runs = [r for r in runs if filter in r.name] def sort_key(r: msdk.Run): model_name = r.name.split('-')[2] - num_gpu = r.config.gpu_num + num_gpu = r.gpus + gpu_type = r.gpu_type + model_precision = 
r.submitted_config.parameters['precision'] if model_name[-1] == 'm': model_name_size = 1e6 elif model_name[-1] == 'b': @@ -61,9 +68,12 @@ def sort_key(r: msdk.Run): print(model_name) raise ValueError model_size = int(model_name[:-1]) - return (model_name_size, model_size, r.config.parameters['max_seq_len'], - num_gpu, r.config.parameters['global_train_batch_size']) + return (gpu_type, model_precision, model_name_size, model_size, + r.submitted_config.parameters['max_seq_len'], num_gpu, + r.submitted_config.parameters['global_train_batch_size']) + unique_runs = {sort_key(i): i for i in runs} + runs = [unique_runs[r] for r in unique_runs] runs.sort(reverse=True, key=sort_key) return runs @@ -83,17 +93,7 @@ def filter_runs(runs: List[msdk.Run]): pop_runs = [] for run in runs: - if run.status in [ - msdk.RunStatus('FAILED_PULL'), - msdk.RunStatus('PENDING'), - msdk.RunStatus('QUEUED'), - msdk.RunStatus('RUNNING'), - msdk.RunStatus('SCHEDULED'), - msdk.RunStatus('STARTING'), - msdk.RunStatus('STOPPED'), - msdk.RunStatus('STOPPING'), - msdk.RunStatus('TERMINATING'), - ]: + if run.status != msdk.RunStatus('COMPLETED'): print(f'run {run.name} has run status {run.status}') pop_runs.append(run) for run in pop_runs: @@ -106,13 +106,22 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]: n_params = micro_batchsize = throughput = -1 model_name = run.name.split('-')[2] - gpu_num = run.config.gpu_num - gpu_type = run.config.gpu_type - - fsdp_config = run.config.parameters['fsdp_config'] - - seq_len = run.config.parameters['max_seq_len'] - global_train_batch_size = run.config.parameters['global_train_batch_size'] + gpus = run.gpus + gpu_type = run.gpu_type + + if 'h100' in gpu_type: + gpu_type = 'h100-sxm' + if 'a100' in gpu_type: + gpu_type = 'a100' + GPU_AVAILABLE_FLOPS = GPU_FLOP_DICT[gpu_type][ + run.submitted_config.parameters['precision']] + + gpu_type = run.gpu_type + fsdp_config = run.submitted_config.parameters['fsdp_config'] + + seq_len = 
run.submitted_config.parameters['max_seq_len'] + global_train_batch_size = run.submitted_config.parameters[ + 'global_train_batch_size'] activation_checkpointing = fsdp_config['activation_checkpointing'] logs = msdk.get_run_logs(run) @@ -138,8 +147,8 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]: throughput = float(line.split(' ')[-1]) break - d_model = run.config.parameters['model']['d_model'] - n_layers = run.config.parameters['model']['n_layers'] + d_model = run.submitted_config.parameters['model']['d_model'] + n_layers = run.submitted_config.parameters['model']['n_layers'] # mfu is approximated using thoughtput and param count # the number of paramters is approximately the number of multiply-accumulates (MAC) in the network @@ -153,31 +162,36 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]: attn_flops_per_seq = n_layers * 2 * 2 * (d_model * (seq_len**2)) # there are 2 ops in bwd pass and 1 in fwd pass so we mult by 3 mfu_w_attn = (3 * flops_per_seq + 3 * attn_flops_per_seq) * throughput / ( - gpu_num * GPU_AVAILABLE_FLOPS) + gpus * GPU_AVAILABLE_FLOPS) if activation_checkpointing: hfu_w_attn = (4 * flops_per_seq + 4 * attn_flops_per_seq - ) * throughput / (gpu_num * GPU_AVAILABLE_FLOPS) + ) * throughput / (gpus * GPU_AVAILABLE_FLOPS) else: hfu_w_attn = mfu_w_attn + model_tflop = int( + (3 * flops_per_seq + 3 * attn_flops_per_seq) * throughput / gpus / 1e12) + return { 'Model': model_name, 'SeqLen (T)': seq_len, '# GPUs': - gpu_num, + gpus, 'GPU': gpu_type, 'MFU': round(mfu_w_attn * 100, 2), 'HFU': round(hfu_w_attn * 100, 2), + 'Model TFLOP': + model_tflop, 'MicroBatchSize': micro_batchsize, 'GradAccum': - math.ceil(global_train_batch_size / gpu_num / micro_batchsize), + math.ceil(global_train_batch_size / gpus / micro_batchsize), 'GlobalBatchSize': global_train_batch_size, 'Throughput (S/s)': @@ -185,11 +199,11 @@ def parse_run(run: msdk.Run) -> Dict[str, Any]: 'Throughput (T/s)': int(throughput * seq_len), 'Throughput (T/s/GPU)': - int(throughput * 
seq_len / gpu_num), + int(throughput * seq_len / gpus), 'GlobalBatchSize (T)': global_train_batch_size * seq_len, 'Precision': - run.config.parameters['precision'], + run.submitted_config.parameters['precision'], 'MP Mode': fsdp_config['mixed_precision'], 'Sharding Strategy': diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py index f7db0613ef..6530e79b0b 100644 --- a/scripts/train/benchmarking/submit_benchmarks.py +++ b/scripts/train/benchmarking/submit_benchmarks.py @@ -62,7 +62,7 @@ def parse_args(): type=str, default=['bf16'], nargs='+', - choices=['bf16', 'fp16']) + choices=['bf16', 'fp16', 'fp8']) parser.add_argument('--fsdp_config_mixed_precision', type=str, default='PURE') @@ -71,6 +71,31 @@ def parse_args(): nargs='?', const=True, default=None) + parser.add_argument('--fsdp_config_shard_strategy', + type=str, + nargs='?', + const=True, + default=None) + parser.add_argument('--fsdp_config_limit_all_gathers', + type=str_to_bool, + nargs='?', + const=True, + default=None) + parser.add_argument('--fsdp_config_forward_prefetch', + type=str_to_bool, + nargs='?', + const=True, + default=None) + parser.add_argument('--fsdp_config_backward_prefetch', + type=str, + nargs='?', + const=True, + default=None) + parser.add_argument('--activation_cpu_offload', + type=str_to_bool, + nargs='?', + const=True, + default=None) parser.add_argument( '-s', '--seq_len_exp', @@ -121,7 +146,7 @@ def parse_args(): parser.add_argument('-c', '--clusters', type=str, - default=['r7z2'], + default=['r1z1'], nargs='+', choices=CLUSTER_INFO.keys()) known_args = parser.parse_known_args()[0] @@ -136,7 +161,7 @@ def parse_args(): parser.add_argument('-g', '--gpu_nums', type=int, - default=[16], + default=[8], nargs='+', choices=_gpu_nums) @@ -158,14 +183,13 @@ def parse_args(): const=True, default=True) - parser.add_argument('--priority', type=str, default='low') + parser.add_argument('--priority', type=str, default='lowest') 
parser.add_argument('--RUN', type=str_to_bool, nargs='?', const=True, default=False) - return parser.parse_args() @@ -236,19 +260,26 @@ def get_valid_gpu_lim(cluster: str, gpu_type: str): raise ValueError -def mod_parameters(parameters: Dict[str, Any], - max_seq_len: int, - global_train_batch_size: int, - precision: str, - fsdp_config_mixed_precision: str = 'DEFAULT', - fsdp_config_activation_checkpointing: Optional[bool] = None, - run_name: str = '', - data_remote: Optional[str] = None, - max_duration: str = '30ba', - eval_interval: int = 0, - microbatch_size: Optional[Union[int, str]] = None, - wandb: bool = True, - pad_vocab_multiple: Optional[int] = None): +def mod_parameters( + parameters: Dict[str, Any], + max_seq_len: int, + global_train_batch_size: int, + precision: str, + fsdp_config_mixed_precision: str = 'DEFAULT', + fsdp_config_activation_checkpointing: Optional[bool] = None, + fsdp_config_shard_strategy: Optional[str] = None, + fsdp_config_forward_prefetch: Optional[bool] = None, + fsdp_config_backward_prefetch: Optional[str] = None, + fsdp_config_limit_all_gathers: Optional[bool] = None, + activation_cpu_offload: Optional[bool] = None, + run_name: str = '', + data_remote: Optional[str] = None, + max_duration: str = '30ba', + eval_interval: int = 0, + microbatch_size: Optional[Union[int, str]] = None, + wandb: bool = True, + pad_vocab_multiple: Optional[int] = None, +): if run_name: parameters['run_name'] = run_name if data_remote is not None: @@ -271,9 +302,9 @@ def mod_parameters(parameters: Dict[str, Any], parameters['max_seq_len'] = max_seq_len parameters['model']['max_seq_len'] = max_seq_len - parameters['model']['attn_impl'] = args.attn_impl + parameters['model']['attn_config']['attn_impl'] = args.attn_impl - parameters['model']['low_precision_layernorm'] = True + parameters['model']['norm_type'] = 'low_precision_layernorm' # Pad vocab size to multiple of N for A100 perf if pad_vocab_multiple: @@ -305,9 +336,21 @@ def mod_parameters(parameters: 
Dict[str, Any], if fsdp_config_activation_checkpointing is not None: parameters['fsdp_config'][ 'activation_checkpointing'] = fsdp_config_activation_checkpointing - - parameters['fsdp_config']['activation_checkpointing_reentrant'] = False - parameters['fsdp_config']['limit_all_gathers'] = True + if fsdp_config_shard_strategy is not None: + parameters['fsdp_config'][ + 'sharding_strategy'] = fsdp_config_shard_strategy + if fsdp_config_limit_all_gathers is not None: + parameters['fsdp_config'][ + 'limit_all_gathers'] = fsdp_config_limit_all_gathers + if fsdp_config_forward_prefetch is not None: + parameters['fsdp_config'][ + 'forward_prefetch'] = fsdp_config_forward_prefetch + if fsdp_config_backward_prefetch is not None: + parameters['fsdp_config'][ + 'backward_prefetch'] = fsdp_config_backward_prefetch + if activation_cpu_offload is not None: + parameters['fsdp_config'][ + 'activation_cpu_offload'] = activation_cpu_offload if wandb: # add wandb @@ -332,7 +375,7 @@ def get_integrations(project: str, } git_integration.update({ 'integration_type': 'git_repo', - 'git_repo': 'mosaicml/examples', + 'git_repo': 'mosaicml/llm-foundry', 'pip_install': '-e .[gpu]' }) @@ -351,30 +394,42 @@ def get_integrations(project: str, def run_config(config: Tuple[str, int, int, str, str, int, str], args: argparse.Namespace): model_yaml, max_seq_len, global_train_batch_size, cluster, gpu_type, gpu_num, precision = config - - integrations = get_integrations( - args.project, - git_branch=args.git_branch, - git_commit=args.git_commit, - wandb=args.wandb) # point to git repo and potentially wandb - - # Define our command - if args.data_remote is not None: - command = """ - cd examples/scripts - - composer train/train.py /mnt/config/parameters.yaml + integrations = [ + { + 'integration_type': 'git_repo', + 'git_repo': 'mosaicml/llm-foundry', + 'git_branch': 'v0.3.0', + 'pip_install': '-e .[gpu]', + }, + { + 'integration_type': 'wandb', + 'entity': 'mosaic-ml', + 'project': args.project + }, + 
] + + command = '' + if gpu_type == 'h100_80gb' and 'fp8' in precision: # Required for flash-attn and FP8 training + command += f""" + pip install flash-attn==1.0.7 --no-build-isolation + pip install git+https://github.com/NVIDIA/TransformerEngine.git@v0.10 + pip uninstall pydantic --yes + pip install pydantic==1.9.0 """ + + if args.data_remote is None: + command += f""" + cd llm-foundry/scripts + python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --tokenizer gpt2 --eos_text '<|endoftext|>' + composer train/train.py /mnt/config/parameters.yaml + """ else: command = f""" - cd examples/scripts - - python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --tokenizer gpt2 --eos_text '<|endoftext|>' - - composer train/train.py /mnt/config/parameters.yaml - """ + cd llm-foundry/scripts + composer train/train.py /mnt/config/parameters.yaml + """ - path = os.path.join('../yamls/mpt', model_yaml) + path = os.path.join('../yamls/pretrain', 'mpt-' + model_yaml) parameters = get_parameters(path) model_name = '-'.join(model_yaml.split('.')[-2].split('/')[-2:]).replace( @@ -391,23 +446,28 @@ def run_config(config: Tuple[str, int, int, str, str, int, str], _name = name name = name[:name_len_lim] print(f'Shortening {_name} to {name} ({name_len_lim} chars)') - microbatch_size = args.microbatch_size or 'auto' assert isinstance(microbatch_size, (int, str)) parameters = mod_parameters( parameters, max_seq_len, global_train_batch_size, - precision, - fsdp_config_mixed_precision=args.fsdp_config_mixed_precision, + 'amp_' + precision, fsdp_config_activation_checkpointing=args.
fsdp_config_activation_checkpointing, + fsdp_config_limit_all_gathers=args.fsdp_config_limit_all_gathers, + fsdp_config_shard_strategy=args.fsdp_config_shard_strategy, + fsdp_config_forward_prefetch=args.fsdp_config_forward_prefetch, + fsdp_config_backward_prefetch=args.fsdp_config_backward_prefetch, + activation_cpu_offload=args.activation_cpu_offload, run_name=name, data_remote=args.data_remote, microbatch_size=microbatch_size, wandb=args.wandb, - pad_vocab_multiple=args.pad_vocab_multiple) - + pad_vocab_multiple=args.pad_vocab_multiple, + ) + if gpu_type == 'h100_80gb' and precision == 'fp8': + parameters['model']['fc_type'] = 'te' # Create run config mcli sdk/api config = RunConfig(name=name, gpu_type=gpu_type, @@ -417,8 +477,8 @@ def run_config(config: Tuple[str, int, int, str, str, int, str], integrations=integrations, command=command, parameters=parameters, - scheduling=SchedulingConfig(priority=args.priority)) - + scheduling=SchedulingConfig(priority=args.priority, + resumable=True)) if args.RUN: # Create the run from a config run = create_run(config) @@ -461,7 +521,6 @@ def run_check_dtms(num_gpus: int, dtms: int, batch_size: int): if __name__ == '__main__': args = parse_args() - n_jobs = 0 for max_seq_len in get_max_seq_lens(args.seq_len_exp): for cluster in args.clusters: @@ -497,7 +556,6 @@ def run_check_dtms(num_gpus: int, dtms: int, batch_size: int): global_train_batch_size, cluster, gpu_type, gpu_num, precision) - print(config) run_config(config, args) n_jobs += 1 diff --git a/scripts/train/benchmarking/sweep.sh b/scripts/train/benchmarking/sweep.sh index 5d962b7c5c..97372ee6fd 100755 --- a/scripts/train/benchmarking/sweep.sh +++ b/scripts/train/benchmarking/sweep.sh @@ -2,34 +2,148 @@ PROJECT="tput" GIT_COMMIT="v0.0.4" -IMAGE="mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04" -CLUSTER_80GB=YOUR_CLUSTER_80GB -CLUSTER_40GB=YOUR_CLUSTER_40GB +IMAGE="mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04" +CLUSTER_H100= CLUSTER_80GB= CLUSTER_40GB= # TODO + +for PRECISION in fp8
bf16 +do + + # H100 80GB + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 40 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 32 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 24 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 14 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 3 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT 
--gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + + # INCREASE GPU COUNT + for GPU_NUM in 16 32 64 + do + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g $GPU_NUM --microbatch_size 24 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g $GPU_NUM --microbatch_size 20 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb 
--cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + done + + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 16 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 32 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 32 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 64 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 64 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 32 --microbatch_size 14 --accum 3 --image $IMAGE --git_commit 
$GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 32 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 64 --microbatch_size 16 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 64 --microbatch_size 8 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 11 11 --RUN -t ${PRECISION} + + # SCALE SEQUENCE LENGTH + # seqlen 512 + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 96 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 56 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 40 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 64 --accum 2 --image
$IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 20 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 12 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 9 9 --RUN -t ${PRECISION} + # seqlen 1024 + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 48 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 18 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 20 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN -t ${PRECISION} + python submit_benchmarks.py --project 
$PROJECT -m 13b.yaml -g 8 --microbatch_size 40 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 6 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 10 10 --RUN -t ${PRECISION} + # seqlen 4096 + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 12 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 7 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 5 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 16 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 
8 --microbatch_size 1 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 12 12 --RUN -t ${PRECISION} + # seqlen 8192 + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 6 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 3 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 5 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 13 13 --RUN -t ${PRECISION} + # seqlen 16384 + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 
--microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 14 14 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 14 14 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 3 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 14 14 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 14 14 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 14 14 --RUN --fsdp_config_activation_checkpointing false -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 14 14 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 3 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 14 14 --RUN -t ${PRECISION} + # seqlen 32768 + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 15 15 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 15 15 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 
2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 15 15 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 15 15 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 15 15 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 15 15 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 15 15 --RUN -t ${PRECISION} + # seqlen 65536 + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 16 16 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 16 16 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 16 16 --RUN --fsdp_config_activation_checkpointing true -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 16 16 --RUN --fsdp_config_activation_checkpointing true -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 
--microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 16 16 --RUN -t ${PRECISION} + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type h100_80gb --cluster $CLUSTER_H100 -s 16 16 --RUN -t ${PRECISION} +done # A100 80GB # seqlen 2048 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 32 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN +python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 40 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 32 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 24 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 14 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 32 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 20 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 
--RUN +python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 14 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 3 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 7 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false + +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE1 --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 6 --accum 1 --image $IMAGE0 --git_commit $GIT_COMMIT --gpu_type a100_80gb 
--cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false + # INCREASE GPU COUNT -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 16 32 64 --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 16 32 64 --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 16 32 64 --microbatch_size 24 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 16 32 64 --microbatch_size 20 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN +for GPU_NUM in 16 32 64 +do + python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN + python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN + python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g $GPU_NUM --microbatch_size 24 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN + python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g $GPU_NUM --microbatch_size 20 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g $GPU_NUM --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN 
--fsdp_config_activation_checkpointing false +done + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 16 32 64 --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 16 --microbatch_size 24 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 16 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 16 --microbatch_size 10 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 32 64 --microbatch_size 12 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 32 64 --microbatch_size 32 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 32 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 32 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 
11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 64 --microbatch_size 6 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 64 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 32 --microbatch_size 14 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 32 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 64 --microbatch_size 16 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN @@ -37,13 +151,13 @@ python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 64 --microb # SCALE SEQUENCE LENGTH # seqlen 512 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN +python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 96 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT
--gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 56 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 40 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 128 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 80 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 20 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 12 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN # seqlen 1024 python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 64 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN @@ -71,7 +185,7 @@ python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_si python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type 
a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 5 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN -python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 1 --accum 21 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN +python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN # seqlen 16384 python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 4 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN @@ -95,80 +209,3 @@ python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_si python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN --fsdp_config_activation_checkpointing true python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 --RUN python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 2 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 16 16 
--RUN - - -# A100 40GB - -# seqlen 2048 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 26 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 16 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 12 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 8 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 5 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 16 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN - -# INCREASE GPU COUNT -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 16 32 64 128 --microbatch_size 26 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 16 32 64 128 --microbatch_size 18 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 16 32 64 128 --microbatch_size 12 
--accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 16 --microbatch_size 8 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 16 --microbatch_size 5 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 16 --microbatch_size 16 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 16 --microbatch_size 10 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 32 64 128 --microbatch_size 10 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 32 64 128 --microbatch_size 6 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 32 64 128 --microbatch_size 18 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 32 --microbatch_size 14 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 32 --microbatch_size 4 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python 
submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 64 128 --microbatch_size 16 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 64 --microbatch_size 2 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 128 --microbatch_size 6 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN -python submit_benchmarks.py --project $PROJECT -m 70b.yaml -g 128 --microbatch_size 4 --accum 1 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN - -# SCALE SEQUENCE LENGTH -# seqlen 512 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 104 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 64 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 48 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 32 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 20 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 56 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb 
--cluster $CLUSTER_40GB -s 9 9 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 16 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN -# seqlen 1024 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 52 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 32 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 24 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 16 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 28 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN -# seqlen 4096 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 13 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 12 12 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 8 --accum 4 --image $IMAGE --git_commit 
$GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 12 12 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 6 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 12 12 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 4 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 12 12 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 2 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 12 12 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 12 12 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 12 12 --RUN -# seqlen 8192 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 5 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 13 13 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 4 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 13 13 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 3 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 13 13 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 2 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 13 13 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 8 --image $IMAGE 
--git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 3 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 13 13 --RUN -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 13 13 --RUN -# seqlen 16384 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 2 --accum 3 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 14 14 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 14 14 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 14 14 --RUN -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 14 14 --RUN -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 2 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 14 14 --RUN -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 14 14 --RUN -# seqlen 32768 -python submit_benchmarks.py --project $PROJECT -m 125m.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 15 15 --RUN -python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 
--microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 15 15 --RUN -python submit_benchmarks.py --project $PROJECT -m 760m.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true -python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 15 15 --RUN diff --git a/scripts/train/train.py b/scripts/train/train.py index 180b8ef22b..5e93e33056 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -393,6 +393,10 @@ def main(cfg: DictConfig) -> Trainer: 'load_ignore_keys', must_exist=False, default_value=None) + compile_config: Optional[Dict[str, Any]] = pop_config(cfg, + 'compile_config', + must_exist=False, + default_value=None) # Enable autoresume from model checkpoints if possible autoresume_default: bool = False if logged_cfg.get('run_name', None) is not None \ @@ -606,6 +610,7 @@ def main(cfg: DictConfig) -> Trainer: python_log_level=python_log_level, dist_timeout=dist_timeout, profiler=profiler, + compile_config=compile_config, ) print('Logging config')