forked from NVIDIA/NeMo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ci.groovy
119 lines (112 loc) · 5.46 KB
/
ci.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
@Library('blossom-github-lib@master')
import ipp.blossom.*
// CI entry point for pull requests.
//
// Starts a Kubernetes pod through the Jenkins Kubernetes plugin (`podTemplate`),
// checks out the PR inside the `latestdlfw` container, runs style checks,
// installation, the L0 unit tests and two single-GPU L1 jobs in parallel, and
// reports progress/outcome to GitHub through `githubHelper.updateCommitStatus`.
//
// The pod requests 2 GPUs and mounts the NFS `scratch` volume at /testdata.
// NOTE(review): restartPolicy / backoffLimit / tty / shm-size are nested under the
// container entry below; confirm that placement is what the cluster expects.
podTemplate(cloud: 'sc-ipp-blossom-prod', yaml: """
apiVersion: v1
kind: Pod
metadata:
  labels:
    some-label: some-label-value
spec:
  volumes:
  - name: scratch
    nfs:
      server: ipp1-cdot01-col01
      path: /vol/scratch1/scratch.okuchaiev_blossom
  containers:
  - name: latestdlfw
    image: nvcr.io/nvidia/pytorch:22.11-py3
    command:
    - cat
    volumeMounts:
    - name: scratch
      mountPath: /testdata
    resources:
      limits:
        nvidia.com/gpu: 2
    restartPolicy: Never
    backoffLimit: 4
    tty: true
    shm-size: 32g
  nodeSelector:
    kubernetes.io/os: linux
    nvidia.com/gpu_type: "Tesla_T4x4"
    nvidia.com/node_type: gpu_tester
    nvidia.com/driver_version: "510.20"
"""
) {
    node(POD_LABEL) {
        def githubHelper
        stage('Get Token') {
            withCredentials([usernamePassword(credentialsId: 'GHAtoken', passwordVariable: 'GIT_PASSWORD', usernameVariable: 'GIT_USERNAME')]) {
                // Create the helper used for every GitHub interaction below.
                // `githubData` is not defined in this file — presumably provided by the
                // job or the shared library; confirm.
                githubHelper = GithubHelper.getInstance("${GIT_PASSWORD}", githubData)
            }
        }

        // Name of the stage currently executing. It is embedded in the commit-status
        // text, so it must be updated before each stage; otherwise GitHub only ever
        // shows " Running" / " Failed" with no indication of where the build is.
        def stageName = 'Setup'
        try {
            currentBuild.description = githubHelper.getBuildDescription()
            container('latestdlfw') {
                stageName = 'Code checkout'
                stage(stageName) {
                    // Mark the commit as pending on GitHub before doing any work.
                    githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Running", GitHubCommitState.PENDING)
                    // Fetch all PR heads into origin/pr/* and check out this PR's head.
                    checkout changelog: true, poll: true, scm: [$class: 'GitSCM', branches: [[name: "pr/" + githubHelper.getPRNumber()]],
                        doGenerateSubmoduleConfigurations: false,
                        submoduleCfg: [],
                        userRemoteConfigs: [[credentialsId: 'github-token', url: githubHelper.getCloneUrl(), refspec: '+refs/pull/*/head:refs/remotes/origin/pr/*']]]
                }

                stageName = 'Code Style'
                stage(stageName) {
                    // Also links the mounted test data to /home/TestData and lists it,
                    // so a missing NFS mount fails here rather than deep inside a test.
                    sh "apt-get update && \
                        apt-get install -y bc && \
                        nvidia-smi && \
                        pip install -r requirements/requirements_test.txt && \
                        python setup.py style && ls -l /testdata/TestData && ln -s /testdata/TestData /home/TestData && \
                        ls -l /home && ls -l /home/TestData"
                }

                stageName = 'Installation'
                stage(stageName) {
                    sh "git config --global --add safe.directory '*' && nvidia-smi && ./reinstall.sh release"
                }

                stageName = 'L0: GPU unit tests'
                stage(stageName) {
                    sh "NEMO_NUMBA_MINVER=0.53 pytest -m 'not pleasefixme'"
                }

                // The two branches run concurrently, so a single shared name is used for
                // the whole group; the failing branch is identified in the build log.
                stageName = 'L1: parallel GPU tests'
                parallel( //USE CUDA_VISIBLE_DEVICES to execute 2 single GPU tests in parallel here
                    [
                        // NOTE(review): validation/test tgt_file_name point at the .src file
                        // while training uses .ref — looks deliberate for toy data; confirm.
                        "L1: NMT Training Pre-LN": { sh 'CUDA_VISIBLE_DEVICES=0 python examples/nlp/machine_translation/enc_dec_nmt.py \
                            --config-path=conf \
                            --config-name=aayn_base \
                            do_testing=true \
                            model.train_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
                            model.train_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
                            model.validation_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
                            model.validation_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
                            model.test_ds.src_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
                            model.test_ds.tgt_file_name=/testdata/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
                            model.encoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
                            model.decoder_tokenizer.tokenizer_model=/testdata/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
                            model.encoder.pre_ln=true \
                            model.decoder.pre_ln=true \
                            trainer.devices=[0] \
                            trainer.accelerator="gpu" \
                            +trainer.fast_dev_run=true \
                            +trainer.limit_test_batches=2 \
                            exp_manager=null \
                            '},
                        // CUDA_VISIBLE_DEVICES=1 exposes the second GPU as device 0 to this process.
                        "L1: Speech to text": { sh 'CUDA_VISIBLE_DEVICES=1 python examples/asr/asr_ctc/speech_to_text_ctc.py \
                            model.train_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_train.json \
                            model.validation_ds.manifest_filepath=/testdata/TestData/an4_dataset/an4_val.json \
                            trainer.devices=[0] \
                            trainer.accelerator="gpu" \
                            +trainer.fast_dev_run=True \
                            exp_manager=null \
                            '}
                    ]
                )//end of parallel
            }
            githubHelper.updateCommitStatus("$BUILD_URL", "Complete", GitHubCommitState.SUCCESS)
        }
        catch (Exception ex) {
            // Record the failure on the build and on the GitHub commit; the exception is
            // logged rather than rethrown, so the result is set explicitly.
            currentBuild.result = 'FAILURE'
            println ex
            githubHelper.updateCommitStatus("$BUILD_URL", "$stageName Failed", GitHubCommitState.FAILURE)
        }
    }
}