From 32b9fe7fab3698e8763141cedc421ba19565cc0e Mon Sep 17 00:00:00 2001 From: Jake Smith Date: Wed, 13 Dec 2023 15:13:00 +0000 Subject: [PATCH 1/2] HPCC-31016 Use unique thorworker working dir in k8s Signed-off-by: Jake Smith --- helm/hpcc/templates/_helpers.tpl | 3 +++ helm/hpcc/templates/thor.yaml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/helm/hpcc/templates/_helpers.tpl b/helm/hpcc/templates/_helpers.tpl index e1d6769c525..5822b99caf7 100644 --- a/helm/hpcc/templates/_helpers.tpl +++ b/helm/hpcc/templates/_helpers.tpl @@ -308,6 +308,9 @@ Add ConfigMap volume mount for a component mountPath: /tmp - name: {{ .name }}-hpcctmp-volume mountPath: /var/lib/HPCCSystems +{{- if .tmpSubPath }} + subPath: {{ .tmpSubPath | quote }} +{{- end }} - name: {{ .name }}-configmap-volume mountPath: /etc/config {{- end -}} diff --git a/helm/hpcc/templates/thor.yaml b/helm/hpcc/templates/thor.yaml index fe80bde7395..5e6a4dac6f1 100644 --- a/helm/hpcc/templates/thor.yaml +++ b/helm/hpcc/templates/thor.yaml @@ -244,7 +244,7 @@ data: {{ include "hpcc.mergeEnvironments" $configCtx.me.env | indent 12 }} workingDir: /var/lib/HPCCSystems volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" $configCtx.me | indent 12 }} +{{ include "hpcc.addConfigMapVolumeMount" (deepCopy $configCtx.me | merge (dict "tmpSubPath" $containerNum)) | indent 12 }} {{ include "hpcc.addVolumeMounts" $configCtx | indent 12 }} {{ include "hpcc.addSecretVolumeMounts" $configCtx | indent 12 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $configCtx.root "name" $configCtx.me.name "component" "thorworker") | indent 12 }} From bf7d04ff5029de1e41d418dbfe32206e087f96e9 Mon Sep 17 00:00:00 2001 From: Jake Smith Date: Wed, 13 Dec 2023 15:30:28 +0000 Subject: [PATCH 2/2] HPCC-31017 Report cause of k8s thorworker job failure Ensure that the cause of the failure to apply the k8s thorworker job is reported back to the workunit. 
Also suppress the follow-on 'backoff' failure if the primary cause of failure has already been reported. Signed-off-by: Jake Smith --- thorlcr/master/thmastermain.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/thorlcr/master/thmastermain.cpp b/thorlcr/master/thmastermain.cpp index 459c0d92773..adc174d9a84 100644 --- a/thorlcr/master/thmastermain.cpp +++ b/thorlcr/master/thmastermain.cpp @@ -1162,18 +1162,23 @@ int main( int argc, const char *argv[] ) } if (isContainerized()) { + int retCode = exception ? TEC_Exception : 0; if (!cloudJobName.isEmpty()) { + if (exception) + { + Owned factory = getWorkUnitFactory(); + Owned wu = factory->openWorkUnit(workunit); + if (wu) + { + relayWuidException(wu, exception); + retCode = 0; // if successfully reported, suppress thormanager exit failure that would trigger another exception + } + } if (workerJobInstalled) { try { - if (exception) - { - Owned factory = getWorkUnitFactory(); - Owned wu = factory->openWorkUnit(workunit); - relayWuidException(wu, exception); - } k8s::KeepJobs keepJob = k8s::translateKeepJobs(globals->queryProp("@keepJobs")); switch (keepJob) { @@ -1208,7 +1213,7 @@ int main( int argc, const char *argv[] ) } } } - setExitCode(exception ? TEC_Exception : 0); + setExitCode(retCode); } // cleanup handler to be sure we end