From 53e3291e87e299cc1d480d49595f5dc398785593 Mon Sep 17 00:00:00 2001
From: Andrew Melo
Date: Tue, 25 Jul 2017 17:18:19 -0500
Subject: [PATCH] Retry SLURM job submission

Under load, the SLURM scheduler is prone to barf on any client commands.
Retry the job submission if it fails.
---
Review notes (text in this section is ignored by git-am):
 * The hunk below was revised during review: no sleep after the final
   failed attempt, $((...)) instead of the deprecated $[...], POSIX '='
   inside [ ], and quoted expansions. The post-image blob id on the
   "index" line predates these revisions.
 * Leading whitespace of the context lines was reconstructed; if the hunk
   does not apply cleanly, retry with "git apply --ignore-whitespace".

 src/scripts/slurm_submit.sh | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/scripts/slurm_submit.sh b/src/scripts/slurm_submit.sh
index f02f31f2..4037ce34 100755
--- a/src/scripts/slurm_submit.sh
+++ b/src/scripts/slurm_submit.sh
@@ -85,8 +85,22 @@ bls_add_job_wrapper
 ###############################################################
 
 datenow=`date +%Y%m%d`
-jobID=`${slurm_binpath}/sbatch $bls_tmp_file` # actual submission
-retcode=$?
+# sbatch can fail transiently while the scheduler is under load, so retry
+# the submission up to MAX_RETRY times before reporting the failure.
+retry=0
+MAX_RETRY=3
+until [ "$retry" -ge "$MAX_RETRY" ] ; do
+    jobID=$("${slurm_binpath}/sbatch" "$bls_tmp_file") # actual submission
+    retcode=$?
+    if [ "$retcode" = "0" ] ; then
+        break
+    fi
+    retry=$((retry+1))
+    # Only wait when another attempt follows; do not delay the error path.
+    if [ "$retry" -lt "$MAX_RETRY" ] ; then
+        sleep 10
+    fi
+done
 
 if [ "$retcode" != "0" ] ; then
         rm -f $bls_tmp_file