4
# Based on globus submission script for pbs
6
# Submits job to SLURM.
7
# Input: path to grami file (same as Globus).
9
# The temporary job script is created for the submission and then removed
10
# at the end of this script.
12
echo "----- starting submit_slurm_job -----" 1>&2
15
# ARC1 passes first the config file.
16
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi
19
basedir=`cd $basedir > /dev/null && pwd` || exit $?
23
. ${pkgdatadir}/submit_common.sh || exit $?
25
##############################################################
26
# Parse grami file, read arc config
27
##############################################################
33
failures_file="$joboption_controldir/job.$joboption_gridid.failed"
35
if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
36
if [ -z "${RUNTIME_LOCAL_SCRATCH_DIR}" ] ; then
37
echo "Need to know at which directory to run job: RUNTIME_LOCAL_SCRATCH_DIR must be set if RUNTIME_NODE_SEES_FRONTEND is empty" 1>&2
38
echo "Submission: Configuration error.">>"$failures_file"
43
##############################################################
44
# Zero stage of runtime environments
45
##############################################################
48
##############################################################
50
##############################################################
54
##############################################################
56
##############################################################
57
echo "#!/bin/bash -l" > $LRMS_JOB_SCRIPT
58
echo "# SLURM batch job script built by grid-manager" >> $LRMS_JOB_SCRIPT
60
# rerun is handled by GM, do not let SLURM requeue jobs itself.
61
echo "#SBATCH --no-requeue" >> $LRMS_JOB_SCRIPT
63
# write SLURM output to 'comment' file
64
echo "#SBATCH -e ${joboption_directory}.comment">> $LRMS_JOB_SCRIPT
65
echo "#SBATCH -o ${joboption_directory}.comment">> $LRMS_JOB_SCRIPT
66
echo "" >> $LRMS_JOB_SCRIPT
68
if [ ! -z "${joboption_queue}" ] ; then
69
echo "#SBATCH -p $joboption_queue" >> $LRMS_JOB_SCRIPT
71
# project name for accounting
72
if [ ! -z "${joboption_rsl_project}" ] ; then
73
echo "#SBATCH -U $joboption_rsl_project" >> $LRMS_JOB_SCRIPT
75
# job name for convenience
76
if [ ! -z "${joboption_jobname}" ] ; then
77
# Derive a restricted SLURM job name from the user-supplied one:
#   1. if the first character is not a letter, prefix the name with "N";
#   2. replace every non-alphanumeric character with "_";
#   3. keep at most the first 15 characters.
# TODO: is this necessary? Do parts of the infosys need these limitations?
78
jobname=`echo "$joboption_jobname" | \
79
sed 's/^\([^[:alpha:]]\)/N\1/' | \
80
sed 's/[^[:alnum:]]/_/g' | \
81
sed 's/\(...............\).*/\1/'`
82
echo "#SBATCH -J '$jobname'" >> $LRMS_JOB_SCRIPT
85
echo "#SBATCH -J '$jobname'" >> $LRMS_JOB_SCRIPT
87
echo "SLURM jobname: $jobname" 1>&2
88
# Set up the user's environment on the compute node where the script runs.
90
echo "#SBATCH --get-user-env=10L" >> $LRMS_JOB_SCRIPT
92
##############################################################
94
##############################################################
98
nodes_string="#SBATCH -n ${joboption_count}"
99
echo "$nodes_string" >> $LRMS_JOB_SCRIPT
101
if [ ! -z $joboption_countpernode ] && [ $joboption_countpernode -gt 0 ] ; then
102
echo "#SBATCH -c $joboption_countpernode" >> $LRMS_JOB_SCRIPT
105
nodes_string="#SBATCH "
108
eval "var_is_set=\${joboption_nodeproperty_$i+yes}"
109
while [ ! -z "${var_is_set}" ] ; do
110
eval "var_value=\${joboption_nodeproperty_$i}"
111
nodes_string="${nodes_string} ${var_value}"
113
eval "var_is_set=\${joboption_nodeproperty_$i+yes}"
115
echo "$nodes_string" >> $LRMS_JOB_SCRIPT
117
if [ "$joboption_exclusivenode" = "true" ]; then
118
echo "#SBATCH --exclusive" >> $LRMS_JOB_SCRIPT
122
##############################################################
123
# Execution times (minutes)
124
##############################################################
125
if [ ! -z "$joboption_cputime" ] ; then
126
if [ $joboption_cputime -lt 0 ] ; then
129
# this is actually walltime deduced from cputime !
130
# Per-slot limit: the requested CPU time divided by joboption_count
# (integer division, any remainder is dropped).
# NOTE(review): the value is presumably in seconds, since it is split into
# minutes and seconds below — confirm against the grami producer.
maxcputime=$(( $joboption_cputime / $joboption_count ))
131
# Whole minutes contained in the limit.
cputime_min=$(( $maxcputime / 60 ))
132
# Seconds left over after removing the whole minutes.
cputime_sec=$(( $maxcputime - $cputime_min * 60 ))
133
# Emit the limit as "<minutes>:<seconds>" in an #SBATCH -t directive.
echo "#SBATCH -t ${cputime_min}:${cputime_sec}" >> $LRMS_JOB_SCRIPT
136
if [ -z "$joboption_walltime" ] ; then
137
if [ ! -z "$joboption_cputime" ] ; then
138
# Set walltime for backward compatibility or incomplete requests
139
joboption_walltime=$(( $maxcputime * $walltime_ratio ))
143
if [ ! -z "$joboption_walltime" ] ; then
144
if [ $joboption_walltime -lt 0 ] ; then
147
maxwalltime="$joboption_walltime"
148
walltime_min=$(( $maxwalltime / 60 ))
149
walltime_sec=$(( $maxwalltime - $walltime_min * 60 ))
150
echo "#SBATCH -t ${walltime_min}:${walltime_sec}" >> $LRMS_JOB_SCRIPT
153
##############################################################
154
# Requested memory (mb)
155
##############################################################
159
if [ ! -z "$joboption_memory" ] ; then
160
echo "#SBATCH --mem-per-cpu=${joboption_memory}" >> $LRMS_JOB_SCRIPT
163
echo "" >> $LRMS_JOB_SCRIPT
164
echo "# Overide umask of execution node (sometime values are really strange)" >> $LRMS_JOB_SCRIPT
165
echo "umask 077" >> $LRMS_JOB_SCRIPT
166
echo " " >> $LRMS_JOB_SCRIPT
168
sourcewithargs_jobscript
170
##############################################################
171
# Add environment variables
172
##############################################################
175
##############################################################
176
# Check for existence of executable,
177
# there is no sense to check for executable if files are
178
# downloaded directly to computing node
179
##############################################################
180
if [ -z "${joboption_arg_0}" ] ; then
181
echo 'Executable is not specified' 1>&2
182
rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
183
echo "Submission: Job description error.">>"$failures_file"
187
#######################################################################
188
# copy information useful for transferring files to/from node directly
189
#######################################################################
190
if [ "$joboption_localtransfer" = 'yes' ] ; then
194
######################################################################
195
# Adjust working directory for tweaky nodes
196
# RUNTIME_GRIDAREA_DIR should be defined by external means on nodes
197
######################################################################
198
if [ ! -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
201
echo "RUNTIME_JOB_DIR=$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid" >> $LRMS_JOB_SCRIPT
202
echo "RUNTIME_JOB_DIAG=$RUNTIME_LOCAL_SCRATCH_DIR/${joboption_gridid}.diag" >> $LRMS_JOB_SCRIPT
203
# Compute each standard stream path relative to the session directory by
# stripping a leading "${joboption_directory}" plus any slashes that follow it.
# A path that comes back unchanged did not start with that prefix.
# NOTE(review): joboption_directory is interpolated into the sed pattern
# unescaped; a '#' or a regex metacharacter in the path would change the
# match — confirm that session directory names are constrained.
RUNTIME_STDIN_REL=`echo "${joboption_stdin}" | sed "s#^${joboption_directory}/*##"`
204
RUNTIME_STDOUT_REL=`echo "${joboption_stdout}" | sed "s#^${joboption_directory}/*##"`
205
RUNTIME_STDERR_REL=`echo "${joboption_stderr}" | sed "s#^${joboption_directory}/*##"`
206
if [ "$RUNTIME_STDIN_REL" = "${joboption_stdin}" ] ; then
207
echo "RUNTIME_JOB_STDIN=\"${joboption_stdin}\"" >> $LRMS_JOB_SCRIPT
209
echo "RUNTIME_JOB_STDIN=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDIN_REL\"" >> $LRMS_JOB_SCRIPT
211
if [ "$RUNTIME_STDOUT_REL" = "${joboption_stdout}" ] ; then
212
echo "RUNTIME_JOB_STDOUT=\"${joboption_stdout}\"" >> $LRMS_JOB_SCRIPT
214
echo "RUNTIME_JOB_STDOUT=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDOUT_REL\"" >> $LRMS_JOB_SCRIPT
216
if [ "$RUNTIME_STDERR_REL" = "${joboption_stderr}" ] ; then
217
echo "RUNTIME_JOB_STDERR=\"${joboption_stderr}\"" >> $LRMS_JOB_SCRIPT
219
echo "RUNTIME_JOB_STDERR=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDERR_REL\"" >> $LRMS_JOB_SCRIPT
223
##############################################################
224
# Add std... to job arguments
225
##############################################################
228
##############################################################
229
# Move files to local working directory (job is done on node only)
230
# RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
231
##############################################################
234
echo "" >> $LRMS_JOB_SCRIPT
235
echo "RESULT=0" >> $LRMS_JOB_SCRIPT
236
echo "" >> $LRMS_JOB_SCRIPT
239
#####################################################
240
# Download input files
241
####################################################
244
##############################################################
245
# Skip execution if something already failed
246
##############################################################
247
echo "if [ \"\$RESULT\" = '0' ] ; then" >> $LRMS_JOB_SCRIPT
249
##############################################################
250
# Runtime configuration at computing node
251
##############################################################
254
##############################################################
256
##############################################################
257
echo "echo \"runtimeenvironments=\$runtimeenvironments\" >> \"\$RUNTIME_JOB_DIAG\"" >> $LRMS_JOB_SCRIPT
258
cat >> $LRMS_JOB_SCRIPT <<'EOSCR'
259
if [ ! "X$SLURM_NODEFILE" = 'X' ] ; then
260
if [ -r "$SLURM_NODEFILE" ] ; then
261
cat "$SLURM_NODEFILE" | sed 's/\(.*\)/nodename=\1/' >> "$RUNTIME_JOB_DIAG"
268
##############################################################
269
# Check intermediate result again
270
##############################################################
271
echo "if [ \"\$RESULT\" = '0' ] ; then" >> $LRMS_JOB_SCRIPT
273
##############################################################
275
##############################################################
278
##############################################################
279
# End of RESULT checks
280
##############################################################
281
echo "fi" >> $LRMS_JOB_SCRIPT
282
echo "fi" >> $LRMS_JOB_SCRIPT
284
##############################################################
285
# Runtime (post)configuration at computing node
286
##############################################################
289
#####################################################
290
# Upload output files
291
####################################################
292
if [ "$joboption_localtransfer" = 'yes' ] ; then
295
# There is no sense to keep trash till GM runs uploader
296
echo 'if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] ; then' >> $LRMS_JOB_SCRIPT
297
# Delete all files except listed in job.#.output
298
echo ' find ./ -type l -exec rm -f "{}" ";"' >> $LRMS_JOB_SCRIPT
299
echo ' find ./ -type f -exec chmod u+w "{}" ";"' >> $LRMS_JOB_SCRIPT
301
if [ -f "$joboption_controldir/job.$joboption_gridid.output" ] ; then
302
cat "$joboption_controldir/job.$joboption_gridid.output" | \
303
# remove leading backslashes, if any
305
# backslashes and spaces are escaped with a backslash in job.*.output. The
306
# shell built-in read undoes this escaping.
307
while read name rest; do
309
# make it safe for shell by replacing single quotes with '\''
310
name=`printf "%s" "$name"|sed "s/'/'\\\\\\''/g"`;
312
# protect from deleting output files including those in the dynamic list
313
if [ "${name#@}" != "$name" ]; then # Does $name start with a @ ?
316
echo " dynlist='$dynlist'" >> $LRMS_JOB_SCRIPT
317
cat >> $LRMS_JOB_SCRIPT <<'EOSCR'
318
chmod -R u-w "./$dynlist" 2>/dev/null
319
cat "./$dynlist" | while read name rest; do
320
chmod -R u-w "./$name" 2>/dev/null
325
echo " chmod -R u-w \"\$RUNTIME_JOB_DIR\"/'$name' 2>/dev/null" >> $LRMS_JOB_SCRIPT
330
# Write the cleanup command that removes every regular file which is still
# owner-writable at this point in the generated job script.
# "-perm -200" (all listed bits set) is equivalent to the former "-perm +200"
# for a single permission bit, is specified by POSIX, and is accepted by
# current GNU findutils, which reject the obsolete "+mode" form.
echo ' find ./ -type f -perm -200 -exec rm -f "{}" ";"' >> $LRMS_JOB_SCRIPT
331
echo ' find ./ -type f -exec chmod u+w "{}" ";"' >> $LRMS_JOB_SCRIPT
332
echo 'fi' >> $LRMS_JOB_SCRIPT
334
echo "" >> $LRMS_JOB_SCRIPT
336
##############################################################
337
# Move files back to session directory (job is done on node only)
338
# RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
339
# !!!!!!!!!!!!!!!!!!! would be better to know the names of files !!!!!!!!!!!
340
##############################################################
341
move_files_to_frontend
343
#######################################
345
#######################################
346
echo "SLURM job script built" 1>&2
347
# Execute sbatch command
348
cd "$joboption_directory"
349
echo "SLURM script follows:" 1>&2
350
echo "-------------------------------------------------------------------" 1>&2
351
cat "$LRMS_JOB_SCRIPT" 1>&2
352
echo "-------------------------------------------------------------------" 1>&2
356
while [ "$SLURM_TRIES" -lt '10' ] ; do
358
# Unset all environment variables before calling sbatch. Otherwise
359
# SLURM will forward them to the job and leak information about
361
# Only unset lines with assignments.
362
# Risks unsetting variables in sub assignments, but this is likely harmless.
363
# TODO: Maybe we only should unset $ARC_*, $CONFIG_*, $GLOBUS_* etc?
364
# Run sbatch in a subshell after unsetting every exported variable except
# LRMS_JOB_SCRIPT; stdout and stderr of the subshell are captured in
# LRMS_JOB_OUT and LRMS_JOB_ERR.
# The name pattern must accept '_' : without it, names such as ARC_CONFIG or
# GLOBUS_LOCATION never match and are passed on to the job unchanged.
(for i in $(env | grep '^[A-Za-z_][A-Za-z0-9_]*=' | grep -v "LRMS_JOB_SCRIPT" | cut -d= -f1); do unset "$i"; done
${sbatch} "$LRMS_JOB_SCRIPT") 1>"$LRMS_JOB_OUT" 2>"$LRMS_JOB_ERR"
367
if [ "$SLURM_RESULT" -eq '0' ] ; then break ; fi
368
if [ "$SLURM_RESULT" -eq '198' ] ; then
369
echo "Waiting for queue to decrease" 1>&2
374
grep 'maximum number of jobs' "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
375
if [ $? -eq '0' ] ; then
376
echo "Waiting for queue to decrease" 1>&2
381
# A rare SLURM error, but may cause chaos in the information/accounting system
382
grep 'unable to accept job' "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
383
if [ $? -eq '0' ] ; then
384
echo "Waiting for queue to decrease" 1>&2
389
SLURM_TRIES=$(( $SLURM_TRIES + 1 ))
392
if [ $SLURM_RESULT -eq '0' ] ; then
394
#TODO test what happens when the jobqueue is full or when the slurm ctld is not responding
395
# SLURM 1.x and 2.2.x outputs the jobid into STDERR and STDOUT respectively. Concat them,
396
# and let sed sort it out. From the exit code we know that the job was submitted, so this
397
# is safe. Ulf Tigerstedt <tigerste@csc.fi> 1.5.2011
398
job_id=`cat $LRMS_JOB_OUT $LRMS_JOB_ERR |sed -e 's/^\(sbatch: \)\{0,1\}Submitted batch job \([0-9]*\)$/\2/'`
399
if [ "${job_id}" = "" ] ; then
400
echo "job *NOT* submitted successfully!" 1>&2
401
echo "failed getting the slurm jobid for the job!" 1>&2
402
echo "Submission: Local submission client behaved unexpectedly.">>"$failures_file"
404
echo "joboption_jobid=$job_id" >> $arg_file
405
echo "job submitted successfully!" 1>&2
406
echo "local job id: $job_id" 1>&2
407
# Remove temporary job script file
408
rm -f $LRMS_JOB_SCRIPT $LRMS_JOB_OUT $LRMS_JOB_ERR
409
echo "----- exiting submit_slurm_job -----" 1>&2
414
echo "job *NOT* submitted successfully!" 1>&2
415
echo "got error code from sbatch: $SLURM_RESULT !" 1>&2
416
echo "Submission: Local submission client failed.">>"$failures_file"
418
echo "Output is:" 1>&2
419
cat $LRMS_JOB_OUT 1>&2
420
# Header for the captured sbatch stderr; sent to stderr so it stays next to
# the output it labels, like the other diagnostics in this failure report.
echo "Error output is:" 1>&2
421
cat $LRMS_JOB_ERR 1>&2
422
rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
423
echo "----- exiting submit_slurm_job -----" 1>&2