5
##########################################################################
7
# The Contents of this file are made available subject to the terms of
8
# the Sun Industry Standards Source License Version 1.2
10
# Sun Microsystems Inc., March, 2001
13
# Sun Industry Standards Source License Version 1.2
14
# =================================================
15
# The contents of this file are subject to the Sun Industry Standards
16
# Source License Version 1.2 (the "License"); You may not use this file
17
# except in compliance with the License. You may obtain a copy of the
18
# License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
20
# Software provided under this License is provided on an "AS IS" basis,
21
# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
22
# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
23
# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
24
# See the License for the specific provisions governing your rights and
25
# obligations concerning the Software.
27
# The Initial Developer of the Original Code is: Sun Microsystems, Inc.
29
# Copyright: 2001 by Sun Microsystems, Inc.
31
# All Rights Reserved.
33
##########################################################################
38
# Create the shared checkpoint log $ckpt_dir/ckpt.log if it does not exist yet
# and set its mode to 666 so that every user can read and append to it.
# NOTE(review): $ckpt_dir is not assigned anywhere in the visible lines --
# presumably set earlier in the script (from an argument?); confirm.
# NOTE(review): the bare numeric lines interleaved with the code (39, 40, ...)
# look like line-number residue from an extraction step, and no closing `fi`
# is visible for this `if` -- verify against the pristine file.
if [ ! -f $ckpt_dir/ckpt.log ]; then
39
touch $ckpt_dir/ckpt.log
40
chmod 666 $ckpt_dir/ckpt.log
43
# create temp directory for holding checkpoint info
# tmpdir is the per-job path $ckpt_dir/ckpt.<$1>; $1 is the value logged
# further down as "Job $1".
# NOTE(review): only the path is assigned in the visible lines; no mkdir/cd is
# shown here -- presumably those lines were lost; confirm.
45
tmpdir=$ckpt_dir/ckpt.$1
51
# F is the per-job log file inside tmpdir; everything below appends to it.
F=$tmpdir/checkpoint.log
54
# Log header: a separator line, the script name with the current date, the
# identity of the caller (output of `id`) and all positional arguments.
echo ------------------------------------------------------------- >> $F 2>&1
55
echo `basename $0` called at `date` >> $F 2>&1
56
echo called by: `id` >> $F 2>&1
57
echo with args: $* >> $F 2>&1
59
# restore the O.S. job identifier to the jobs directory
# job_dir is derived from the directory of $JOB_SCRIPT: ../active_jobs/<$1>
# relative to it. $JOB_SCRIPT is read from the environment of this script.
61
job_dir=`dirname $JOB_SCRIPT`/../active_jobs/$1
63
# Write the O.S. job id into the osjobid file of the job directory.
# NOTE(review): $job_id is not assigned in the visible lines -- presumably it
# is read back from a file saved at checkpoint time; confirm. If it is empty,
# an empty line is written to osjobid.
echo $job_id > $job_dir/osjobid
65
# Record the restart (timestamp, job, O.S. job id) in the shared ckpt.log.
echo `date +"%D %T"` Job $1 "(osjobid=$job_id)" restarting >> $ckpt_dir/ckpt.log
67
# Run the restart command on the checkpoint named chkpnt_<$1>; its stdout and
# stderr are appended to the per-job log $F.
/usr/bin/restart -w -f chkpnt_$1 >> $F 2>&1
69
# Now be careful: The restart command is the parent process of the restarted
69
# job. SGE is the parent process of the restart command.
70
# If the job was killed (probably due to a migration request), we need to
71
# tell our parent that by killing ourselves. SGE will also detect an
72
# exit status > 128 analogous to a KILL
73
# NOTE(review): $exit_status is not assigned in the visible lines --
# presumably `exit_status=$?` directly after the restart command; confirm.
76
echo Exit status of restart command: $exit_status >> $F 2>&1
78
# When the restart command reported 1, look up the job's status in the
# process accounting file: acctcom output for O.S. job $job_id is piped
# through $SGE_ROOT/ckpt/cray_parse_job_status (called with $2) and the
# result is stored in jstat and logged.
# NOTE(review): the `then`, `else` and `fi` keywords and some branch bodies of
# this if/elif chain are not visible here (lost in extraction?); verify the
# control flow against the pristine file before changing anything.
if [ $exit_status = 1 ]
80
jstat=`/bin/acctcom -j $job_id -b -p -Z -f -v /usr/adm/acct/day/pacct | $SGE_ROOT/ckpt/cray_parse_job_status $2`
81
echo "jobstatus $job_id $2 = $jstat" >> $F 2>&1
85
# Branch for a job status of "0"; its body is not visible in this view.
elif [ "$jstat" = "0" ]
89
# Map the job status to an exit status above 128 (jstat + 128), which the
# comment above describes as being treated by SGE like a KILL.
exit_status=`expr $jstat + 128`
93
# If killing ourselves didn't help or the exit_status was < 128 exit
94
# with the exit status of our child
96
# Record the final status in the shared ckpt.log and in the per-job log.
# NOTE(review): no `exit $exit_status` is visible after these lines --
# presumably it follows; confirm.
echo `date +"%D %T"` Job $1 "(osjobid=$job_id) exiting, status=$exit_status" >> $ckpt_dir/ckpt.log
98
echo Exiting with exit status: $exit_status >> $F 2>&1