5
##########################################################################
7
# The Contents of this file are made available subject to the terms of
8
# the Sun Industry Standards Source License Version 1.2
10
# Sun Microsystems Inc., March, 2001
13
# Sun Industry Standards Source License Version 1.2
14
# =================================================
15
# The contents of this file are subject to the Sun Industry Standards
16
# Source License Version 1.2 (the "License"); You may not use this file
17
# except in compliance with the License. You may obtain a copy of the
18
# License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
20
# Software provided under this License is provided on an "AS IS" basis,
21
# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
22
# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
23
# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
24
# See the License for the specific provisions governing your rights and
25
# obligations concerning the Software.
27
# The Initial Developer of the Original Code is: Sun Microsystems, Inc.
29
# Copyright: 2001 by Sun Microsystems, Inc.
31
# All Rights Reserved.
33
##########################################################################
36
# Relax shell strictness for the rest of the script: nounset (-u) and
# errexit (-e) are both switched off, so an unset variable expands to an
# empty string and a failing command does not abort the script.
# NOTE(review): the bare-number lines in this file (such as the "36" above)
# look like line-number residue from an extraction step -- confirm against
# the original file.
set +u # don't treat unset parameters as an error
37
set +e # don't exit on bad command status
41
# Make sure the checkpoint log exists: when $ckpt_dir/ckpt.log is not a
# regular file, create it and set its mode to 666 (read/write for owner,
# group and others).
# NOTE(review): $ckpt_dir is not assigned in the visible part of the script
# and the closing `fi` of this test is not visible here -- confirm both
# against the complete file.
if [ ! -f $ckpt_dir/ckpt.log ]; then
42
touch $ckpt_dir/ckpt.log
43
chmod 666 $ckpt_dir/ckpt.log
46
# Build the job identifiers used for file and directory names below.
# The test is true when SGE_TASK_ID is the literal string "undefined" or is
# empty. The two assignments visible here set both jobid and jobdir to
# "$JOB_ID.$SGE_TASK_ID".
# NOTE(review): the then/else/fi structure around these assignments is not
# visible here, so which branch they belong to cannot be told from this
# view -- presumably they are the array-task case; confirm against the
# complete file.
if [ "$SGE_TASK_ID" = "undefined" -o "$SGE_TASK_ID" = "" ]; then
50
jobid=$JOB_ID.$SGE_TASK_ID
51
jobdir=$JOB_ID.$SGE_TASK_ID
55
# create temp directory for holding checkpoint info
# NOTE(review): only the directory path is computed in the lines visible
# here; no directory-creation command is shown -- confirm against the
# complete file.
58
tmpdir=$ckpt_dir/ckpt.$jobid
66
# Path of a per-job log file inside $tmpdir.
# NOTE(review): F is not referenced again in the visible lines.
F=$tmpdir/checkpoint.log
70
# Print a banner to stdout: a separator line, then this script's basename
# and the current date/time.
echo -------------------------------------------------------------
71
echo `basename $0` called at `date`
76
# Get original job_pid and osjobid
# NOTE(review): the commands that populate $job_pid and $osjobid are not
# visible here; with `set +u` in effect they expand to empty strings if
# unset -- confirm against the complete file.
82
# restore the O.S. job identifier to the jobs directory
83
# NOTE: do we need to restore osjobid?
84
# NOTE: do we need to restore job_pid?
85
# job_dir is resolved relative to the directory that holds $JOB_SCRIPT:
#   <dirname of $JOB_SCRIPT>/../active_jobs/$jobdir
job_dir=`dirname $JOB_SCRIPT`/../active_jobs/$jobdir
86
# Report the saved values on stdout.
echo original job_pid=$job_pid
87
echo original osjobid=$osjobid
88
# Show the current job_pid and osjobid files in the job's directory.
ls -la $job_dir/job_pid
90
ls -la $job_dir/osjobid
92
# Writing the saved values back into the job directory is disabled
# (see the open NOTE questions above).
#echo $job_pid > $job_dir/job_pid
93
#echo $osjobid > $job_dir/osjobid
95
# Append a timestamped "restarting" record for this job to the checkpoint log.
echo `date +"%D %T"` Job $jobid "(job_pid=$job_pid, osjobid=$osjobid)" restarting >> $ckpt_dir/ckpt.log
98
# If a previous restart file (chkpnt_$jobid.save) exists, we assume that we
99
# died during a checkpoint and we should recover using that saved file:
# it is moved back to chkpnt_$jobid, the name used by the commands below.
# Both names are relative, so they are resolved against the current
# working directory.
# NOTE(review): the closing `fi` of this test is not visible here.
103
if [ -f chkpnt_$jobid.save ]; then
104
mv chkpnt_$jobid.save chkpnt_$jobid
108
# Register the restart file, just in case it's not registered. This could
109
# happen if the host died during the last checkpoint.
112
# The command line is echoed to stdout first, then executed.
echo /usr/bin/rsvresf chkpnt_$jobid
113
/usr/bin/rsvresf chkpnt_$jobid
116
# Now restart the job and wait for it to complete
119
# The command line is echoed to stdout first, then executed.
echo /usr/bin/restart -f -i -w chkpnt_$jobid
120
/usr/bin/restart -f -i -w chkpnt_$jobid
122
# NOTE(review): $exit_status is printed here, but its assignment is not
# visible in these lines -- presumably it is taken from $? right after the
# restart command; confirm against the complete file.
echo Exit status of restart command: $exit_status
124
# Now be careful: The restart command is the parent process of the restarted
125
# job. Grid Engine is the parent process of the restart command.
126
# If the job was killed (probably due to a migration request), we need to
127
# tell our parent that by killing ourselves. Grid Engine will also detect an
128
# exit status > 128 analogous to a KILL
#
# The logic below, which would derive the job's status from accounting
# data and map it to an exit status above 128, is commented out and
# therefore never runs.
130
#if [ $exit_status = 1 ]
132
# jstat=`/bin/acctcom -j $job_id -b -p -Z -f -v /usr/adm/acct/day/pacct | $SGE_ROOT/ckpt/cray_parse_job_status $2`
133
# echo "jobstatus $job_id $2 = $jstat"
134
# if [ "$jstat" = "" ]
137
# elif [ "$jstat" = "0" ]
141
# exit_status=`expr $jstat + 128`
145
# If killing ourselves didn't help or the exit_status was < 128 exit
146
# with the exit status of our child
# NOTE(review): no `exit` statement is visible in these lines -- confirm
# against the complete file that the script really exits with $exit_status.
148
# Append a timestamped "exiting" record, including the exit status, to the
# checkpoint log.
echo `date +"%D %T"` Job $jobid "(job_pid=$job_pid, osjobid=$osjobid) exiting, status=$exit_status" >> $ckpt_dir/ckpt.log
150
# Report the same status on stdout.
echo Exiting with exit status: $exit_status