~ubuntu-branches/ubuntu/utopic/gridengine/utopic

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/bin/sh
#
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

# Run permissively: expanding an unset parameter is not an error (+u) and
# a command that returns a non-zero status does not abort the script (+e).
set +eu

# The third positional argument is the checkpoint directory.
ckpt_dir=$3

# Create the shared ckpt.log inside the checkpoint directory if it does not
# exist yet, readable and writable by everyone (mode 666).
# The path is quoted so that a directory name containing whitespace or glob
# characters reaches test/touch/chmod as a single, unmodified argument.
if [ ! -f "$ckpt_dir/ckpt.log" ]; then
   touch "$ckpt_dir/ckpt.log"
   chmod 666 "$ckpt_dir/ckpt.log"
fi

# Derive the two identifiers used to build per-job paths:
#   jobid  - JOB_ID alone, or JOB_ID.SGE_TASK_ID when a task id is present
#   jobdir - JOB_ID.1,     or JOB_ID.SGE_TASK_ID when a task id is present
# An empty SGE_TASK_ID or the literal string "undefined" counts as "no task
# id" (presumably the non-array-job case -- confirm against Grid Engine docs).
case "$SGE_TASK_ID" in
   undefined|"")
      jobid=$JOB_ID
      jobdir=$JOB_ID.1
      ;;
   *)
      jobid=$JOB_ID.$SGE_TASK_ID
      jobdir=$jobid
      ;;
esac

#
# create temp directory for holding checkpoint info
#

# Per-job directory ckpt.<jobid> inside the checkpoint directory.
tmpdir=$ckpt_dir/ckpt.$jobid
# Quoted so a path with whitespace or glob characters stays one argument.
mkdir -p "$tmpdir"
# The rest of the script opens files by relative name (job_pid, osjobid,
# chkpnt_<jobid>), so a failed cd must not go unnoticed. Only warn: the
# script deliberately runs with "set +e" and keeps going on errors.
cd "$tmpdir" || echo "$0: cannot cd to $tmpdir" >&2

#
# create log file
#

# Per-job log file inside the job's checkpoint directory.
F=$tmpdir/checkpoint.log
touch "$F"
# From here on, everything this script (and the commands it starts) writes
# to stdout or stderr is appended to the log file. The target is quoted so
# a path containing whitespace is not split or rejected as ambiguous.
exec >> "$F" 2>&1

# Banner for this invocation: separator, script name with timestamp, the
# identity the script runs as, and the arguments it was given.
echo -------------------------------------------------------------
echo `basename "$0"` called at `date`
echo called by: `id`
# "$*" is quoted so the arguments are logged as given: unquoted, an argument
# such as "*" would be replaced by file names and inner whitespace collapsed.
echo "with args: $*"

#
# Get original job_pid and osjobid
#

# Read the identifiers saved with the checkpoint: files job_pid and osjobid,
# looked up relative to the current working directory.
job_pid=`cat job_pid`
osjobid=`cat osjobid`

# Locate this job's active_jobs directory relative to the job script and log
# the job_pid/osjobid files found there next to the saved values. Nothing is
# written back: the restore itself is still commented out below.
# NOTE: do we need to restore osjobid?
# NOTE: do we need to restore job_pid?
# Paths are quoted so a spool location containing whitespace is handled.
job_dir=`dirname "$JOB_SCRIPT"`/../active_jobs/$jobdir
echo "original job_pid=$job_pid"
echo "original osjobid=$osjobid"
ls -la "$job_dir/job_pid"
cat "$job_dir/job_pid"
ls -la "$job_dir/osjobid"
cat "$job_dir/osjobid"
#echo $job_pid > $job_dir/job_pid
#echo $osjobid > $job_dir/osjobid

# Append a timestamped "restarting" record for this job to the shared
# ckpt.log in the checkpoint directory. The redirection target is quoted so
# a directory name containing whitespace does not break the redirection.
echo `date +"%D %T"` Job "$jobid" "(job_pid=$job_pid, osjobid=$osjobid)" restarting >> "$ckpt_dir/ckpt.log"

#
# If previous restart file exists, we assume that we
# died during a checkpoint and we should recover using
# this file
#

# A leftover chkpnt_<jobid>.save is taken to be the last good restart file
# (see the comment above), so move it back into place as chkpnt_<jobid>.
# File names are quoted so a job id containing whitespace or glob characters
# cannot split into several arguments or match unrelated files.
if [ -f "chkpnt_$jobid.save" ]; then
   mv "chkpnt_$jobid.save" "chkpnt_$jobid"
fi

#
# Register restart file, just in case it's not registered. This could
# happen if the host died during the last checkpoint
#

# Each command is echoed first so the output records exactly what was run.
# The restart file name is quoted so it is always passed as one argument.
echo /usr/bin/rsvresf "chkpnt_$jobid"
/usr/bin/rsvresf "chkpnt_$jobid"

#
# Now restart the job and wait for it to complete
#

echo /usr/bin/restart -f -i -w "chkpnt_$jobid"
/usr/bin/restart -f -i -w "chkpnt_$jobid"
# Capture $? immediately: any command run in between (even echo) would
# overwrite the status of the restart command.
exit_status=$?
echo "Exit status of restart command: $exit_status"

# Now be careful: The restart command is the parent process of the restarted
# job. Grid Engine is the parent process of the restart command.
# If the job was killed (probably due to a migration request), we need to
# tell our parent that by killing ourselves. Grid Engine will also detect an 
# exit status > 128 analogous to a KILL

#if [ $exit_status = 1 ]
#then
#   jstat=`/bin/acctcom -j $job_id -b -p -Z -f -v /usr/adm/acct/day/pacct | $SGE_ROOT/ckpt/cray_parse_job_status $2`
#   echo "jobstatus $job_id $2 = $jstat"
#   if [ "$jstat" = "" ]
#   then
#      exit_status=100
#   elif [ "$jstat" = "0" ]
#   then
#      exit_status=0
#   else
#      exit_status=`expr $jstat + 128`
#   fi
#fi

# If killing ourselves didn't help or the exit_status was < 128, exit
# with the exit status of our child

# Append a timestamped "exiting" record, including the final status, to the
# shared ckpt.log. The redirection target is quoted so a directory name
# containing whitespace does not break the redirection.
echo `date +"%D %T"` Job "$jobid" "(job_pid=$job_pid, osjobid=$osjobid) exiting, status=$exit_status" >> "$ckpt_dir/ckpt.log"

echo "Exiting with exit status: $exit_status"
# Hand the status of the restart command back to the caller. Left unquoted
# on purpose: should exit_status ever be empty this stays a plain "exit"
# instead of failing on an empty, non-numeric argument.
exit $exit_status