199
202
sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user,
202
if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
205
pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint];
207
if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
208
((pattr->at_flags & ATR_VFLAG_SET) &&
209
((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
210
(csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
211
(csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
205
214
/* have MOM attempt checkpointing */
215
pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;
216
224
pjob->ji_qs.ji_svrflags |=
217
JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT;
225
JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_FILE;
218
226
job_save(pjob, SAVEJOB_QUICK);
228
/* fill in log_buffer again, since relay_to_mom changed it */
230
sprintf(log_buffer, msg_jobholdset, pset, preq->rq_user,
219
233
LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
220
pjob->ji_qs.ji_jobid, log_buffer);
234
pjob->ji_qs.ji_jobid, log_buffer);
238
else if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
241
* This system is configured with BLCR checkpointing to be used,
242
* but this Running job does not have checkpointing enabled,
243
* so we reject the request
249
pjob->ji_qs.ji_jobid,
252
req_reject(PBSE_IVALREQ, 0, preq, NULL,
253
"job not held since checkpointing is expected but not enabled for job");
225
258
/* everything went well, may need to update the job state */
285
* req_checkpointjob - service the Checkpoint Job Request
289
void req_checkpointjob(
291
struct batch_request *preq)
298
if ((pjob = chk_job_request(preq->rq_ind.rq_manager.rq_objname, preq)) == NULL)
303
pattr = &pjob->ji_wattr[(int)JOB_ATR_checkpoint];
305
if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
306
((pattr->at_flags & ATR_VFLAG_SET) &&
307
((csv_find_string(pattr->at_val.at_str, "s") != NULL) ||
308
(csv_find_string(pattr->at_val.at_str, "c") != NULL) ||
309
(csv_find_string(pattr->at_val.at_str, "enabled") != NULL))))
311
/* have MOM attempt checkpointing */
313
if ((rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
314
preq, process_checkpoint_reply)) != 0)
316
req_reject(rc, 0, preq, NULL, NULL);
320
pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
321
job_save(pjob, SAVEJOB_QUICK);
322
LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
323
pjob->ji_qs.ji_jobid, log_buffer);
328
/* Job does not have checkpointing enabled, so reject the request */
333
pjob->ji_qs.ji_jobid,
336
req_reject(PBSE_IVALREQ, 0, preq, NULL, "job is not checkpointable");
338
} /* END req_checkpointjob() */
252
344
* req_releasejob - service the Release Job Request
254
346
* This request clears one or more holds on a job.
425
520
else if (preq->rq_reply.brp_code != 0)
523
rc = get_hold(&preq->rq_ind.rq_hold.rq_orig.rq_attr, &pset, &temphold);
527
rc = job_attr_def[(int)JOB_ATR_hold].at_set(&pjob->ji_wattr[(int)JOB_ATR_hold],
427
531
pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; /* reset it */
428
pjob->ji_modified = 1; /* indicate attributes changed */
533
pjob->ji_modified = 1; /* indicate attributes changed */
429
534
svr_evaljobstate(pjob, &newstate, &newsub, 0);
430
535
svr_setjobstate(pjob, newstate, newsub); /* saves job */
446
551
/* record that MOM has a checkpoint file */
448
/* Stupid PBS_CHKPT_MIGRATE is defined as zero therefore this code will never fire.
553
/* PBS_CHECKPOINT_MIGRATEABLE is defined as zero therefore this code will never fire.
449
554
* And if these flags are not set, start_exec will not try to run the job from
450
555
* the checkpoint image file.
453
pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHKPT;
558
pjob->ji_qs.ji_svrflags |= JOB_SVFLG_CHECKPOINT_FILE;
455
if (preq->rq_reply.brp_auxcode) /* chkpt can be moved */
560
if (preq->rq_reply.brp_auxcode) /* checkpoint can be moved */
457
pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
458
pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_ChkptMig;
562
pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_FILE;
563
pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHECKPOINT_MIGRATEABLE;
461
566
pjob->ji_modified = 1; /* indicate attributes changed */
463
568
svr_evaljobstate(pjob, &newstate, &newsub, 0);
464
569
svr_setjobstate(pjob, newstate, newsub); /* saves job */
466
account_record(PBS_ACCT_CHKPNT, pjob, (char *)0); /* note in accounting file */
571
account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed and held"); /* note in accounting file */
577
* process_checkpoint_reply
578
* called when a checkpoint request was sent to MOM and the answer
579
* is received. Completes the checkpoint request for running jobs.
582
static void process_checkpoint_reply(
584
struct work_task *pwt)
588
struct batch_request *preq;
590
svr_disconnect(pwt->wt_event); /* close connection to MOM */
592
preq = pwt->wt_parm1;
593
preq->rq_conn = preq->rq_orgconn; /* restore client socket */
595
if ((pjob = find_job(preq->rq_ind.rq_manager.rq_objname)) == (job *)0)
597
LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
598
preq->rq_ind.rq_manager.rq_objname,
600
req_reject(PBSE_UNKJOBID, 0, preq, NULL, msg_postmomnojob);
604
/* record that MOM has a checkpoint file */
606
account_record(PBS_ACCT_CHKPNT, pjob, "Checkpointed"); /* note in accounting file */