1
1
/*****************************************************************************\
2
2
* sfree.c - free specified block or all blocks.
3
* $Id: sfree.c 16357 2009-01-30 18:05:07Z da $
3
* $Id: sfree.c 18185 2009-07-17 19:02:22Z da $
4
4
*****************************************************************************
5
5
* Copyright (C) 2004 The Regents of the University of California.
6
6
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7
7
* Written by Danny Auble <da@llnl.gov>
9
* CODE-OCEC-09-009. All rights reserved.
11
11
* This file is part of SLURM, a resource management program.
12
* For details, see <http://www.llnl.gov/linux/slurm/>.
12
* For details, see <https://computing.llnl.gov/linux/slurm/>.
13
* Please also read the included file: DISCLAIMER.
14
15
* SLURM is free software; you can redistribute it and/or modify it under
15
16
* the terms of the GNU General Public License as published by the Free
292
293
if(rc == PARTITION_NOT_FOUND) {
293
294
info("block %s is not found");
296
} else if(rc == INCOMPATIBLE_STATE) {
297
debug2("bridge_destroy_partition"
298
"(%s): %s State = %d",
299
delete_record->bg_block_id,
301
delete_record->state);
303
error("bridge_destroy_block(%s): %s",
304
delete_record->bg_block_id,
296
error("bridge_destroy_block(%s): %s",
297
delete_record->bg_block_id,
301
309
bg_record->state = RM_PARTITION_FREE;
519
527
/* Kill a job and remove its record from MMCS */
520
528
static int _remove_job(db_job_id_t job_id)
523
531
rm_job_t *job_rec = NULL;
524
532
rm_job_state_t job_state;
526
534
info("removing job %d from MMCS", job_id);
527
for (i=0; i<MAX_POLL_RETRIES; i++) {
529
537
sleep(POLL_INTERVAL);
531
540
/* Find the job */
532
541
if ((rc = bridge_get_job(job_id, &job_rec)) != STATUS_OK) {
560
569
/* check the state and process accordingly */
561
570
if(job_state == RM_JOB_TERMINATED)
562
571
return STATUS_OK;
563
else if(job_state == RM_JOB_DYING)
572
else if(job_state == RM_JOB_DYING) {
573
if(count > MAX_POLL_RETRIES)
574
error("Job %d isn't dying, trying for "
575
"%d seconds", job_id,
576
count*POLL_INTERVAL);
565
else if(job_state == RM_JOB_ERROR) {
578
} else if(job_state == RM_JOB_ERROR) {
566
579
error("job %d is in a error state.", job_id);
568
581
//free_bg_block();
569
582
return STATUS_OK;
572
(void) bridge_signal_job(job_id, SIGKILL);
573
rc = bridge_cancel_job(job_id);
585
/* we have been told the next 2 lines do the same
586
* thing, but I don't believe it to be true. In most
587
* cases when you do a signal of SIGTERM the mpirun
588
* process gets killed with a SIGTERM. In the case of
589
* bridge_cancel_job it always gets killed with a
590
* SIGKILL. From IBM's point of view that is a bad
591
* deal, so we are going to use signal ;).
594
// rc = bridge_cancel_job(job_id);
595
rc = bridge_signal_job(job_id, SIGTERM);
574
597
if (rc != STATUS_OK) {
575
598
if (rc == JOB_NOT_FOUND) {
576
599
debug("job %d removed from MMCS", job_id);