~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid

« back to all changes in this revision

Viewing changes to src/plugins/select/bluegene/plugin/sfree.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release 
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient
* Added Default-Start for runlevel 2 and 4 and $remote_fs requirement in
  init.d scripts (Closes: #541252)
* Postinst checks for wrong runlevels 2 and 4 links
* Upgraded to Standards-Version 3.8.3
* Add lintian overrides for missing slurm-llnl-configurator.html in doc
  base registration
* Modified postrm scripts to ignore pkill return value in order to avoid
  postrm failure when no slurm process is running
* Checking for slurmctld.pid before cancelling running and pending
  jobs during package removal 

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
/*****************************************************************************\
2
2
 *  sfree.c - free specified block or all blocks.
3
 
 *  $Id: sfree.c 16357 2009-01-30 18:05:07Z da $
 
3
 *  $Id: sfree.c 18185 2009-07-17 19:02:22Z da $
4
4
 *****************************************************************************
5
5
 *  Copyright (C) 2004 The Regents of the University of California.
6
6
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7
7
 *  Written by Danny Auble <da@llnl.gov>
8
8
 *
9
 
 *  LLNL-CODE-402394.
 
9
 *  CODE-OCEC-09-009. All rights reserved.
10
10
 *  
11
11
 *  This file is part of SLURM, a resource management program.
12
 
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 
12
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 
13
 *  Please also read the included file: DISCLAIMER.
13
14
 *  
14
15
 *  SLURM is free software; you can redistribute it and/or modify it under
15
16
 *  the terms of the GNU General Public License as published by the Free
39
40
 
40
41
#include "sfree.h"
41
42
 
42
 
#define MAX_POLL_RETRIES    110
 
43
#define MAX_POLL_RETRIES    220
43
44
#define POLL_INTERVAL        3
44
45
#define MAX_PTHREAD_RETRIES  1
45
46
 
292
293
                                if(rc == PARTITION_NOT_FOUND) {
293
294
                                        info("block %s is not found");
294
295
                                        break;
 
296
                                } else if(rc == INCOMPATIBLE_STATE) {
 
297
                                        debug2("bridge_destroy_partition"
 
298
                                               "(%s): %s State = %d",
 
299
                                               delete_record->bg_block_id, 
 
300
                                               _bg_err_str(rc), 
 
301
                                               delete_record->state);
 
302
                                } else {
 
303
                                        error("bridge_destroy_block(%s): %s",
 
304
                                              delete_record->bg_block_id,
 
305
                                              _bg_err_str(rc));
295
306
                                }
296
 
                                error("bridge_destroy_block(%s): %s",
297
 
                                      delete_record->bg_block_id,
298
 
                                      _bg_err_str(rc));
299
307
                        }
300
308
#else
301
309
                        bg_record->state = RM_PARTITION_FREE;   
519
527
/* Kill a job and remove its record from MMCS */
520
528
static int _remove_job(db_job_id_t job_id)
521
529
{
522
 
        int i, rc;
 
530
        int rc, count = 0;
523
531
        rm_job_t *job_rec = NULL;
524
532
        rm_job_state_t job_state;
525
533
 
526
534
        info("removing job %d from MMCS", job_id);
527
 
        for (i=0; i<MAX_POLL_RETRIES; i++) {
528
 
                if (i > 0)
 
535
        while(1) {
 
536
                if (count)
529
537
                        sleep(POLL_INTERVAL);
 
538
                count++;
530
539
 
531
540
                /* Find the job */
532
541
                if ((rc = bridge_get_job(job_id, &job_rec)) != STATUS_OK) {
560
569
                /* check the state and process accordingly */
561
570
                if(job_state == RM_JOB_TERMINATED)
562
571
                        return STATUS_OK;
563
 
                else if(job_state == RM_JOB_DYING)
 
572
                else if(job_state == RM_JOB_DYING) {
 
573
                        if(count > MAX_POLL_RETRIES) 
 
574
                                error("Job %d isn't dying, trying for "
 
575
                                      "%d seconds", job_id, 
 
576
                                      count*POLL_INTERVAL);
564
577
                        continue;
565
 
                else if(job_state == RM_JOB_ERROR) {
 
578
                } else if(job_state == RM_JOB_ERROR) {
566
579
                        error("job %d is in a error state.", job_id);
567
580
                        
568
581
                        //free_bg_block();
569
582
                        return STATUS_OK;
570
583
                }
571
584
 
572
 
                (void) bridge_signal_job(job_id, SIGKILL);
573
 
                rc = bridge_cancel_job(job_id);
 
585
                /* we have been told the next 2 lines do the same
 
586
                 * thing, but I don't believe it to be true.  In most
 
587
                 * cases when you do a signal of SIGTERM the mpirun
 
588
                 * process gets killed with a SIGTERM.  In the case of
 
589
                 * bridge_cancel_job it always gets killed with a
 
590
                 * SIGKILL.  From IBM's point of view that is a bad
 
591
                 * deally, so we are going to use signal ;).
 
592
                 */
 
593
 
 
594
//               rc = bridge_cancel_job(job_id);
 
595
                 rc = bridge_signal_job(job_id, SIGTERM);
 
596
 
574
597
                if (rc != STATUS_OK) {
575
598
                        if (rc == JOB_NOT_FOUND) {
576
599
                                debug("job %d removed from MMCS", job_id);