~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid

« back to all changes in this revision

Viewing changes to src/plugins/select/bluegene/plugin/bg_block_info.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release 
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient
* Added Default-Start for runlevel 2 and 4 and $remote_fs requirement in
  init.d scripts (Closes: #541252)
* Postinst checks for wrong runlevels 2 and 4 links
* Upgraded to Standards-Version 3.8.3
* Add lintian overrides for missing slurm-llnl-configurator.html in doc
  base registration
* Modified postrm scripts to ignore pkill return value in order to avoid
  postrm failure when no slurm process is running
* Checking for slurmctld.pid before cancelling running and pending
  jobs during package removal 

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
/*****************************************************************************\
2
2
 *  bg_block_info.c - bluegene block information from the db2 database.
3
3
 *
4
 
 *  $Id: bg_block_info.c 17202 2009-04-09 16:56:23Z da $
 
4
 *  $Id: bg_block_info.c 18147 2009-07-15 16:25:53Z da $
5
5
 *****************************************************************************
6
6
 *  Copyright (C) 2004-2006 The Regents of the University of California.
7
7
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8
8
 *  Written by Danny Auble <da@llnl.gov>
9
9
 *  
10
10
 *  This file is part of SLURM, a resource management program.
11
 
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 
11
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 
12
 *  Please also read the included file: DISCLAIMER.
12
13
 *  
13
14
 *  SLURM is free software; you can redistribute it and/or modify it under
14
15
 *  the terms of the GNU General Public License as published by the Free
90
91
        if(bg_record->modifying)
91
92
                return SLURM_SUCCESS;
92
93
        
93
 
        user_name = xstrdup(bg_slurm_user_name);
 
94
        user_name = xstrdup(bg_conf->slurm_user_name);
94
95
        if(remove_all_users(bg_record->bg_block_id, NULL) 
95
96
           == REMOVE_USER_ERR) {
96
97
                error("Something happened removing "
136
137
                bg_record->target_name = xstrdup(bg_record->user_name);
137
138
        }
138
139
 
139
 
        if(remove_from_bg_list(bg_job_block_list, bg_record) == SLURM_SUCCESS) 
 
140
        if(remove_from_bg_list(bg_lists->job_running, bg_record)
 
141
           == SLURM_SUCCESS) 
140
142
                num_unused_cpus += bg_record->cpu_cnt;                         
141
 
        remove_from_bg_list(bg_booted_block_list, bg_record);
 
143
        remove_from_bg_list(bg_lists->booted, bg_record);
142
144
 
143
145
        xfree(user_name);
144
146
                        
174
176
        rc = select_g_get_jobinfo(job_ptr->select_jobinfo,
175
177
                                  SELECT_DATA_BLOCK_ID, &block_id);
176
178
        if (rc == SLURM_SUCCESS) {
177
 
                bg_record = find_bg_record_in_list(bg_list, block_id);
 
179
                bg_record = find_bg_record_in_list(bg_lists->main, block_id);
178
180
                slurm_mutex_lock(&block_state_mutex);
179
181
                
180
182
                if(bg_record) {
189
191
                        else
190
192
                                rc = READY_JOB_ERROR;   /* try again */
191
193
                } else {
192
 
                        error("block_ready: block %s not in bg_list.",
 
194
                        error("block_ready: block %s not in bg_lists->main.",
193
195
                              block_id);
194
196
                        rc = READY_JOB_FATAL;   /* fatal error */
195
197
                }
213
215
        pack16((uint16_t)bg_record->conn_type, buffer);
214
216
#ifdef HAVE_BGL
215
217
        pack16((uint16_t)bg_record->node_use, buffer);  
216
 
        pack16((uint16_t)0, buffer);    
217
 
        pack16((uint16_t)0, buffer);    
218
218
#endif
219
219
        pack32((uint32_t)bg_record->node_cnt, buffer);
 
220
        pack32((uint32_t)bg_record->job_running, buffer);
220
221
        pack_bit_fmt(bg_record->bitmap, buffer);
221
222
        pack_bit_fmt(bg_record->ionode_bitmap, buffer);
222
223
#ifdef HAVE_BGL
248
249
        if(!kill_job_list)
249
250
                kill_job_list = list_create(_destroy_kill_struct);
250
251
 
251
 
        if(!bg_list) 
 
252
        if(!bg_lists->main) 
252
253
                return updated;
253
254
        
254
255
        slurm_mutex_lock(&block_state_mutex);
255
 
        itr = list_iterator_create(bg_list);
 
256
        itr = list_iterator_create(bg_lists->main);
256
257
        while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
257
258
                if(!bg_record->bg_block_id)
258
259
                        continue;
259
260
                name = bg_record->bg_block_id;
260
261
                if ((rc = bridge_get_block_info(name, &block_ptr)) 
261
262
                    != STATUS_OK) {
262
 
                        if(bluegene_layout_mode == LAYOUT_DYNAMIC) {
 
263
                        if(bg_conf->layout_mode == LAYOUT_DYNAMIC) {
263
264
                                switch(rc) {
264
265
                                case INCONSISTENT_DATA:
265
266
                                        debug2("got inconsistent data when "
301
302
                        updated = 1;
302
303
                }
303
304
#else
304
 
                if((bg_record->node_cnt < bluegene_bp_node_cnt) 
305
 
                   || (bluegene_bp_node_cnt == bluegene_nodecard_node_cnt)) {
 
305
                if((bg_record->node_cnt < bg_conf->bp_node_cnt) 
 
306
                   || (bg_conf->bp_node_cnt == bg_conf->nodecard_node_cnt)) {
306
307
                        char *mode = NULL;
307
308
                        uint16_t conn_type = SELECT_SMALL;
308
309
                        if ((rc = bridge_get_data(block_ptr,
404
405
                        else if(bg_record->state == RM_PARTITION_CONFIGURING) 
405
406
                                bg_record->boot_state = 1;
406
407
                        else if(bg_record->state == RM_PARTITION_FREE) {
407
 
                                if(remove_from_bg_list(bg_job_block_list, 
 
408
                                if(remove_from_bg_list(bg_lists->job_running, 
408
409
                                                       bg_record) 
409
410
                                   == SLURM_SUCCESS) {
410
411
                                        num_unused_cpus += bg_record->cpu_cnt;
411
412
                                }
412
 
                                remove_from_bg_list(bg_booted_block_list,
413
 
                                                    bg_record);
414
 
                        } 
 
413
                                remove_from_bg_list(bg_lists->booted,
 
414
                                                    bg_record);
 
415
                        } else if(bg_record->state == RM_PARTITION_ERROR) {
 
416
                                if(bg_record->boot_state == 1)
 
417
                                        error("Block %s in an error "
 
418
                                              "state while booting.",
 
419
                                              bg_record->bg_block_id);
 
420
                                else                                    
 
421
                                        error("Block %s in an error state.",
 
422
                                              bg_record->bg_block_id);
 
423
                                remove_from_bg_list(bg_lists->booted,
 
424
                                                    bg_record);
 
425
                                trigger_block_error();
 
426
                        }
415
427
                        updated = 1;
416
428
                        
417
429
                }
432
444
                                
433
445
                                break;
434
446
                        case RM_PARTITION_ERROR:
435
 
                                bg_record->boot_state = 0;
436
 
                                bg_record->boot_count = 0;
437
 
                                if(bg_record->job_running > NO_JOB_RUNNING) {
438
 
                                        error("Block %s in an error "
439
 
                                              "state while booting.  "
440
 
                                              "Failing job %u.",
441
 
                                              bg_record->bg_block_id,
442
 
                                              bg_record->job_running);
443
 
                                        freeit = xmalloc(
444
 
                                                sizeof(kill_job_struct_t));
445
 
                                        freeit->jobid = bg_record->job_running;
446
 
                                        list_push(kill_job_list, freeit);
447
 
                                        if(remove_from_bg_list(
448
 
                                                   bg_job_block_list, 
449
 
                                                   bg_record) 
450
 
                                           == SLURM_SUCCESS) {
451
 
                                                num_unused_cpus += 
452
 
                                                        bg_record->cpu_cnt;
453
 
                                        } 
454
 
                                } else 
455
 
                                        error("block %s in an error "
456
 
                                              "state while booting.",
457
 
                                              bg_record->bg_block_id);
458
 
                                remove_from_bg_list(bg_booted_block_list,
459
 
                                                    bg_record);
460
 
                                trigger_block_error();
 
447
                                /* If we get an error on boot that
 
448
                                 * means it is a transparent L3 error
 
449
                                 * and should be trying to fix
 
450
                                 * itself.  If this is the case we
 
451
                                 * just hang out waiting for the state
 
452
                                 * to go to free where we will try to
 
453
                                 * boot again below.
 
454
                                 */
461
455
                                break;
462
456
                        case RM_PARTITION_FREE:
463
457
                                if(bg_record->boot_count < RETRY_BOOT_COUNT) {
495
489
                                        bg_record->boot_state = 0;
496
490
                                        bg_record->boot_count = 0;
497
491
                                        if(remove_from_bg_list(
498
 
                                                   bg_job_block_list, 
 
492
                                                   bg_lists->job_running, 
499
493
                                                   bg_record) 
500
494
                                           == SLURM_SUCCESS) {
501
495
                                                num_unused_cpus += 
502
496
                                                        bg_record->cpu_cnt;
503
497
                                        } 
504
498
                                        remove_from_bg_list(
505
 
                                                bg_booted_block_list,
 
499
                                                bg_lists->booted,
506
500
                                                bg_record);
507
501
                                }
508
502
                                break;
576
570
        bg_record_t *bg_record = NULL;
577
571
        ListIterator itr = NULL;
578
572
        
579
 
        if(!bg_freeing_list) 
 
573
        if(!bg_lists->freeing) 
580
574
                return updated;
581
575
        
582
576
        slurm_mutex_lock(&block_state_mutex);
583
 
        itr = list_iterator_create(bg_freeing_list);
 
577
        itr = list_iterator_create(bg_lists->freeing);
584
578
        while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
585
579
                if(!bg_record->bg_block_id)
586
580
                        continue;
588
582
                name = bg_record->bg_block_id;
589
583
                if ((rc = bridge_get_block_info(name, &block_ptr)) 
590
584
                    != STATUS_OK) {
591
 
                        if(bluegene_layout_mode == LAYOUT_DYNAMIC) {
 
585
                        if(bg_conf->layout_mode == LAYOUT_DYNAMIC) {
592
586
                                switch(rc) {
593
587
                                case INCONSISTENT_DATA:
594
588
                                        debug2("got inconsistent data when "