~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid


Viewing changes to src/slurmctld/read_config.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient
* Added Default-Start for runlevels 2 and 4 and the $remote_fs requirement
  in the init.d scripts (Closes: #541252)
* Postinst checks for wrong runlevel 2 and 4 links
* Upgraded to Standards-Version 3.8.3
* Added lintian overrides for the missing slurm-llnl-configurator.html in the
  doc-base registration
* Modified the postrm scripts to ignore the pkill return value, so that postrm
  no longer fails when no slurm process is running (sketched below)
* Check for slurmctld.pid before cancelling running and pending jobs during
  package removal
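
A minimal sketch of the postrm behaviour described in the last two entries; the
pid-file path and exact commands are assumptions for illustration, not the
shipped maintainer script:

  # Hypothetical postrm fragment: cancel jobs only when slurmctld was actually
  # running (pid file present), and never let pkill's exit status abort the
  # script -- pkill exits non-zero when no process matched.
  if [ -f /var/run/slurmctld.pid ]; then
          scancel --state=PENDING,RUNNING || true
  fi
  pkill slurmctld || true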

--- src/slurmctld/read_config.c (old)
+++ src/slurmctld/read_config.c (new)

@@ old 1 / new 1 @@
 /*****************************************************************************\
  *  read_config.c - read the overall slurm configuration file
  *****************************************************************************
- *  Copyright (C) 2002 The Regents of the University of California.
+ *  Copyright (C) 2002-2007 The Regents of the University of California.
+ *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Morris Jette <jette1@llnl.gov>.
- *  LLNL-CODE-402394.
+ *  CODE-OCEC-09-009. All rights reserved.
  *  
  *  This file is part of SLURM, a resource management program.
- *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *  For details, see <https://computing.llnl.gov/linux/slurm/>.
+ *  Please also read the included file: DISCLAIMER.
  *  
  *  SLURM is free software; you can redistribute it and/or modify it under
  *  the terms of the GNU General Public License as published by the Free

@@ old 15 / new 17 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 

@@ old 68 / new 70 @@
 #include "src/common/xstring.h"

 #include "src/slurmctld/acct_policy.h"
+#include "src/slurmctld/basil_interface.h"
 #include "src/slurmctld/job_scheduler.h"
 #include "src/slurmctld/licenses.h"
 #include "src/slurmctld/locks.h"
 #include "src/slurmctld/node_scheduler.h"
+#include "src/slurmctld/port_mgr.h"
 #include "src/slurmctld/proc_req.h"
 #include "src/slurmctld/read_config.h"
+#include "src/slurmctld/reservation.h"
 #include "src/slurmctld/sched_plugin.h"
 #include "src/slurmctld/slurmctld.h"
+#include "src/slurmctld/srun_comm.h"
 #include "src/slurmctld/trigger_mgr.h"
+#include "src/slurmctld/topo_plugin.h"

 static void _acct_restore_active_jobs(void);
 static int  _build_bitmaps(void);
 static void _build_bitmaps_pre_select(void);
 static int  _init_all_slurm_conf(void);
-static void _purge_old_node_state(struct node_record *old_node_table_ptr, 
-                                int old_node_record_count);
-static int  _restore_job_dependencies(void);
-static int  _restore_node_state(struct node_record *old_node_table_ptr, 
-                                int old_node_record_count);
 static int  _preserve_select_type_param(slurm_ctl_conf_t * ctl_conf_ptr, 
-                                        select_type_plugin_info_t old_select_type_p);
+                                        select_type_plugin_info_t 
+                                        old_select_type_p);
 static int  _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, 
                                 char *old_auth_type, char *old_checkpoint_type,
                                 char *old_crypto_type, char *old_sched_type, 
                                 char *old_select_type, char *old_switch_type);
+static void _purge_old_node_state(struct node_record *old_node_table_ptr, 
+                                int old_node_record_count);
+static int  _restore_job_dependencies(void);
+static int  _restore_node_state(struct node_record *old_node_table_ptr, 
+                                int old_node_record_count);
 static int  _sync_nodes_to_comp_job(void);
 static int  _sync_nodes_to_jobs(void);
 static int  _sync_nodes_to_active_job(struct job_record *job_ptr);

@@ old 103 / new 111 @@
 static char *highest_node_name = NULL;
 int node_record_count = 0;

-/* FIXME - declarations for temporarily moved functions */
-#define MULTIPLE_VALUE_MSG "Multiple values for %s, latest one used"
-
 /*
  * _build_bitmaps_pre_select - recover some state for jobs and nodes prior to 
  *      calling the select_* functions

@@ old 116 / new 121 @@
         struct node_record   *node_ptr;
         ListIterator part_iterator;
         int i;
-

         /* scan partition table and identify nodes in each */
         part_iterator = list_iterator_create(part_list);

@@ old 179 / new 183 @@
         /* initialize the idle and up bitmaps */
         FREE_NULL_BITMAP(idle_node_bitmap);
         FREE_NULL_BITMAP(avail_node_bitmap);
+        FREE_NULL_BITMAP(power_node_bitmap);
         FREE_NULL_BITMAP(share_node_bitmap);
         FREE_NULL_BITMAP(up_node_bitmap);
         idle_node_bitmap  = (bitstr_t *) bit_alloc(node_record_count);
         avail_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
+        power_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
         share_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
         up_node_bitmap    = (bitstr_t *) bit_alloc(node_record_count);
         if ((idle_node_bitmap     == NULL) ||
             (avail_node_bitmap    == NULL) ||
+            (power_node_bitmap    == NULL) ||
             (share_node_bitmap    == NULL) ||
             (up_node_bitmap       == NULL)) 
                 fatal ("bit_alloc malloc failure");

@@ old 231 / new 238 @@
          * their configuration, resync DRAINED vs. DRAINING state */
         for (i = 0; i < node_record_count; i++) {
                 uint16_t base_state, drain_flag, no_resp_flag, job_cnt;
+                struct node_record *node_ptr = node_record_table_ptr + i;

-                if (node_record_table_ptr[i].name[0] == '\0')
+                if (node_ptr->name[0] == '\0')
                         continue;       /* defunct */
-                base_state = node_record_table_ptr[i].node_state & 
-                                NODE_STATE_BASE;
-                drain_flag = node_record_table_ptr[i].node_state &
+                base_state = node_ptr->node_state & NODE_STATE_BASE;
+                drain_flag = node_ptr->node_state &
                                 (NODE_STATE_DRAIN | NODE_STATE_FAIL);
-                no_resp_flag = node_record_table_ptr[i].node_state & 
-                                NODE_STATE_NO_RESPOND;
-                job_cnt = node_record_table_ptr[i].run_job_cnt +
-                          node_record_table_ptr[i].comp_job_cnt;
+                no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
+                job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;

                 if (((base_state == NODE_STATE_IDLE) && (job_cnt == 0))
                 ||  (base_state == NODE_STATE_DOWN))

@@ old 252 / new 257 @@
                                 bit_set(avail_node_bitmap, i);
                         bit_set(up_node_bitmap, i);
                 }
-                if (node_record_table_ptr[i].config_ptr)
-                        bit_set(node_record_table_ptr[i].config_ptr->
-                                node_bitmap, i);
+                if (node_ptr->node_state & NODE_STATE_POWER_SAVE)
+                        bit_set(power_node_bitmap, i);
+                if (node_ptr->config_ptr)
+                        bit_set(node_ptr->config_ptr->node_bitmap, i);
         }
         return error_code;
 }

@@ old 314 / new 320 @@
         return state_val;
 }

-#ifdef HAVE_3D
-/* Used to get the general name of the machine, used primarily 
- * for bluegene systems.  Not in general use because some systems 
- * have multiple prefix's such as foo[1-1000],bar[1-1000].
- */
-/* Caller must be holding slurm_conf_lock() */
-static void _set_node_prefix(const char *nodenames, slurm_ctl_conf_t *conf)
-{
-        int i;
-        char *tmp;
-
-        xassert(nodenames != NULL);
-        for (i = 1; nodenames[i] != '\0'; i++) {
-                if((nodenames[i-1] == '[') 
-                   || (nodenames[i-1] <= '9'
-                       && nodenames[i-1] >= '0'))
-                        break;
-        }
-        xfree(conf->node_prefix);
-        if(nodenames[i] == '\0')
-                conf->node_prefix = xstrdup(nodenames);
-        else {
-                tmp = xmalloc(sizeof(char)*i+1);
-                memset(tmp, 0, i+1);
-                snprintf(tmp, i, "%s", nodenames);
-                conf->node_prefix = tmp;
-                tmp = NULL;
-        }
-        debug3("Prefix is %s %s %d", conf->node_prefix, nodenames, i);
-}
-#endif /* HAVE_BG */
-
 /* 
  * _build_single_nodeline_info - From the slurm.conf reader, build table,
  *      and set values

@@ old 392 / new 366 @@
                 goto cleanup;
         }

-#ifdef HAVE_3D
-        if (conf->node_prefix == NULL)
-                _set_node_prefix(node_ptr->nodenames, conf);
-#endif
-
         /* some sanity checks */
 #ifdef HAVE_FRONT_END
         if ((hostlist_count(hostname_list) != 1) ||

@@ old 518 / new 487 @@
         return error_code;
 }

-static void _handle_all_downnodes()
+static void _handle_all_downnodes(void)
 {
         slurm_conf_downnodes_t *ptr, **ptr_array;
         int count;

@@ old 544 / new 513 @@
  * Note: Operates on common variables
  *      default_node_record - default node configuration values
  */
-static int _build_all_nodeline_info(slurm_ctl_conf_t *conf)
+static int _build_all_nodeline_info(void)
 {
         slurm_conf_node_t *node, **ptr_array;
         struct config_record *config_ptr = NULL;
+        slurm_ctl_conf_t *conf;
         int count;
         int i;


@@ old 555 / new 525 @@
         if (count == 0)
                 fatal("No NodeName information available!");

+        conf = slurm_conf_lock();
         for (i = 0; i < count; i++) {
                 node = ptr_array[i];


@@ old 574 / new 545 @@
                 _build_single_nodeline_info(node, config_ptr, conf);
         }
         xfree(highest_node_name);
+
+        /* Unlock config here so that we can call
+         * find_node_record() below and in the topology plugins */
+        slurm_conf_unlock();
 #ifdef HAVE_3D
 {
         char *node_000 = NULL;

@@ old 581 / new 556 @@
         if (conf->node_prefix)
                 node_000 = xstrdup(conf->node_prefix);
         xstrcat(node_000, "000");
-        slurm_conf_unlock();
         node_rec = find_node_record(node_000);
-        slurm_conf_lock();
         if (node_rec == NULL)
                 fatal("No node %s configured", node_000);
         xfree(node_000);
 #ifndef HAVE_BG
         if (count == 1)
-                nodes_to_hilbert_curve();
+                slurm_topo_build_config();
 #endif  /* ! HAVE_BG */
 }
+#else
+        slurm_topo_build_config();
 #endif  /* HAVE_3D */
+
         return SLURM_SUCCESS;
 }


@@ old 635 / new 611 @@

         if(part_ptr->disable_root_jobs) 
                 debug2("partition %s does not allow root jobs", part_ptr->name);
-
+
+        if ((part->default_time != NO_VAL) &&
+            (part->default_time > part->max_time)) {
+                info("partition %s DefaultTime exceeds MaxTime (%u > %u)",
+                     part->default_time, part->max_time);
+                part->default_time = NO_VAL;
+        }
+
         part_ptr->hidden         = part->hidden_flag ? 1 : 0;
         part_ptr->max_time       = part->max_time;
+        part_ptr->default_time   = part->default_time;
         part_ptr->max_share      = part->max_share;
         part_ptr->max_nodes      = part->max_nodes;
         part_ptr->max_nodes_orig = part->max_nodes;

@@ old 650 / new 634 @@
                 xfree(part_ptr->allow_groups);
                 part_ptr->allow_groups = xstrdup(part->allow_groups);
         }
+        if (part->allow_alloc_nodes) {
+                if (part_ptr->allow_alloc_nodes) {
+                        int cnt_tot, cnt_uniq, buf_size;
+                        hostlist_t hl = hostlist_create(part_ptr->
+                                                        allow_alloc_nodes);
+
+                        hostlist_push(hl, part->allow_alloc_nodes);
+                        cnt_tot = hostlist_count(hl);
+                        hostlist_uniq(hl);
+                        cnt_uniq = hostlist_count(hl);
+                        if (cnt_tot != cnt_uniq) {
+                                fatal("Duplicate Allowed Allocating Nodes for "
+                                      "Partition %s", part->name);
+                        }
+                        buf_size = strlen(part_ptr->allow_alloc_nodes) + 1 +
+                                   strlen(part->allow_alloc_nodes) + 1;
+                        xfree(part_ptr->allow_alloc_nodes);
+                        part_ptr->allow_alloc_nodes = xmalloc(buf_size);
+                        hostlist_ranged_string(hl, buf_size, 
+                                               part_ptr->allow_alloc_nodes);
+                        hostlist_destroy(hl);
+                } else {
+                        part_ptr->allow_alloc_nodes = 
+                                        xstrdup(part->allow_alloc_nodes);
+                }
+        }
         if (part->nodes) {
                 if (part_ptr->nodes) {
                         int cnt_tot, cnt_uniq, buf_size;

@@ old 690 / new 700 @@
         slurm_conf_partition_t *part, **ptr_array;
         int count;
         int i;
-
+        ListIterator itr = NULL;
+
         count = slurm_conf_partition_array(&ptr_array);
         if (count == 0)
                 fatal("No PartitionName information available!");

@@ old 699 / new 710 @@
                 part = ptr_array[i];

                 _build_single_partitionline_info(part);
-        }
+                if(part->priority > part_max_priority) 
+                        part_max_priority = part->priority;
+        }
+
+        /* set up the normalized priority of the partitions */
+        if(part_max_priority) {
+                struct part_record *part_ptr = NULL;
+
+                itr = list_iterator_create(part_list);
+                while((part_ptr = list_next(itr))) {
+                        part_ptr->norm_priority = (double)part_ptr->priority 
+                                / (double)part_max_priority;
+                }
+                list_iterator_destroy(itr);
+        }
+
         return SLURM_SUCCESS;
 }


@@ old 728 / new 754 @@
         char *old_select_type     = xstrdup(slurmctld_conf.select_type);
         char *old_switch_type     = xstrdup(slurmctld_conf.switch_type);
         char *state_save_dir      = xstrdup(slurmctld_conf.state_save_location);
-        slurm_ctl_conf_t *conf;
+        char *mpi_params;
         select_type_plugin_info_t old_select_type_p = 
                 (select_type_plugin_info_t) slurmctld_conf.select_type_param;


@@ old 763 / new 789 @@
                 node_record_table_ptr = old_node_table_ptr;
                 return error_code;
         }
-        conf = slurm_conf_lock();
-        _build_all_nodeline_info(conf);
-        slurm_conf_unlock();
+
+        if (slurm_topo_init() != SLURM_SUCCESS)
+                fatal("Failed to initialize topology plugin");
+
+        _build_all_nodeline_info();
         _handle_all_downnodes();
         _build_all_partitionline_info();

         update_logging();
         g_slurm_jobcomp_init(slurmctld_conf.job_comp_loc);
-        slurm_sched_init();
-        if (switch_init() < 0)
-                error("Failed to initialize switch plugin");
+        if (slurm_sched_init() != SLURM_SUCCESS)
+                fatal("Failed to initialize sched plugin");
+        if (switch_init() != SLURM_SUCCESS)
+                fatal("Failed to initialize switch plugin");

         if (default_part_loc == NULL)
                 error("read_slurm_conf: default partition not set.");

@@ old 803 / new 832 @@
                                                  old_node_record_count);
                         error_code = MAX(error_code, rc);  /* not fatal */
                 }
+                load_last_job_id();
                 reset_first_job_id();
                 (void) slurm_sched_reconfig();
                 xfree(state_save_dir);

@@ old 810 / new 840 @@

         _build_bitmaps_pre_select();
         if ((select_g_node_init(node_record_table_ptr, node_record_count)
-             != SLURM_SUCCESS) 
-            || (select_g_block_init(part_list) != SLURM_SUCCESS) 
-            || (select_g_state_restore(state_save_dir) != SLURM_SUCCESS) 
-            || (select_g_job_init(job_list) != SLURM_SUCCESS)) {
+             != SLURM_SUCCESS)                                          ||
+            (select_g_block_init(part_list) != SLURM_SUCCESS)           ||
+            (select_g_state_restore(state_save_dir) != SLURM_SUCCESS)   ||
+            (select_g_job_init(job_list) != SLURM_SUCCESS)) {
                 fatal("failed to initialize node selection plugin state, "
                       "Clean start required.");
         }

@@ old 826 / new 856 @@

         if ((rc = _build_bitmaps()))
                 fatal("_build_bitmaps failure");
+        mpi_params = slurm_get_mpi_params();
+        reserve_port_config(mpi_params);
+        xfree(mpi_params);

         license_free();
         if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS)

@@ old 839 / new 872 @@
         (void) _sync_nodes_to_comp_job();/* must follow select_g_node_init() */
         load_part_uid_allow_list(1);

+        load_all_resv_state(recover);
         if (recover >= 1)
                 (void) trigger_state_restore();


@@ old 862 / new 896 @@
         if (load_job_ret)
                 _acct_restore_active_jobs();

+#ifdef HAVE_CRAY_XT
+        basil_query();
+#endif
+
         slurmctld_conf.last_update = time(NULL);
         END_TIMER2("read_slurm_conf");
         return error_code;

@@ old 871 / new 909 @@
 /* Restore node state and size information from saved records.
  * If a node was re-configured to be down or drained, we set those states */
 static int _restore_node_state(struct node_record *old_node_table_ptr, 
-                                int old_node_record_count)
+                               int old_node_record_count)
 {
         struct node_record *node_ptr;
         int i, rc = SLURM_SUCCESS;
+        hostset_t hs = NULL;
+        slurm_ctl_conf_t *conf = slurm_conf_lock();
+        bool power_save_mode = false;
+
+        if (conf->suspend_program && conf->resume_program)
+                power_save_mode = true;
+        slurm_conf_unlock();

         for (i = 0; i < old_node_record_count; i++) {
                 uint16_t drain_flag = false, down_flag = false;

@@ old 882 / new 927 @@
                 if (node_ptr == NULL)
                         continue;

-                if ((node_ptr->node_state & NODE_STATE_BASE) == NODE_STATE_DOWN)
+                if ((node_ptr->node_state & NODE_STATE_BASE) == 
+                    NODE_STATE_DOWN)
                         down_flag = true;
                 if (node_ptr->node_state & NODE_STATE_DRAIN)
                         drain_flag = true;

@@ old 893 / new 939 @@
                 }
                 if (drain_flag)
                         node_ptr->node_state |= NODE_STATE_DRAIN; 
-
+                if ((node_ptr->node_state & NODE_STATE_POWER_SAVE) &&
+                    (!power_save_mode)) {
+                        node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
+                        if (hs)
+                                hostset_insert(hs, node_ptr->name);
+                        else
+                                hs = hostset_create(node_ptr->name);
+                }
+
                 node_ptr->last_response = old_node_table_ptr[i].last_response;
                 if (old_node_table_ptr[i].port != node_ptr->config_ptr->cpus) {
                         rc = ESLURM_NEED_RESTART;

@@ old 928 / new 982 @@
                         old_node_table_ptr[i].os = NULL;
                 }
         }
+
+        if (hs) {
+                char node_names[128];
+                hostset_ranged_string(hs, sizeof(node_names), node_names);
+                info("Cleared POWER_SAVE flag from nodes %s", node_names);
+                hostset_destroy(hs);
+        }
         return rc;
 }


@@ old 1094 / new 1155 @@
                         update_cnt++;
                         info("Killing job_id %u", job_ptr->job_id);
                         deallocate_nodes(job_ptr, false, false);
+                        job_completion_logger(job_ptr);
                 }
         }
         list_iterator_destroy(job_iterator);

@@ old 1111 / new 1173 @@
         uint16_t base_state, node_flags;
         struct node_record *node_ptr = node_record_table_ptr;

-        job_ptr->node_cnt = 0;
+        job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap);
         for (i = 0; i < node_record_count; i++, node_ptr++) {
                 if (bit_test(job_ptr->node_bitmap, i) == 0)
                         continue;
-                job_ptr->node_cnt++;

                 base_state = node_ptr->node_state & NODE_STATE_BASE;
                 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;

@@ old 1128 / new 1189 @@
                     (job_ptr->details) && (job_ptr->details->shared == 0))
                         node_ptr->no_share_job_cnt++;

-                if (base_state == NODE_STATE_DOWN) {
+                if ((base_state == NODE_STATE_DOWN)     &&
+                    (job_ptr->job_state == JOB_RUNNING) &&
+                    (job_ptr->kill_on_node_fail == 0)   &&
+                    (job_ptr->node_cnt > 1)) {
+                        /* This should only happen if a job was running 
+                         * on a node that was newly configured DOWN */
+                        info("Removing failed node %s from job_id %u",
+                             node_ptr->name, job_ptr->job_id);
+                        srun_node_fail(job_ptr->job_id, node_ptr->name);
+                        kill_step_on_node(job_ptr, node_ptr);
+                        excise_node_from_job(job_ptr, node_ptr);
+                } else if (base_state == NODE_STATE_DOWN) {
                         time_t now = time(NULL);
+                        info("Killing job %u on DOWN node %s",
+                             job_ptr->job_id, node_ptr->name);
                         job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
                         job_ptr->end_time = MIN(job_ptr->end_time, now);
                         job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
                         job_ptr->state_reason = FAIL_DOWN_NODE;
+                        xfree(job_ptr->state_desc);
                         job_completion_logger(job_ptr);
                         cnt++;
                 } else if ((base_state == NODE_STATE_UNKNOWN) ||