/*****************************************************************************\
 *  read_config.c - read the overall slurm configuration file
 *****************************************************************************
 *  Copyright (C) 2002-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
#include "src/common/xstring.h"

#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/basil_interface.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/port_mgr.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/read_config.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/srun_comm.h"
#include "src/slurmctld/trigger_mgr.h"
#include "src/slurmctld/topo_plugin.h"
static void _acct_restore_active_jobs(void);
static int  _build_bitmaps(void);
static void _build_bitmaps_pre_select(void);
static int  _init_all_slurm_conf(void);
static int  _preserve_select_type_param(slurm_ctl_conf_t * ctl_conf_ptr,
					select_type_plugin_info_t
					old_select_type_p);
static int  _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr,
			      char *old_auth_type, char *old_checkpoint_type,
			      char *old_crypto_type, char *old_sched_type,
			      char *old_select_type, char *old_switch_type);
static void _purge_old_node_state(struct node_record *old_node_table_ptr,
				  int old_node_record_count);
static int  _restore_job_dependencies(void);
static int  _restore_node_state(struct node_record *old_node_table_ptr,
				int old_node_record_count);
static int  _sync_nodes_to_comp_job(void);
static int  _sync_nodes_to_jobs(void);
static int  _sync_nodes_to_active_job(struct job_record *job_ptr);

static char *highest_node_name = NULL;
int node_record_count = 0;
/*
 * _build_bitmaps_pre_select - recover some state for jobs and nodes prior to
 *	calling the select_* functions
 */
	/* initialize the idle and up bitmaps */
	FREE_NULL_BITMAP(idle_node_bitmap);
	FREE_NULL_BITMAP(avail_node_bitmap);
	FREE_NULL_BITMAP(power_node_bitmap);
	FREE_NULL_BITMAP(share_node_bitmap);
	FREE_NULL_BITMAP(up_node_bitmap);
	idle_node_bitmap  = (bitstr_t *) bit_alloc(node_record_count);
	avail_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
	power_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
	share_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
	up_node_bitmap    = (bitstr_t *) bit_alloc(node_record_count);
	if ((idle_node_bitmap  == NULL) ||
	    (avail_node_bitmap == NULL) ||
	    (power_node_bitmap == NULL) ||
	    (share_node_bitmap == NULL) ||
	    (up_node_bitmap    == NULL))
		fatal ("bit_alloc malloc failure");
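	/* Rough semantics of the bitmaps above (one bit per index into
	 * node_record_table_ptr):
	 *   idle_node_bitmap  - nodes with no running or completing jobs
	 *   avail_node_bitmap - nodes not DOWN, not draining/failing,
	 *                       and responding
	 *   power_node_bitmap - nodes currently in power-save mode
	 *   share_node_bitmap - nodes free of exclusive (non-shared) jobs
	 *   up_node_bitmap    - nodes not DOWN
	 * In the loop below, job_cnt (running plus completing jobs) is
	 * what separates DRAINING from DRAINED. */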
	/* ... their configuration, resync DRAINED vs. DRAINING state */
	for (i = 0; i < node_record_count; i++) {
		uint16_t base_state, drain_flag, no_resp_flag, job_cnt;
		struct node_record *node_ptr = node_record_table_ptr + i;

		if (node_ptr->name[0] == '\0')
			continue;	/* defunct */
		base_state = node_ptr->node_state & NODE_STATE_BASE;
		drain_flag = node_ptr->node_state &
			     (NODE_STATE_DRAIN | NODE_STATE_FAIL);
		no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
		job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;

		if (((base_state == NODE_STATE_IDLE) && (job_cnt == 0))
		    || (base_state == NODE_STATE_DOWN))
			bit_set(idle_node_bitmap, i);

		if (base_state != NODE_STATE_DOWN) {
			if ((drain_flag == 0) && (no_resp_flag == 0))
				bit_set(avail_node_bitmap, i);
			bit_set(up_node_bitmap, i);
		}
		if (node_ptr->node_state & NODE_STATE_POWER_SAVE)
			bit_set(power_node_bitmap, i);
		if (node_ptr->config_ptr)
			bit_set(node_ptr->config_ptr->node_bitmap, i);
	}

	return error_code;
}
return state_val;
/* Used to get the general name of the machine, used primarily
 * for bluegene systems. Not in general use because some systems
 * have multiple prefixes, such as foo[1-1000],bar[1-1000].
 * Caller must be holding slurm_conf_lock() */
static void _set_node_prefix(const char *nodenames, slurm_ctl_conf_t *conf)
{
	int i;
	char *tmp;

	xassert(nodenames != NULL);
	for (i = 1; nodenames[i] != '\0'; i++) {
		if ((nodenames[i-1] == '[') ||
		    ((nodenames[i-1] <= '9') && (nodenames[i-1] >= '0')))
			break;
	}

	xfree(conf->node_prefix);
	if (nodenames[i] == '\0')
		conf->node_prefix = xstrdup(nodenames);
	else {
		tmp = xmalloc(sizeof(char) * i + 1);
		snprintf(tmp, i, "%s", nodenames);
		conf->node_prefix = tmp;
	}

	debug3("Prefix is %s %s %d", conf->node_prefix, nodenames, i);
}
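/* Illustrative example (hypothetical names): with NodeName=tux[000-127],
 * the scan above stops at the first digit or '[', so conf->node_prefix
 * becomes "tux"; if the name has no numeric suffix at all, the whole
 * name is duplicated as the prefix. */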
/*
 * _build_all_nodeline_info - From the slurm.conf reader, build table,
 *	and set values
 * Note: Operates on common variables
 *	default_node_record - default node configuration values
 */
static int _build_all_nodeline_info(void)
{
	slurm_conf_node_t *node, **ptr_array;
	struct config_record *config_ptr = NULL;
	slurm_ctl_conf_t *conf;
#ifdef HAVE_3D
	char *node_000 = NULL;
	struct node_record *node_rec = NULL;
#endif

	conf = slurm_conf_lock();

#ifdef HAVE_3D
	if (conf->node_prefix)
		node_000 = xstrdup(conf->node_prefix);
	xstrcat(node_000, "000");
	node_rec = find_node_record(node_000);
	if (node_rec == NULL)
		fatal("No node %s configured", node_000);
	xfree(node_000);
#ifndef HAVE_BG
	slurm_topo_build_config();
#endif	/* ! HAVE_BG */
#else
	slurm_topo_build_config();
#endif	/* HAVE_3D */

	slurm_conf_unlock();
	return SLURM_SUCCESS;
}
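/* The node_000 lookup above presumably enforces zero-origin node naming
 * ("<prefix>000") on 3-D systems, where the topology logic orders nodes
 * for communication locality. */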
	if (part_ptr->disable_root_jobs)
		debug2("partition %s does not allow root jobs",
		       part_ptr->name);

	if ((part->default_time != NO_VAL) &&
	    (part->default_time > part->max_time)) {
		info("partition %s DefaultTime exceeds MaxTime (%u > %u)",
		     part->name, part->default_time, part->max_time);
		part->default_time = NO_VAL;
	}
	part_ptr->hidden = part->hidden_flag ? 1 : 0;
	part_ptr->max_time = part->max_time;
	part_ptr->default_time = part->default_time;
	part_ptr->max_share = part->max_share;
	part_ptr->max_nodes = part->max_nodes;
	part_ptr->max_nodes_orig = part->max_nodes;

	xfree(part_ptr->allow_groups);
	part_ptr->allow_groups = xstrdup(part->allow_groups);
	if (part->allow_alloc_nodes) {
		if (part_ptr->allow_alloc_nodes) {
			int cnt_tot, cnt_uniq, buf_size;
			hostlist_t hl = hostlist_create(part_ptr->
							allow_alloc_nodes);
			hostlist_push(hl, part->allow_alloc_nodes);
			cnt_tot = hostlist_count(hl);
			hostlist_uniq(hl);
			cnt_uniq = hostlist_count(hl);
			if (cnt_tot != cnt_uniq) {
				fatal("Duplicate Allowed Allocating Nodes for "
				      "Partition %s", part->name);
			}
			buf_size = strlen(part_ptr->allow_alloc_nodes) + 1 +
				   strlen(part->allow_alloc_nodes) + 1;
			xfree(part_ptr->allow_alloc_nodes);
			part_ptr->allow_alloc_nodes = xmalloc(buf_size);
			hostlist_ranged_string(hl, buf_size,
					       part_ptr->allow_alloc_nodes);
			hostlist_destroy(hl);
		} else {
			part_ptr->allow_alloc_nodes =
				xstrdup(part->allow_alloc_nodes);
		}
	}
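	/* In the merge path above, cnt_tot vs. cnt_uniq compares the host
	 * list before and after duplicates are collapsed, so any overlap
	 * between the existing and newly configured AllocNodes is fatal. */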
	if (part->nodes) {
		if (part_ptr->nodes) {
			int cnt_tot, cnt_uniq, buf_size;
		part = ptr_array[i];
		_build_single_partitionline_info(part);
		if (part->priority > part_max_priority)
			part_max_priority = part->priority;
	}

	/* set up the normalized priority of the partitions */
	if (part_max_priority) {
		ListIterator itr;
		struct part_record *part_ptr = NULL;

		itr = list_iterator_create(part_list);
		while ((part_ptr = list_next(itr))) {
			part_ptr->norm_priority =
				(double)part_ptr->priority /
				(double)part_max_priority;
		}
		list_iterator_destroy(itr);
	}
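	/* Example: partition Priority values 1000, 500 and 100 yield
	 * norm_priority 1.0, 0.5 and 0.1 (each divided by the largest
	 * configured partition priority). */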

	return SLURM_SUCCESS;
}
		node_record_table_ptr = old_node_table_ptr;
		return error_code;
	}

	if (slurm_topo_init() != SLURM_SUCCESS)
		fatal("Failed to initialize topology plugin");

	_build_all_nodeline_info();
	_handle_all_downnodes();
	_build_all_partitionline_info();

	update_logging();
	g_slurm_jobcomp_init(slurmctld_conf.job_comp_loc);
	if (slurm_sched_init() != SLURM_SUCCESS)
		fatal("Failed to initialize sched plugin");
	if (switch_init() != SLURM_SUCCESS)
		fatal("Failed to initialize switch plugin");

	if (default_part_loc == NULL)
		error("read_slurm_conf: default partition not set.");
	_build_bitmaps_pre_select();
	if ((select_g_node_init(node_record_table_ptr, node_record_count)
	     != SLURM_SUCCESS) ||
	    (select_g_block_init(part_list) != SLURM_SUCCESS) ||
	    (select_g_state_restore(state_save_dir) != SLURM_SUCCESS) ||
	    (select_g_job_init(job_list) != SLURM_SUCCESS)) {
		fatal("failed to initialize node selection plugin state, "
		      "Clean start required.");
	}
/* Restore node state and size information from saved records.
 * If a node was re-configured to be down or drained, we set those states */
static int _restore_node_state(struct node_record *old_node_table_ptr,
			       int old_node_record_count)
{
	struct node_record *node_ptr;
	int i, rc = SLURM_SUCCESS;
	hostset_t hs = NULL;
	slurm_ctl_conf_t *conf = slurm_conf_lock();
	bool power_save_mode = false;

	if (conf->suspend_program && conf->resume_program)
		power_save_mode = true;
	slurm_conf_unlock();
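	/* Power saving is considered configured only when both
	 * SuspendProgram and ResumeProgram are set in slurm.conf. */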
	for (i = 0; i < old_node_record_count; i++) {
		uint16_t drain_flag = false, down_flag = false;

		node_ptr = find_node_record(old_node_table_ptr[i].name);
		if (node_ptr == NULL)
			continue;

		if ((node_ptr->node_state & NODE_STATE_BASE) ==
		    NODE_STATE_DOWN)
			down_flag = true;
		if (node_ptr->node_state & NODE_STATE_DRAIN)
			drain_flag = true;
		node_ptr->node_state = old_node_table_ptr[i].node_state;
		if (down_flag) {
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state |= NODE_STATE_DOWN;
		}
		if (drain_flag)
			node_ptr->node_state |= NODE_STATE_DRAIN;
		if ((node_ptr->node_state & NODE_STATE_POWER_SAVE) &&
		    (!power_save_mode)) {
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			if (hs)
				hostset_insert(hs, node_ptr->name);
			else
				hs = hostset_create(node_ptr->name);
		}

		node_ptr->last_response = old_node_table_ptr[i].last_response;
		if (old_node_table_ptr[i].port != node_ptr->config_ptr->cpus) {
			rc = ESLURM_NEED_RESTART;
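	/* Note: the comparison above apparently relies on the saved node
	 * state carrying the configured CPU count in the port field; a
	 * mismatch means the CPU count changed and a restart is needed. */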
	uint16_t base_state, node_flags;
	struct node_record *node_ptr = node_record_table_ptr;

	job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap);
	for (i = 0; i < node_record_count; i++, node_ptr++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;

		base_state = node_ptr->node_state & NODE_STATE_BASE;
		node_flags = node_ptr->node_state & NODE_STATE_FLAGS;

		if ((job_ptr->details) && (job_ptr->details->shared == 0))
			node_ptr->no_share_job_cnt++;

		if ((base_state == NODE_STATE_DOWN) &&
		    (job_ptr->job_state == JOB_RUNNING) &&
		    (job_ptr->kill_on_node_fail == 0) &&
		    (job_ptr->node_cnt > 1)) {
			/* This should only happen if a job was running
			 * on a node that was newly configured DOWN */
			info("Removing failed node %s from job_id %u",
			     node_ptr->name, job_ptr->job_id);
			srun_node_fail(job_ptr->job_id, node_ptr->name);
			kill_step_on_node(job_ptr, node_ptr);
			excise_node_from_job(job_ptr, node_ptr);
		} else if (base_state == NODE_STATE_DOWN) {
			time_t now = time(NULL);
			info("Killing job %u on DOWN node %s",
			     job_ptr->job_id, node_ptr->name);
			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
			job_ptr->end_time = MIN(job_ptr->end_time, now);
			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
			job_ptr->state_reason = FAIL_DOWN_NODE;
			xfree(job_ptr->state_desc);
			job_completion_logger(job_ptr);
		} else if ((base_state == NODE_STATE_UNKNOWN) ||