 * controller.c - main control machine daemon for slurm
 *****************************************************************************
 * Copyright (C) 2002-2007 The Regents of the University of California.
 * Copyright (C) 2008-2009 Lawrence Livermore National Security.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Morris Jette <jette1@llnl.gov>, Kevin Tew <tew1@llnl.gov>
 * CODE-OCEC-09-009. All rights reserved.
 *
 * This file is part of SLURM, a resource management program.
 * For details, see <https://computing.llnl.gov/linux/slurm/>.
 * Please also read the included file: DISCLAIMER.
 *
 * SLURM is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
#include "src/common/assoc_mgr.h"
#include "src/common/slurm_accounting_storage.h"
#include "src/common/slurm_auth.h"
#include "src/common/slurm_jobcomp.h"
#include "src/common/slurm_priority.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/switch.h"
#include "src/common/uid.h"
#include "src/common/xsignal.h"
#include "src/common/xstring.h"

#include "src/slurmctld/agent.h"
#include "src/slurmctld/basil_interface.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/ping_nodes.h"
#include "src/slurmctld/port_mgr.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/read_config.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/srun_comm.h"
#include "src/slurmctld/state_save.h"
#include "src/slurmctld/topo_plugin.h"
#include "src/slurmctld/trigger_mgr.h"
int accounting_enforce = 0;
int association_based_accounting = 0;
bool ping_nodes_now = false;
int cluster_procs = 0;
struct switch_record *switch_record_table = NULL;
int switch_record_cnt = 0;

/* Local variables */
static int daemonize = DEFAULT_DAEMONIZE;
        SIGPIPE, SIGALRM, SIGABRT, SIGHUP, 0
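/* (Presumably the tail of the controller's signal array: the signals the
 * dedicated signal-handling thread waits on, while everything else gets
 * the default action via _default_sigaction() declared below.) */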
static int _accounting_cluster_ready();
static int _accounting_mark_all_nodes_down(char *reason);
static void *_assoc_cache_mgr(void *no_data);
static void _become_slurm_user(void);
static void _default_sigaction(int sig);
inline static void _free_server_thread(void);
static void _init_config(void);
static void _init_pidfile(void);
static void _kill_old_slurmctld(void);
        slurmctld_lock_t config_write_lock = {
                WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
        assoc_init_args_t assoc_init_arg;
        pthread_t assoc_cache_thread;
         * Establish initial configuration

         * able to write a core dump.
        _become_slurm_user();

        if (stat(slurmctld_conf.mail_prog, &stat_buf) != 0)
                error("Configured MailProg is invalid");
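        /* An invalid MailProg only breaks job e-mail notification, which is
         * presumably why it is reported with error() instead of fatal(). */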
                fatal("slurmdbd and/or database must be up at "
                      "slurmctld start time");

        /* Now load the usage from a flat file since it isn't kept in
         * the database. No need to check for an error since if this
         * fails we will get an error message and we will go on our
         * way. If we get an error we can't do anything about it. */
        load_assoc_usage(slurmctld_conf.state_save_location);
        /* This thread is looking for when we get correct data from
         * the database so we can update the assoc_ptr's in the jobs */
        slurm_attr_init(&thread_attr);
        while (pthread_create(&assoc_cache_thread, &thread_attr,
                              _assoc_cache_mgr, NULL)) {
                error("pthread_create error %m");
                sleep(1);
        }
        slurm_attr_destroy(&thread_attr);
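        /* Note the retry loop above: a failed pthread_create() is not treated
         * as fatal here, presumably because such failures are usually a
         * transient resource shortage (e.g. EAGAIN) and the controller cannot
         * run without this thread. The same pattern appears for the RPC,
         * signal, and state-save threads below. */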
        info("slurmctld version %s started on cluster %s",
             SLURM_VERSION, slurmctld_cluster_name);
                   (strcmp(node_name,
                           slurmctld_conf.backup_controller) == 0)) {
                slurm_sched_fini();     /* make sure shutdown */

        } else if (slurmctld_conf.control_machine &&
                   (strcmp(node_name, slurmctld_conf.control_machine)

                (void) _shutdown_backup_controller(SHUTDOWN_WAIT);
                /* Now recover the remaining state information */
                if (switch_restore(slurmctld_conf.state_save_location,
                     slurmctld_conf.slurmctld_port);

        _accounting_cluster_ready();

        if (slurm_priority_init() != SLURM_SUCCESS)
                fatal("failed to initialize priority plugin");
        if (slurm_sched_init() != SLURM_SUCCESS)
                fatal("failed to initialize scheduling plugin");

        /*
         * create attached thread to process RPCs
         */
        slurmctld_config.server_thread_count++;
        slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
        slurm_attr_init(&thread_attr);
        while (pthread_create(&slurmctld_config.thread_id_rpc,
                              &thread_attr, _slurmctld_rpc_mgr,
                              NULL)) {
                error("pthread_create error %m");
                sleep(1);
        }
        slurm_attr_destroy(&thread_attr);
        /*
         * create attached thread for signal handling
         */
        slurm_attr_init(&thread_attr);
        while (pthread_create(&slurmctld_config.thread_id_sig,
                              &thread_attr, _slurmctld_signal_hand,
                              NULL)) {
                error("pthread_create %m");
                sleep(1);
        }
        slurm_attr_destroy(&thread_attr);
        /*
         * create attached thread for state save
         */
        slurm_attr_init(&thread_attr);
        while (pthread_create(&slurmctld_config.thread_id_save,
                              &thread_attr, slurmctld_state_save,
                              NULL)) {
                error("pthread_create %m");
                sleep(1);
        }
        slurm_attr_destroy(&thread_attr);
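        /* (The state-save thread presumably centralizes writes of job, node,
         * and partition state to the StateSaveLocation directory, so the RPC
         * handlers only need to queue save requests; see state_save.h and the
         * "Save any pending state save RPCs" shutdown step below.) */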
        /*
         * create attached thread for node power management
         */
        start_power_mgr(&slurmctld_config.thread_id_power);
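        /* start_power_mgr() encapsulates creation of the power-save thread,
         * which (presumably) suspends idle nodes and resumes them per the
         * SuspendTime/SuspendProgram/ResumeProgram configuration. */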
        /*
         * process slurm background activities, could run as pthread
         */
        _slurmctld_background(NULL);

        /* termination of controller */
        slurm_priority_fini();
        shutdown_state_save();
        pthread_join(slurmctld_config.thread_id_sig,  NULL);
        pthread_join(slurmctld_config.thread_id_rpc,  NULL);
        pthread_join(slurmctld_config.thread_id_save, NULL);

        /* break out and end the association cache
         * thread since we are shutting down, no reason
         * to wait for current info from the database */
        slurm_mutex_lock(&assoc_cache_mutex);
        running_cache = (uint16_t)NO_VAL;
        pthread_cond_signal(&assoc_cache_cond);
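        /* Setting running_cache to NO_VAL before signalling the condition
         * variable tells _assoc_cache_mgr() to exit instead of continuing to
         * wait for fresh association data from the database. */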
        /* Save any pending state save RPCs */
        acct_storage_g_close_connection(&acct_db_conn);

        /* join the power save thread after saving all state
         * since it could wait a while waiting for spawned
         * processes to exit */
        pthread_join(slurmctld_config.thread_id_power, NULL);
        if (slurmctld_config.resume_backup == false)
                break;

        /* Since pidfile is created as user root (its owner is
         * changed to SlurmUser) SlurmUser may not be able to
         * remove it, so this is not necessarily an error. */
        if (unlink(slurmctld_conf.slurmctld_pidfile) < 0) {
                verbose("Unable to remove pidfile '%s': %m",
                        slurmctld_conf.slurmctld_pidfile);
        }
#ifdef MEMORY_LEAK_DEBUG
        /* This should purge all allocated memory,      *\
        \*      Anything left over represents a leak.   */

                error("Left %d agent threads active", cnt);

        slurm_sched_fini();     /* Stop all scheduling */

        /* Purge our local data structures */
        part_fini();    /* part_fini() must precede node_fini() */

        assoc_mgr_fini(slurmctld_conf.state_save_location);
        reserve_port_config(NULL);

        /* Some plugins are needed to purge job/node data structures,
         * unplug after other data structures are purged */
        g_slurm_jobcomp_fini();
        slurm_acct_storage_fini();
        slurm_jobacct_gather_fini();
        slurm_select_fini();
        checkpoint_fini();
        slurm_auth_fini();
/* Read configuration file.
 * Same name as API function for use in accounting_storage plugin.
 * Anything you add to this function must be added to the
 * _slurm_rpc_reconfigure_controller function inside proc_req.c;
 * try to keep these in sync.
 */
static int _reconfigure_slurm(void)
{
        /* Locks: Write configuration, job, node, and partition */
        slurmctld_lock_t config_write_lock = {
                WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
        _update_cred_key();
        set_slurmctld_state_loc();
        select_g_reconfigure();         /* notify select plugin,
                                         * inside the lock. */
        unlock_slurmctld(config_write_lock);
        start_power_mgr(&slurmctld_config.thread_id_power);
        trigger_reconfig();
        slurm_sched_partition_change(); /* notify sched plugin */
        priority_g_reconfig();          /* notify priority plugin too */
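        /* Note that select_g_reconfigure() runs while config_write_lock is
         * still held, whereas the trigger, scheduler, and priority plugins
         * are notified only after the lock has been released. */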
        slurmctld_lock_t config_read_lock = {
                READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
        int sigarray[] = {SIGUSR1, 0};
        char *node_addr = NULL;

        (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
        (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

        if (pthread_attr_setdetachstate
            (&thread_attr_rpc_req, PTHREAD_CREATE_DETACHED))
                fatal("pthread_attr_setdetachstate %m");
        /* set node_addr to bind to (NULL means any) */
        if (slurmctld_conf.backup_controller && slurmctld_conf.backup_addr &&
            (strcmp(node_name, slurmctld_conf.backup_controller) == 0) &&
            (strcmp(slurmctld_conf.backup_controller,
                    slurmctld_conf.backup_addr) != 0)) {
                node_addr = slurmctld_conf.backup_addr;
        }
        else if ((strcmp(node_name, slurmctld_conf.control_machine) == 0) &&
                 (strcmp(slurmctld_conf.control_machine,
                         slurmctld_conf.control_addr) != 0)) {
                node_addr = slurmctld_conf.control_addr;
        }
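        /* If the configured BackupAddr/ControlAddr differs from the host
         * name, bind only to that address; otherwise node_addr stays NULL and
         * the message engine listens on all interfaces, per the comment
         * above. */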
        /* initialize port for RPCs */
        lock_slurmctld(config_read_lock);
        if ((sockfd = slurm_init_msg_engine_addrname_port(node_addr,
                                                          slurmctld_conf.
                                                          slurmctld_port))
            == SLURM_SOCKET_ERROR)
                fatal("slurm_init_msg_engine_addrname_port error %m");
        unlock_slurmctld(config_read_lock);
        slurm_get_stream_addr(sockfd, &srv_addr);
        slurm_get_ip_str(&srv_addr, &port, ip, sizeof(ip));
static int _accounting_cluster_ready()
{
        struct node_record *node_ptr;
        int rc = SLURM_ERROR;
        time_t event_time = time(NULL);
        bitstr_t *total_node_bitmap = NULL;
        char *cluster_nodes = NULL;

        node_ptr = node_record_table_ptr;
        for (i = 0; i < node_record_count; i++, node_ptr++) {

        /* Since cluster_procs is used elsewhere we need to keep a
         * local var here to avoid race conditions on cluster_procs */
        cluster_procs = procs;

        /* Now get the names of all the nodes on the cluster at this
         * time and send them as well. */
        total_node_bitmap = bit_alloc(node_record_count);
        bit_nset(total_node_bitmap, 0, node_record_count - 1);
        cluster_nodes = bitmap2node_name(total_node_bitmap);
        FREE_NULL_BITMAP(total_node_bitmap);
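        /* bitmap2node_name() returns an xmalloc'ed hostlist expression for
         * the set bits (e.g. "tux[0-1023]"), which is why cluster_nodes must
         * be xfree'd once it has been passed to the accounting call below. */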
        rc = clusteracct_storage_g_cluster_procs(acct_db_conn,
                                                 slurmctld_cluster_name,
                                                 cluster_nodes,
                                                 cluster_procs, event_time);
        xfree(cluster_nodes);
        if (rc == ACCOUNTING_FIRST_REG) {
                /* see if we are running directly to a database
                 * instead of a slurmdbd.
        static time_t last_ping_node_time;
        static time_t last_ping_srun_time;
        static time_t last_purge_job_time;
        static time_t last_resv_time;
        static time_t last_timelimit_time;
        static time_t last_assert_primary_time;
        static time_t last_trigger;

        last_sched_time = last_checkpoint_time = last_group_time = now;
        last_purge_job_time = last_trigger = last_health_check_time = now;
        last_timelimit_time = last_assert_primary_time = now;
        last_no_resp_msg_time = last_resv_time = now;

        if (slurmctld_conf.slurmd_timeout) {
                /* We ping nodes that haven't responded in SlurmdTimeout/3,
                 * but need to do the test at a higher frequency or we might
                if (difftime(now, last_resv_time) >= 2) {
                        last_resv_time = now;
                        lock_slurmctld(node_write_lock);
                        set_node_maint_mode();
                        unlock_slurmctld(node_write_lock);
                }
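                /* (Checked every ~2 seconds, presumably so that nodes enter
                 * and leave maintenance mode close to the scheduled start and
                 * end times of their reservations.) */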
                if (difftime(now, last_no_resp_msg_time) >=
                    no_resp_msg_interval) {
                        last_no_resp_msg_time = now;
        debug2("got real data from the database "
               "refreshing the association ptr's for %d jobs",
               list_count(job_list));
        itr = list_iterator_create(job_list);
        while ((job_ptr = list_next(itr))) {
                memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
                qos_rec.id = job_ptr->qos;
                if ((assoc_mgr_fill_in_qos(
                             acct_db_conn, &qos_rec, accounting_enforce,
                             (acct_qos_rec_t **)&job_ptr->qos_ptr))
                    != SLURM_SUCCESS) {
                        verbose("Invalid qos (%u) for job_id %u",
                                job_ptr->qos, job_ptr->job_id);
                        /* not a fatal error, qos could have
                         * been removed */
                }

                if (job_ptr->assoc_id) {
                        memset(&assoc_rec, 0, sizeof(acct_association_rec_t));
                        assoc_rec.id = job_ptr->assoc_id;

                        debug("assoc is %p (%d) for job %u",
                              job_ptr->assoc_ptr, job_ptr->assoc_id,
                              job_ptr->job_id);

        _accounting_cluster_ready();
static void _become_slurm_user(void)
{
        gid_t slurm_user_gid;

        /* Determine SlurmUser gid */
        slurm_user_gid = gid_from_uid(slurmctld_conf.slurm_user_id);
        if (slurm_user_gid == (gid_t) -1) {
                fatal("Failed to determine gid of SlurmUser(%u)",
                      slurmctld_conf.slurm_user_id);
        }

        /* Initialize supplementary groups ID list for SlurmUser */
        if (getuid() == 0) {
                /* root does not need supplementary groups */
                if ((slurmctld_conf.slurm_user_id == 0) &&
                    (setgroups(0, NULL) != 0)) {
                        fatal("Failed to drop supplementary groups, "
                              "setgroups: %m");
                } else if ((slurmctld_conf.slurm_user_id != getuid()) &&
                           initgroups(slurmctld_conf.slurm_user_name,
                                      slurm_user_gid)) {
                        fatal("Failed to set supplementary groups, "
                              "initgroups: %m");
                }
        } else {
                info("Not running as root. Can't drop supplementary groups");
        }

        /* Set GID to GID of SlurmUser */
        if ((slurm_user_gid != getegid()) &&
            (setgid(slurm_user_gid))) {
                fatal("Failed to set GID to %d", slurm_user_gid);
        }

        /* Set UID to UID of SlurmUser */
        if ((slurmctld_conf.slurm_user_id != getuid()) &&
            (setuid(slurmctld_conf.slurm_user_id))) {
                fatal("Can not set uid to SlurmUser(%u): %m",
                      slurmctld_conf.slurm_user_id);
        }
}
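/* Note the ordering above: supplementary groups and the GID must be changed
 * while the process still runs with root privileges, because once setuid()
 * switches to an unprivileged SlurmUser the process can no longer alter its
 * own credentials. */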