1
1
/*****************************************************************************\
2
2
* power_save.c - support node power saving mode. Nodes which have been
3
3
* idle for an extended period of time will be placed into a power saving
4
* mode by running an arbitrary script (typically to set frequency governor).
4
* mode by running an arbitrary script. This script can lower the voltage
5
* or frequency of the nodes or can completely power the nodes off.
5
6
* When the node is restored to normal operation, another script will be
6
7
* executed. Many parameters are available to control this mode of operation.
7
8
*****************************************************************************
8
9
* Copyright (C) 2007 The Regents of the University of California.
10
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
9
11
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
10
12
* Written by Morris Jette <jette1@llnl.gov>
13
* CODE-OCEC-09-009. All rights reserved.
13
15
* This file is part of SLURM, a resource management program.
14
* For details, see <http://www.llnl.gov/linux/slurm/>.
16
* For details, see <https://computing.llnl.gov/linux/slurm/>.
17
* Please also read the included file: DISCLAIMER.
16
19
* SLURM is free software; you can redistribute it and/or modify it under
17
20
* the terms of the GNU General Public License as published by the Free
43
46
# include "config.h"
53
#include <sys/types.h>
46
58
#include "src/common/bitstring.h"
59
#include "src/common/macros.h"
47
60
#include "src/common/xstring.h"
48
61
#include "src/slurmctld/locks.h"
49
62
#include "src/slurmctld/slurmctld.h"
52
#include <sys/types.h>
59
int idle_time, suspend_rate, resume_rate;
64
#if defined (HAVE_DECL_STRSIGNAL) && !HAVE_DECL_STRSIGNAL
66
extern char *strsignal(int);
68
#endif /* defined HAVE_DECL_STRSIGNAL && !HAVE_DECL_STRSIGNAL */
72
#define MAX_SHUTDOWN_DELAY 120 /* seconds to wait for child procs
73
* to exit after daemon shutdown
74
* request, then orphan or kill proc */
76
/* Records for tracking processes forked to suspend/resume nodes */
77
pid_t child_pid[PID_CNT]; /* pid of process */
78
time_t child_time[PID_CNT]; /* start time of process */
80
pthread_mutex_t power_mutex = PTHREAD_MUTEX_INITIALIZER;
81
bool power_save_enabled = false;
83
int idle_time, suspend_rate, resume_timeout, resume_rate, suspend_timeout;
60
84
char *suspend_prog = NULL, *resume_prog = NULL;
61
85
char *exc_nodes = NULL, *exc_parts = NULL;
62
time_t last_config = (time_t) 0;
86
time_t last_config = (time_t) 0, last_suspend = (time_t) 0;
87
uint16_t slurmd_timeout;
64
bitstr_t *exc_node_bitmap = NULL;
65
int suspend_cnt, resume_cnt;
89
bitstr_t *exc_node_bitmap = NULL, *suspend_node_bitmap = NULL;
90
int suspend_cnt, resume_cnt;
91
float suspend_cnt_f, resume_cnt_f;
67
93
static void _clear_power_config(void);
68
94
static void _do_power_work(void);
69
95
static void _do_resume(char *host);
70
96
static void _do_suspend(char *host);
71
97
static int _init_power_config(void);
72
static void _kill_zombies(void);
98
static void *_init_power_save(void *arg);
99
static int _kill_procs(void);
100
static int _reap_procs(void);
73
101
static void _re_wake(void);
74
102
static pid_t _run_prog(char *prog, char *arg);
103
static void _shutdown_power(void);
75
104
static bool _valid_prog(char *file_name);
77
106
/* Perform any power change work to nodes */
80
109
static time_t last_log = 0, last_work_scan = 0;
81
110
int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
82
111
time_t now = time(NULL), delta_t;
83
uint16_t base_state, susp_state;
112
uint16_t base_state, comp_state, susp_state;
84
113
bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
85
114
struct node_record *node_ptr;
115
bool run_suspend = false;
87
117
/* Set limit on counts of nodes to have state changed */
88
118
delta_t = now - last_work_scan;
89
119
if (delta_t >= 60) {
93
123
float rate = (60 - delta_t) / 60.0;
124
suspend_cnt_f *= rate;
125
resume_cnt_f *= rate;
127
suspend_cnt = (suspend_cnt_f + 0.5);
128
resume_cnt = (resume_cnt_f + 0.5);
130
if (now > (last_suspend + suspend_timeout)) {
131
/* ready to start another round of node suspends */
134
bit_nclear(suspend_node_bitmap, 0,
135
(node_record_count - 1));
136
last_suspend = (time_t) 0;
97
140
last_work_scan = now;
99
142
/* Build bitmaps identifying each node which should change state */
101
144
node_ptr = &node_record_table_ptr[i];
102
145
base_state = node_ptr->node_state & NODE_STATE_BASE;
103
146
susp_state = node_ptr->node_state & NODE_STATE_POWER_SAVE;
147
comp_state = node_ptr->node_state & NODE_STATE_COMPLETING;
108
&& ((suspend_rate == 0) || (suspend_cnt <= suspend_rate))
109
&& ((base_state == NODE_STATE_ALLOCATED)
110
|| (node_ptr->last_idle > (now - idle_time)))) {
111
if (wake_node_bitmap == NULL)
112
wake_node_bitmap = bit_alloc(node_record_count);
152
/* Resume nodes as appropriate */
154
((resume_rate == 0) || (resume_cnt < resume_rate)) &&
155
(bit_test(suspend_node_bitmap, i) == 0) &&
156
((base_state == NODE_STATE_ALLOCATED) ||
157
(node_ptr->last_idle > (now - idle_time)))) {
158
if (wake_node_bitmap == NULL) {
160
bit_alloc(node_record_count);
115
165
node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
166
bit_clear(power_node_bitmap, i);
167
node_ptr->node_state |= NODE_STATE_NO_RESPOND;
168
node_ptr->last_response = now + resume_timeout;
116
169
bit_set(wake_node_bitmap, i);
118
if ((susp_state == 0)
119
&& ((resume_rate == 0) || (resume_cnt <= resume_rate))
120
&& (base_state == NODE_STATE_IDLE)
121
&& (node_ptr->last_idle < (now - idle_time))
122
&& ((exc_node_bitmap == NULL) ||
172
/* Suspend nodes as appropriate */
175
((suspend_rate == 0) || (suspend_cnt < suspend_rate)) &&
176
(base_state == NODE_STATE_IDLE) &&
178
(node_ptr->last_idle < (now - idle_time)) &&
179
((exc_node_bitmap == NULL) ||
123
180
(bit_test(exc_node_bitmap, i) == 0))) {
124
if (sleep_node_bitmap == NULL)
125
sleep_node_bitmap = bit_alloc(node_record_count);
181
if (sleep_node_bitmap == NULL) {
183
bit_alloc(node_record_count);
128
188
node_ptr->node_state |= NODE_STATE_POWER_SAVE;
129
bit_set(sleep_node_bitmap, i);
189
bit_set(power_node_bitmap, i);
190
bit_set(sleep_node_bitmap, i);
191
bit_set(suspend_node_bitmap, i);
132
if ((now - last_log) > 600) {
133
info("Power save mode %d nodes", susp_total);
195
if (((now - last_log) > 600) && (susp_total > 0)) {
196
info("Power save mode: %d nodes", susp_total);
137
if ((wake_cnt == 0) && (sleep_cnt == 0))
138
_re_wake(); /* No work to be done now */
140
200
if (sleep_node_bitmap) {
142
202
nodes = bitmap2node_name(sleep_node_bitmap);
165
/* Just in case some resume calls failed, re-issue the requests
166
* periodically for active nodes. We do not increment resume_cnt
167
* since there should be no change in power requirements. */
225
/* If slurmctld crashes, the node state that it recovers could differ
226
* from the actual hardware state (e.g. ResumeProgram failed to complete).
227
* To address that, when a node that should be powered up for a running
228
* job is not responding, try running ResumeProgram again. */
168
229
static void _re_wake(void)
170
static time_t last_wakeup = 0;
171
static int last_inx = 0;
172
time_t now = time(NULL);
173
232
struct node_record *node_ptr;
174
233
bitstr_t *wake_node_bitmap = NULL;
175
int i, lim = MIN(node_record_count, 20);
176
uint16_t base_state, susp_state;
178
/* Run at most once per minute */
179
if ((now - last_wakeup) < 60)
183
for (i=0; i<lim; i++) {
184
node_ptr = &node_record_table_ptr[last_inx];
236
node_ptr = node_record_table_ptr;
237
for (i=0; i<node_record_count; i++, node_ptr++) {
185
238
base_state = node_ptr->node_state & NODE_STATE_BASE;
186
susp_state = node_ptr->node_state & NODE_STATE_POWER_SAVE;
188
if ((susp_state == 0) &&
189
((base_state == NODE_STATE_ALLOCATED) ||
190
(base_state == NODE_STATE_IDLE))) {
191
if (wake_node_bitmap == NULL)
192
wake_node_bitmap = bit_alloc(node_record_count);
193
bit_set(wake_node_bitmap, last_inx);
239
if ((base_state == NODE_STATE_ALLOCATED) &&
240
(node_ptr->node_state & NODE_STATE_NO_RESPOND) &&
241
((node_ptr->node_state & NODE_STATE_POWER_SAVE) == 0) &&
242
(bit_test(suspend_node_bitmap, i) == 0)) {
243
if (wake_node_bitmap == NULL) {
245
bit_alloc(node_record_count);
247
bit_set(wake_node_bitmap, i);
196
if (last_inx >= node_record_count)
200
251
if (wake_node_bitmap) {
202
253
nodes = bitmap2node_name(wake_node_bitmap);
205
255
info("power_save: rewaking nodes %s", nodes);
207
debug("power_save: rewaking nodes %s", nodes);
209
256
_run_prog(resume_prog, nodes);
211
258
error("power_save: bitmap2nodename");
255
307
if (child == 0) {
257
308
for (i=0; i<128; i++)
259
311
execl(program, arg0, arg1, NULL);
261
} else if (child < 0)
313
} else if (child < 0) {
262
314
error("fork: %m");
317
for (i=0; i<PID_CNT; i++) {
320
child_pid[i] = child;
321
child_time[i] = time(NULL);
325
error("power_save: filled child_pid array");
266
/* We don't bother to track individual process IDs,
267
* just clean everything up here. We could capture
268
* the value of "child" in _run_prog() if we want
269
* to track each process. */
270
static void _kill_zombies(void)
272
while (waitpid(-1, NULL, WNOHANG) > 0)
330
/* reap child processes previously forked to modify node state.
331
* return the count of empty slots in the child_pid array */
332
static int _reap_procs(void)
334
int empties = 0, delay, i, max_timeout, rc, status;
336
max_timeout = MAX(suspend_timeout, resume_timeout);
337
for (i=0; i<PID_CNT; i++) {
338
if (child_pid[i] == 0) {
342
rc = waitpid(child_pid[i], &status, WNOHANG);
346
delay = difftime(time(NULL), child_time[i]);
347
if (delay > max_timeout) {
348
info("power_save: program %d ran for %d sec",
349
(int) child_pid[i], delay);
352
if (WIFEXITED(status)) {
353
rc = WEXITSTATUS(status);
355
error("power_save: program exit status of %d",
358
} else if (WIFSIGNALED(status)) {
359
error("power_save: program signalled: %s",
360
strsignal(WTERMSIG(status)));
364
child_time[i] = (time_t) 0;
369
/* kill (or orphan) child processes previously forked to modify node state.
370
* return the count of killed/orphaned processes */
371
static int _kill_procs(void)
373
int killed = 0, i, rc, status;
375
for (i=0; i<PID_CNT; i++) {
376
if (child_pid[i] == 0)
379
rc = waitpid(child_pid[i], &status, WNOHANG);
381
#ifdef POWER_SAVE_KILL_PROCS
382
error("power_save: killing process %d",
384
kill((0-child_pid[i]), SIGKILL);
386
error("power_save: orphaning process %d",
391
/* process already completed */
394
child_time[i] = (time_t) 0;
399
/* shutdown power save daemons */
400
static void _shutdown_power(void)
402
int i, proc_cnt, max_timeout;
404
max_timeout = MAX(suspend_timeout, resume_timeout);
405
/* Try to avoid orphan processes */
407
proc_cnt = PID_CNT - _reap_procs();
408
if (proc_cnt == 0) /* all procs completed */
410
if (i >= max_timeout) {
411
error("power_save: orphaning %d processes which are "
412
"not terminating so slurmctld can exit",
417
info("power_save: waiting for %d processes to "
418
"complete", proc_cnt);
419
} else if (i % 5 == 0) {
420
debug("power_save: waiting for %d processes to "
421
"complete", proc_cnt);
276
427
/* Free all allocated memory */
306
460
slurm_conf_unlock();
308
462
if (idle_time < 0) { /* not an error */
309
debug("power_save module disabled, idle_time < 0");
463
debug("power_save module disabled, SuspendTime < 0");
312
466
if (suspend_rate < 1) {
313
error("power_save module disabled, suspend_rate < 1");
467
error("power_save module disabled, SuspendRate < 1");
316
470
if (resume_rate < 1) {
317
error("power_save module disabled, resume_rate < 1");
320
if (suspend_prog == NULL)
321
info("WARNING: power_save module has NULL suspend program");
322
else if (!_valid_prog(suspend_prog)) {
323
error("power_save module disabled, invalid suspend program %s",
327
if (resume_prog == NULL)
328
info("WARNING: power_save module has NULL resume program");
329
else if (!_valid_prog(resume_prog)) {
330
error("power_save module disabled, invalid resume program %s",
471
error("power_save module disabled, ResumeRate < 1");
474
if (suspend_prog == NULL) {
475
error("power_save module disabled, NULL SuspendProgram");
477
} else if (!_valid_prog(suspend_prog)) {
478
error("power_save module disabled, invalid SuspendProgram %s",
482
if (resume_prog == NULL) {
483
error("power_save module disabled, NULL ResumeProgram");
485
} else if (!_valid_prog(resume_prog)) {
486
error("power_save module disabled, invalid ResumeProgram %s",
336
&& (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
492
(node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
337
493
error("power_save module disabled, "
338
"invalid excluded nodes %s", exc_nodes);
494
"invalid SuspendExcNodes %s", exc_nodes);
382
539
if (file_name[0] != '/') {
383
debug("program %s not absolute pathname", file_name);
540
debug("power_save program %s not absolute pathname",
545
if (access(file_name, X_OK) != 0) {
546
debug("power_save program %s not executable", file_name);
387
550
if (stat(file_name, &buf)) {
388
debug("program %s not found", file_name);
391
if (!S_ISREG(buf.st_mode)) {
392
debug("program %s not regular file", file_name);
551
debug("power_save program %s not found", file_name);
395
554
if (buf.st_mode & 022) {
396
debug("program %s has group or world write permission",
555
debug("power_save program %s has group or "
556
"world write permission",
564
/* start_power_mgr - Start power management thread as needed. The thread
565
* terminates automatically at slurmctld shutdown time.
566
* IN thread_id - pointer to thread ID of the started pthread.
568
extern void start_power_mgr(pthread_t *thread_id)
570
pthread_attr_t thread_attr;
572
slurm_mutex_lock(&power_mutex);
573
if (power_save_enabled) { /* Already running */
574
slurm_mutex_unlock(&power_mutex);
577
power_save_enabled = true;
578
slurm_mutex_unlock(&power_mutex);
580
slurm_attr_init(&thread_attr);
581
while (pthread_create(thread_id, &thread_attr, _init_power_save,
583
error("pthread_create %m");
586
slurm_attr_destroy(&thread_attr);
404
* init_power_save - initialize the power save module. Started as a
590
* init_power_save - Initialize the power save module. Started as a
405
591
* pthread. Terminates automatically at slurmctld shutdown time.
406
592
* Input and output are unused.
408
extern void *init_power_save(void *arg)
594
static void *_init_power_save(void *arg)
410
/* Locks: Write node, read jobs and partitions */
596
/* Locks: Read nodes */
597
slurmctld_lock_t node_read_lock = {
598
NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
599
/* Locks: Write nodes */
411
600
slurmctld_lock_t node_write_lock = {
412
NO_LOCK, READ_LOCK, WRITE_LOCK, READ_LOCK };
413
time_t now, last_power_scan = 0;
601
NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
602
time_t now, boot_time = 0, last_power_scan = 0;
415
604
if (_init_power_config())
607
suspend_node_bitmap = bit_alloc(node_record_count);
608
if (suspend_node_bitmap == NULL)
609
fatal("power_save: malloc error");
418
611
while (slurmctld_config.shutdown_time == 0) {
422
if ((last_config != slurmctld_conf.last_update)
423
&& (_init_power_config()))
614
if (_reap_procs() < 2) {
615
debug("power_save programs getting backlogged");
619
if ((last_config != slurmctld_conf.last_update) &&
620
(_init_power_config())) {
621
info("power_save mode has been disabled due to "
622
"configuration changes");
426
/* Only run every 60 seconds or after
427
* a node state change, whichever
429
626
now = time(NULL);
430
if ((last_node_update < last_power_scan)
431
&& (now < (last_power_scan + 60)))
434
lock_slurmctld(node_write_lock);
436
unlock_slurmctld(node_write_lock);
437
last_power_scan = now;
630
/* Only run every 60 seconds or after a node state change,
631
* whichever happens first */
632
if ((last_node_update >= last_power_scan) ||
633
(now >= (last_power_scan + 60))) {
634
lock_slurmctld(node_write_lock);
636
unlock_slurmctld(node_write_lock);
637
last_power_scan = now;
640
if (slurmd_timeout &&
641
(now > (boot_time + (slurmd_timeout / 2)))) {
642
lock_slurmctld(node_read_lock);
644
unlock_slurmctld(node_read_lock);
645
/* prevent additional executions */
646
boot_time += (365 * 24 * 60 * 60);
440
651
fini: _clear_power_config();
652
FREE_NULL_BITMAP(suspend_node_bitmap);
654
slurm_mutex_lock(&power_mutex);
655
power_save_enabled = false;
656
slurm_mutex_unlock(&power_mutex);