1
/*****************************************************************************\
2
* basil_interface.c - slurmctld interface to BASIL, Cray's Batch Application
3
* Scheduler Interface Layer (BASIL). In order to support development,
4
* these functions will provide basic BASIL-like functionality even
5
* without a BASIL command being present.
6
*****************************************************************************
7
* Copyright (C) 2009 Lawrence Livermore National Security.
8
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
9
* Written by Morris Jette <jette1@llnl.gov>
10
* CODE-OCEC-09-009. All rights reserved.
12
* This file is part of SLURM, a resource management program.
13
* For details, see <https://computing.llnl.gov/linux/slurm/>.
14
* Please also read the included file: DISCLAIMER.
16
* SLURM is free software; you can redistribute it and/or modify it under
17
* the terms of the GNU General Public License as published by the Free
18
* Software Foundation; either version 2 of the License, or (at your option)
21
* In addition, as a special exception, the copyright holders give permission
22
* to link the code of portions of this program with the OpenSSL library under
23
* certain conditions as described in each individual source file, and
24
* distribute linked combinations including the two. You must obey the GNU
25
* General Public License in all respects for all of the code used other than
26
* OpenSSL. If you modify file(s) with this exception, you may extend this
27
* exception to your version of the file(s), but you are not obligated to do
28
* so. If you do not wish to do so, delete this exception statement from your
29
* version. If you delete this exception statement from all source files in
30
* the program, then also delete it here.
32
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
33
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
34
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
37
* You should have received a copy of the GNU General Public License along
38
* with SLURM; if not, write to the Free Software Foundation, Inc.,
39
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
40
\*****************************************************************************/
42
/* FIXME: Document that ALPS must be started before SLURM */
43
/* FIXME: Document BASIL_RESERVATION_ID env var */
47
#endif /* HAVE_CONFIG_H */
49
#include <slurm/slurm_errno.h>
53
#include "src/common/log.h"
54
#include "src/common/node_select.h"
55
#include "src/common/xmalloc.h"
56
#include "src/common/xstring.h"
57
#include "src/slurmctld/basil_interface.h"
58
#include "src/slurmctld/slurmctld.h"
64
static int last_res_id = 0;
65
#endif /* !APBASIL_LOC */
68
/* Make sure that each SLURM node has a BASIL node ID */
69
static void _validate_basil_node_id(void)
73
struct node_record *node_ptr = node_record_table_ptr;
75
for (i=0; i<node_record_cnt; i++, node_ptr++)
76
if (node_ptr->basil_node_id != NO_VAL)
78
base_state = node_ptr->state & NODE_STATE_BASE;
79
if (base_state == NODE_STATE_DOWN)
82
error("Node %s has no basil node_id", node_ptr->name);
83
last_node_update = time(NULL);
84
set_node_down(node_ptr->name, "No BASIL node_id");
87
#endif /* APBASIL_LOC */
88
#endif /* HAVE_CRAY_XT */
91
* basil_query - Query BASIL for node and reservation state.
92
* Execute once at slurmctld startup and periodically thereafter.
95
extern int basil_query(void)
97
int error_code = SLURM_SUCCESS;
100
struct config_record *config_ptr;
101
struct node_record *node_ptr;
102
struct job_record *job_ptr;
103
ListIterator job_iterator;
106
char *reason, *res_id;
107
static bool first_run = true;
109
/* Issue the BASIL QUERY request */
110
if (request_failure) {
111
fatal("basil query error: %s", "TBD");
114
debug("basil query initiated");
117
/* Set basil_node_id to NO_VAL since the default value
118
* of zero is a valid BASIL node ID */
119
node_ptr = node_record_table_ptr;
120
for (i=0; i<node_record_cnt; i++, node_ptr++)
121
node_ptr->basil_node_id = NO_VAL;
125
/* Validate configuration for each node that BASIL reports */
126
for (each_basil_node) {
128
/* Log node state according to BASIL */
129
info("basil query: name=%s arch=%s",
130
basil_node_name, basil_node_arch, etc.);
131
#endif /* BASIL_DEBUG */
133
/* NOTE: Cray should provide X-, Y- and Z-coordinates
134
* in the future. When that happens, we'll want to use
135
* those numbers to generate the hostname:
136
* slurm_host_name = xmalloc(sizeof(conf->node_prefix) + 4);
137
* sprintf(slurm_host_name, "%s%d%d%d", basil_node_name, X,Y,Z);
138
* Until then the node name must contain a 3-digit numeric
139
* suffix specifying the X-, Y- and Z-coordinates.
141
node_ptr = find_node_record(basil_node_name);
142
if (node_ptr == NULL) {
143
error("basil node %s not found in slurm",
148
/* Record BASIL's node_id for use in reservations */
149
node_ptr->basil_node_id = basil_node_id;
151
/* Update architecture in slurmctld's node record */
152
if (node_ptr->arch == NULL) {
153
xfree(node_ptr->arch);
154
node_ptr->arch = xstrdup(basil_node_arch);
157
/* Update slurmctld's node state if necessary */
159
base_state = node_ptr->state & NODE_STATE_BASE;
160
if (base_state != NODE_STATE_DOWN) {
161
if (strcmp(basil_state, "UP"))
162
reason = "basil state not UP";
163
else if (strcmp(basil_role, "BATCH"))
164
reason = "basil role not BATCH";
167
/* Calculate the total count of processors and
168
* MB of memory on the node */
169
config_ptr = node_ptr->config_ptr;
170
if ((slurmctld_conf.fast_schedule != 2) &&
171
(basil_cpus < config_ptr->cpus)) {
172
error("Node %s has low cpu count %d",
173
node_ptr->name, basil_cpus);
176
node_ptr->cpus = basil_cpus;
177
if ((slurmctld_conf.fast_schedule != 2) &&
178
(basil_memory < config_ptr->real_memory)) {
179
error("Node %s has low real_memory size %d",
180
node_ptr->name, basil_memory);
181
reason = "Low RealMemory";
183
node_ptr->real_memory = basil_memory;
186
last_node_update = time(NULL);
187
set_node_down(node_ptr->name, reason);
190
_validate_basil_node_id();
192
/* Confirm that each BASIL reservation is still valid,
193
* purge vestigial reservations */
194
for (each_basil_reservation) {
196
job_iterator = list_iterator_create(job_list);
197
while ((job_ptr = (struct job_record *)
198
list_next(job_iterator))) {
199
select_g_get_jobinfo(job_ptr->select_jobinfo,
200
SELECT_DATA_RESV_ID, &res_id);
201
found = !strcmp(res_id, basil_reservation_id);
206
list_iterator_destroy(job_iterator);
208
error("vestigial basil reservation %s being removed",
209
basil_reservation_id);
210
basil_dealloc(basil_reservation_id);
214
struct job_record *job_ptr;
215
ListIterator job_iterator;
219
/* Capture the highest reservation ID recorded to avoid re-use */
220
job_iterator = list_iterator_create(job_list);
221
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
223
select_g_get_jobinfo(job_ptr->select_jobinfo,
224
SELECT_DATA_RESV_ID, &res_id);
226
tmp = strchr(res_id, '_');
228
job_res_id = atoi(tmp+1);
229
last_res_id = MAX(last_res_id, job_res_id);
234
list_iterator_destroy(job_iterator);
235
debug("basil_query() executed, last_res_id=%d", last_res_id);
236
#endif /* APBASIL_LOC */
237
#endif /* HAVE_CRAY_XT */
243
* basil_reserve - create a BASIL reservation.
244
* IN job_ptr - pointer to job which has just been allocated resources
245
* RET 0 or error code, job will abort or be requeued on failure
247
extern int basil_reserve(struct job_record *job_ptr)
249
int error_code = SLURM_SUCCESS;
252
/* Issue the BASIL RESERVE request */
253
if (request_failure) {
254
error("basil reserve error: %s", "TBD");
257
select_g_set_jobinfo(job_ptr->select_jobinfo,
258
SELECT_DATA_RESV_ID, reservation_id);
259
debug("basil reservation made job_id=%u resv_id=%s",
260
job_ptr->job_id, reservation_id);
262
char reservation_id[32];
263
snprintf(reservation_id, sizeof(reservation_id),
264
"resv_%d", ++last_res_id);
265
select_g_set_jobinfo(job_ptr->select_jobinfo,
266
SELECT_DATA_RESV_ID, reservation_id);
267
debug("basil reservation made job_id=%u resv_id=%s",
268
job_ptr->job_id, reservation_id);
269
#endif /* APBASIL_LOC */
270
#endif /* HAVE_CRAY_XT */
275
* basil_release - release a BASIL reservation by job.
276
* IN job_ptr - pointer to job which has just been deallocated resources
277
* RET 0 or error code
279
extern int basil_release(struct job_record *job_ptr)
281
int error_code = SLURM_SUCCESS;
283
char *reservation_id = NULL;
284
select_g_get_jobinfo(job_ptr->select_jobinfo,
285
SELECT_DATA_RESV_ID, &reservation_id);
286
if (reservation_id) {
287
error_code = basil_release_id(reservation_id);
288
xfree(reservation_id);
290
#endif /* HAVE_CRAY_XT */
295
* basil_release_id - release a BASIL reservation by ID.
296
* IN reservation_id - ID of reservation to release
297
* RET 0 or error code
299
extern int basil_release_id(char *reservation_id)
301
int error_code = SLURM_SUCCESS;
304
/* Issue the BASIL RELEASE request */
305
if (request_failure) {
306
error("basil release of %s error: %s", reservation_id, "TBD");
309
debug("basil release of reservation %s complete", reservation_id);
311
debug("basil release of reservation %s complete", reservation_id);
312
#endif /* APBASIL_LOC */
313
#endif /* HAVE_CRAY_XT */