117
114
* overwritten when linking with the slurmctld.
119
116
#if defined (__APPLE__)
120
slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
121
struct node_record *node_record_table_ptr __attribute__((weak_import));
122
List part_list __attribute__((weak_import));
123
List job_list __attribute__((weak_import));
124
int node_record_count __attribute__((weak_import));
125
time_t last_node_update __attribute__((weak_import));
126
struct switch_record *switch_record_table __attribute__((weak_import));
127
int switch_record_cnt __attribute__((weak_import));
128
bitstr_t *avail_node_bitmap __attribute__((weak_import));
129
bitstr_t *idle_node_bitmap __attribute__((weak_import));
130
List slurm_find_preemptable_jobs(struct job_record *job_ptr)
131
__attribute__((weak_import));
117
slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
118
struct node_record *node_record_table_ptr __attribute__((weak_import));
119
List part_list __attribute__((weak_import));
120
List job_list __attribute__((weak_import));
121
int node_record_count __attribute__((weak_import));
122
time_t last_node_update __attribute__((weak_import));
123
struct switch_record *switch_record_table __attribute__((weak_import));
124
int switch_record_cnt __attribute__((weak_import));
125
bitstr_t *avail_node_bitmap __attribute__((weak_import));
126
bitstr_t *idle_node_bitmap __attribute__((weak_import));
133
slurm_ctl_conf_t slurmctld_conf;
134
struct node_record *node_record_table_ptr;
137
int node_record_count;
138
time_t last_node_update;
139
struct switch_record *switch_record_table;
140
int switch_record_cnt;
141
bitstr_t *avail_node_bitmap;
142
bitstr_t *idle_node_bitmap;
143
List slurm_find_preemptable_jobs(struct job_record *job_ptr);
128
slurm_ctl_conf_t slurmctld_conf;
129
struct node_record *node_record_table_ptr;
132
int node_record_count;
133
time_t last_node_update;
134
struct switch_record *switch_record_table;
135
int switch_record_cnt;
136
bitstr_t *avail_node_bitmap;
137
bitstr_t *idle_node_bitmap;
169
163
* of the plugin. If major and minor revisions are desired, the major
170
164
* version number may be multiplied by a suitable magnitude constant such
171
165
* as 100 or 1000. Various SLURM versions will likely require a certain
172
* minimum versions for their plugins as the node selection API matures.
166
* minimum version for their plugins as the node selection API matures.
174
168
const char plugin_name[] = "Consumable Resources (CR) Node Selection plugin";
175
169
const char plugin_type[] = "select/cons_res";
170
const uint32_t plugin_id = 101;
176
171
const uint32_t plugin_version = 91;
177
172
const uint32_t pstate_version = 7; /* version control on saved state */
179
select_type_plugin_info_t cr_type = CR_CPU; /* cr_type is overwritten in init() */
174
uint16_t cr_type = CR_CPU; /* cr_type is overwritten in init() */
176
uint32_t select_debug_flags;
181
177
uint16_t select_fast_schedule;
183
179
uint16_t *cr_node_num_cores = NULL;
184
uint32_t *cr_num_core_count = NULL;
180
uint32_t *cr_node_cores_offset = NULL;
185
181
struct part_res_record *select_part_record = NULL;
186
182
struct node_res_record *select_node_record = NULL;
187
183
struct node_use_record *select_node_usage = NULL;
184
static bool select_state_initializing = true;
188
185
static int select_node_cnt = 0;
189
186
static bool job_preemption_enabled = false;
190
187
static bool job_preemption_killing = false;
207
206
uint32_t min_nodes, uint32_t max_nodes,
208
207
uint32_t req_nodes, uint16_t job_node_req);
209
208
static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
210
uint32_t min_nodes, uint32_t max_nodes,
211
uint32_t req_nodes, uint16_t job_node_req,
212
List preemptee_candidates, List *preemptee_job_list);
209
uint32_t min_nodes, uint32_t max_nodes,
210
uint32_t req_nodes, uint16_t job_node_req,
211
List preemptee_candidates, List *preemptee_job_list);
216
213
static void _dump_job_res(struct job_resources *job) {
223
220
info("DEBUG: Dump job_resources: nhosts %u cb %s", job->nhosts, str);
226
static void _dump_nodes()
223
static void _dump_nodes(void)
225
struct node_record *node_ptr;
230
229
for (i=0; i<select_node_cnt; i++) {
230
node_ptr = select_node_record[i].node_ptr;
231
231
info("node:%s cpus:%u c:%u s:%u t:%u mem:%u a_mem:%u state:%d",
232
select_node_record[i].node_ptr->name,
233
select_node_record[i].cpus,
234
select_node_record[i].cores,
235
select_node_record[i].sockets,
236
select_node_record[i].vpus,
237
select_node_record[i].real_memory,
238
select_node_usage[i].alloc_memory,
239
select_node_usage[i].node_state);
233
select_node_record[i].cpus,
234
select_node_record[i].cores,
235
select_node_record[i].sockets,
236
select_node_record[i].vpus,
237
select_node_record[i].real_memory,
238
select_node_usage[i].alloc_memory,
239
select_node_usage[i].node_state);
241
if (select_node_usage[i].gres_list)
242
gres_list = select_node_usage[i].gres_list;
244
gres_list = node_ptr->gres_list;
246
gres_plugin_node_state_log(gres_list, node_ptr->name);
276
extern bool cr_preemption_enabled(void)
278
if (!job_preemption_tested) {
279
uint16_t mode = slurm_get_preempt_mode();
280
mode &= ~PREEMPT_MODE_GANG;
281
if (mode == PREEMPT_MODE_SUSPEND)
282
job_preemption_enabled = true;
283
else if ((mode == PREEMPT_MODE_CANCEL) ||
284
(mode == PREEMPT_MODE_CHECKPOINT) ||
285
(mode == PREEMPT_MODE_REQUEUE)) {
286
job_preemption_enabled = true;
287
job_preemption_killing = true;
289
job_preemption_tested = true;
291
return job_preemption_enabled;
293
extern bool cr_preemption_killing(void)
295
(void) cr_preemption_enabled();
296
return job_preemption_killing;
300
#define CR_NUM_CORE_ARRAY_INCREMENT 8
302
/* (re)set cr_node_num_cores and cr_num_core_count arrays */
281
/* (re)set cr_node_num_cores and cr_node_cores_offset arrays */
303
282
static void _init_global_core_data(struct node_record *node_ptr, int node_cnt)
305
uint32_t i, n, array_size = CR_NUM_CORE_ARRAY_INCREMENT;
307
xfree(cr_num_core_count);
308
286
xfree(cr_node_num_cores);
309
cr_node_num_cores = xmalloc(array_size * sizeof(uint16_t));
310
cr_num_core_count = xmalloc(array_size * sizeof(uint32_t));
312
for (i = 0, n = 0; n < node_cnt; n++) {
287
cr_node_num_cores = xmalloc(node_cnt * sizeof(uint16_t));
289
xfree(cr_node_cores_offset);
290
cr_node_cores_offset = xmalloc((node_cnt+1) * sizeof(uint32_t));
292
for (n = 0; n < node_cnt; n++) {
314
294
if (select_fast_schedule) {
315
295
cores = node_ptr[n].config_ptr->cores;
318
298
cores = node_ptr[n].cores;
319
299
cores *= node_ptr[n].sockets;
321
if (cr_node_num_cores[i] == cores) {
322
cr_num_core_count[i]++;
325
if (cr_num_core_count[i] > 0) {
326
if (++i >= array_size) {
327
array_size += CR_NUM_CORE_ARRAY_INCREMENT;
328
xrealloc(cr_node_num_cores,
329
array_size * sizeof(uint16_t));
330
xrealloc(cr_num_core_count,
331
array_size * sizeof(uint32_t));
334
cr_node_num_cores[i] = cores;
335
cr_num_core_count[i] = 1;
337
/* make sure we '0'-terminate the arrays */
338
if (++i >= array_size) {
340
xrealloc(cr_node_num_cores, array_size * sizeof(uint16_t));
341
xrealloc(cr_num_core_count, array_size * sizeof(uint32_t));
301
cr_node_num_cores[n] = cores;
303
cr_node_cores_offset[n] = cr_node_cores_offset[n-1] +
304
cr_node_num_cores[n-1] ;
306
cr_node_cores_offset[0] = 0;
309
/* an extra value is added to get the total number of cores */
310
/* as cr_get_coremap_offset is sometimes used to get the total */
311
/* number of cores in the cluster */
312
cr_node_cores_offset[node_cnt] = cr_node_cores_offset[node_cnt-1] +
313
cr_node_num_cores[node_cnt-1] ;
346
318
/* return the coremap index to the first core of the given node */
347
319
extern uint32_t cr_get_coremap_offset(uint32_t node_index)
351
uint32_t n = cr_num_core_count[0];
352
for (i = 0; cr_num_core_count[i] && node_index > n; i++) {
353
cindex += cr_node_num_cores[i] * cr_num_core_count[i];
354
n += cr_num_core_count[i+1];
356
if (!cr_num_core_count[i])
358
n -= cr_num_core_count[i];
360
cindex += cr_node_num_cores[i] * (node_index-n);
365
/* return the total number of cores in a given node */
366
extern uint32_t cr_get_node_num_cores(uint32_t node_index)
369
uint32_t pos = cr_num_core_count[i++];
370
while (node_index >= pos) {
371
pos += cr_num_core_count[i++];
373
return cr_node_num_cores[i-1];
321
return cr_node_cores_offset[node_index];
554
511
uint32_t size = bit_size(r_ptr->row_bitmap);
555
512
bit_nclear(r_ptr->row_bitmap, 0, size-1);
557
add_job_to_cores(job, &(r_ptr->row_bitmap), cr_node_num_cores,
514
add_job_to_cores(job, &(r_ptr->row_bitmap), cr_node_num_cores);
560
516
/* add the job to the job_list */
561
517
if (r_ptr->num_jobs >= r_ptr->job_list_size) {
562
518
r_ptr->job_list_size += 8;
563
519
xrealloc(r_ptr->job_list, r_ptr->job_list_size *
564
sizeof(struct job_resources *));
520
sizeof(struct job_resources *));
566
522
r_ptr->job_list[r_ptr->num_jobs++] = job;
750
for (i = 0; i < num_jobs; i++) {
751
char cstr[64], nstr[64];
752
if (tmpjobs[i]->core_bitmap) {
753
bit_fmt(cstr, (sizeof(cstr)-1) ,
754
tmpjobs[i]->core_bitmap);
756
sprintf(cstr, "[no core_bitmap]");
757
if (tmpjobs[i]->node_bitmap) {
758
bit_fmt(nstr, (sizeof(nstr)-1),
759
tmpjobs[i]->node_bitmap);
761
sprintf(nstr, "[no node_bitmap]");
762
info ("DEBUG: jstart %d job nb %s cb %s", jstart[i], nstr,
706
if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
707
for (i = 0; i < num_jobs; i++) {
708
char cstr[64], nstr[64];
709
if (tmpjobs[i]->core_bitmap) {
710
bit_fmt(cstr, (sizeof(cstr)-1) ,
711
tmpjobs[i]->core_bitmap);
713
sprintf(cstr, "[no core_bitmap]");
714
if (tmpjobs[i]->node_bitmap) {
715
bit_fmt(nstr, (sizeof(nstr)-1),
716
tmpjobs[i]->node_bitmap);
718
sprintf(nstr, "[no node_bitmap]");
719
info("DEBUG: jstart %d job nb %s cb %s", jstart[i],
767
724
/* add jobs to the rows */
768
725
for (j = 0; j < num_jobs; j++) {
859
816
* - add 'struct job_resources' resources to 'struct part_res_record'
860
817
* - add job's memory requirements to 'struct node_res_record'
862
* if action = 0 then add cores and memory
863
* if action = 1 then only add memory (job is suspended)
864
* if action = 2 then only add cores (job is resumed)
819
* if action = 0 then add cores and memory (starting new job)
820
* if action = 1 then only add memory (adding suspended job)
821
* if action = 2 then only add cores (suspended job is resumed)
866
823
static int _add_job_to_res(struct job_record *job_ptr, int action)
868
825
struct job_resources *job = job_ptr->job_resrcs;
826
struct node_record *node_ptr;
869
827
struct part_res_record *p_ptr;
872
831
if (!job || !job->core_bitmap) {
877
836
debug3("cons_res: _add_job_to_res: job %u act %d ", job_ptr->job_id,
886
for (i = 0, n = 0; i < select_node_cnt; i++) {
887
if (!bit_test(job->node_bitmap, i))
839
if (select_debug_flags & DEBUG_FLAG_CPU_BIND)
842
for (i = 0, n = -1; i < select_node_cnt; i++) {
843
if (!bit_test(job->node_bitmap, i))
847
node_ptr = select_node_record[i].node_ptr;
849
if (select_node_usage[i].gres_list)
850
gres_list = select_node_usage[i].gres_list;
852
gres_list = node_ptr->gres_list;
853
gres_plugin_job_alloc(job_ptr->gres_list, gres_list,
854
job->nhosts, n, job->cpus[n],
855
job_ptr->job_id, node_ptr->name);
856
gres_plugin_node_state_log(gres_list, node_ptr->name);
860
if (job->memory_allocated[n] == 0)
861
continue; /* node lost by job resizing */
889
862
select_node_usage[i].alloc_memory +=
890
job->memory_allocated[n];
863
job->memory_allocated[n];
891
864
if ((select_node_usage[i].alloc_memory >
892
865
select_node_record[i].real_memory)) {
893
error("error: node %s mem is overallocated "
895
select_node_record[i].node_ptr->name,
866
error("cons_res: node %s memory is "
867
"overallocated (%u) for job %u",
896
869
select_node_usage[i].alloc_memory,
897
870
job_ptr->job_id);
963
934
struct job_record *job_ptr, int action)
965
936
struct job_resources *job = job_ptr->job_resrcs;
937
struct node_record *node_ptr;
938
int first_bit, last_bit;
942
if (select_state_initializing) {
943
/* Ignore job removal until select/cons_res data structure
944
* values are set by select_p_reconfigure() */
945
return SLURM_SUCCESS;
968
947
if (!job || !job->core_bitmap) {
969
948
error("job %u has no select data", job_ptr->job_id);
970
949
return SLURM_ERROR;
973
debug3("cons_res: _rm_job_from_res: job %u act %d", job_ptr->job_id,
979
/* subtract memory */
981
for (i = 0, n = 0; i < select_node_cnt; i++) {
982
if (!bit_test(job->node_bitmap, i))
952
debug3("cons_res: _rm_job_from_res: job %u action %d", job_ptr->job_id,
954
if (select_debug_flags & DEBUG_FLAG_CPU_BIND)
957
first_bit = bit_ffs(job->node_bitmap);
958
last_bit = bit_fls(job->node_bitmap);
959
for (i = first_bit, n = -1; i <= last_bit; i++) {
960
if (!bit_test(job->node_bitmap, i))
964
node_ptr = node_record_table_ptr + i;
966
if (node_usage[i].gres_list)
967
gres_list = node_usage[i].gres_list;
969
gres_list = node_ptr->gres_list;
970
gres_plugin_job_dealloc(job_ptr->gres_list, gres_list,
973
gres_plugin_node_state_log(gres_list, node_ptr->name);
977
if (job->memory_allocated[n] == 0)
978
continue; /* no memory allocated */
984
979
if (node_usage[i].alloc_memory <
985
980
job->memory_allocated[n]) {
986
error("error: node %s mem is underallocated "
987
"(%u-%u) for job %u",
988
select_node_record[i].node_ptr->name,
981
error("cons_res: node %s memory is "
982
"underallocated (%u-%u) for job %u",
989
984
node_usage[i].alloc_memory,
990
985
job->memory_allocated[n],
991
986
job_ptr->job_id);
992
987
node_usage[i].alloc_memory = 0;
994
989
node_usage[i].alloc_memory -=
995
job->memory_allocated[n];
990
job->memory_allocated[n];
1056
1049
* the removal of this job. If all cores are now
1057
1050
* available, set node_state = NODE_CR_AVAILABLE
1059
for (n = 0; n < select_node_cnt; n++) {
1060
if (bit_test(job->node_bitmap, n) == 0)
1052
for (i = 0, n = -1; i < select_node_cnt; i++) {
1053
if (bit_test(job->node_bitmap, i) == 0)
1062
if (node_usage[n].node_state >=
1056
if (job->cpus[n] == 0)
1057
continue; /* node lost by job resize */
1058
if (node_usage[i].node_state >=
1063
1059
job->node_req) {
1064
node_usage[n].node_state -=
1060
node_usage[i].node_state -=
1067
1063
error("cons_res:_rm_job_from_res: "
1068
"node_state mis-count");
1069
node_usage[n].node_state =
1064
"node_state mis-count");
1065
node_usage[i].node_state =
1076
1072
return SLURM_SUCCESS;
1075
static int _rm_job_from_one_node(struct job_record *job_ptr,
1076
struct node_record *node_ptr)
1078
struct part_res_record *part_record_ptr = select_part_record;
1079
struct node_use_record *node_usage = select_node_usage;
1080
struct job_resources *job = job_ptr->job_resrcs;
1081
struct part_res_record *p_ptr;
1082
int first_bit, last_bit;
1086
if (!job || !job->core_bitmap) {
1087
error("job %u has no select data", job_ptr->job_id);
1091
debug3("cons_res: _rm_job_from_one_node: job %u node %s",
1092
job_ptr->job_id, node_ptr->name);
1093
if (select_debug_flags & DEBUG_FLAG_CPU_BIND)
1096
/* subtract memory */
1097
node_inx = node_ptr - node_record_table_ptr;
1098
first_bit = bit_ffs(job->node_bitmap);
1099
last_bit = bit_fls(job->node_bitmap);
1100
for (i = first_bit, n = -1; i <= last_bit; i++) {
1101
if (!bit_test(job->node_bitmap, i))
1107
if (job->cpus[n] == 0) {
1108
info("attempt to remove node %s from job %u again",
1109
node_ptr->name, job_ptr->job_id);
1110
return SLURM_SUCCESS;
1113
if (node_usage[i].gres_list)
1114
gres_list = node_usage[i].gres_list;
1116
gres_list = node_ptr->gres_list;
1117
gres_plugin_job_dealloc(job_ptr->gres_list, gres_list, n,
1118
job_ptr->job_id, node_ptr->name);
1119
gres_plugin_node_state_log(gres_list, node_ptr->name);
1122
job->ncpus = build_job_resources_cpu_array(job);
1123
clear_job_resources_node(job, n);
1124
if (node_usage[i].alloc_memory < job->memory_allocated[n]) {
1125
error("cons_res: node %s memory is underallocated "
1126
"(%u-%u) for job %u",
1127
node_ptr->name, node_usage[i].alloc_memory,
1128
job->memory_allocated[n], job_ptr->job_id);
1129
node_usage[i].alloc_memory = 0;
1131
node_usage[i].alloc_memory -= job->memory_allocated[n];
1132
job->memory_allocated[n] = 0;
1136
if (IS_JOB_SUSPENDED(job_ptr))
1137
return SLURM_SUCCESS; /* No cores allocated to the job now */
1139
/* subtract cores, reconstruct rows with remaining jobs */
1140
if (!job_ptr->part_ptr) {
1141
error("cons_res: removed job %u does not have a partition "
1142
"assigned", job_ptr->job_id);
1146
for (p_ptr = part_record_ptr; p_ptr; p_ptr = p_ptr->next) {
1147
if (p_ptr->part_ptr == job_ptr->part_ptr)
1151
error("cons_res: removed job %u could not find part %s",
1152
job_ptr->job_id, job_ptr->part_ptr->name);
1157
return SLURM_SUCCESS;
1159
/* look for the job in the partition's job_list */
1161
for (i = 0; i < p_ptr->num_rows; i++) {
1163
for (j = 0; j < p_ptr->row[i].num_jobs; j++) {
1164
if (p_ptr->row[i].job_list[j] != job)
1166
debug3("cons_res: found job %u in part %s row %u",
1167
job_ptr->job_id, p_ptr->part_ptr->name, i);
1168
/* found job - we're done, don't actually remove */
1170
i = p_ptr->num_rows;
1175
error("cons_res: could not find job %u in partition %s",
1176
job_ptr->job_id, p_ptr->part_ptr->name);
1181
/* job was found and removed from core-bitmap, so refresh CR bitmaps */
1182
_build_row_bitmaps(p_ptr);
1184
/* Adjust the node_state of the node removed from this job.
1185
* If all cores are now available, set node_state = NODE_CR_AVAILABLE */
1186
if (node_usage[node_inx].node_state >= job->node_req) {
1187
node_usage[node_inx].node_state -= job->node_req;
1189
error("cons_res:_rm_job_from_one_node: node_state miscount");
1190
node_usage[node_inx].node_state = NODE_CR_AVAILABLE;
1193
return SLURM_SUCCESS;
1079
1196
static struct multi_core_data * _create_default_mc(void)
1081
1198
struct multi_core_data *mc_ptr;
1082
1199
mc_ptr = xmalloc(sizeof(struct multi_core_data));
1083
mc_ptr->min_sockets = (uint16_t) NO_VAL;
1084
mc_ptr->min_cores = (uint16_t) NO_VAL;
1085
mc_ptr->min_threads = (uint16_t) NO_VAL;
1200
mc_ptr->sockets_per_node = (uint16_t) NO_VAL;
1201
mc_ptr->cores_per_socket = (uint16_t) NO_VAL;
1202
mc_ptr->threads_per_core = (uint16_t) NO_VAL;
1086
1203
/* mc_ptr is initialized to zero by xmalloc */
1087
1204
/* mc_ptr->ntasks_per_socket = 0; */
1088
1205
/* mc_ptr->ntasks_per_core = 0; */
1172
1291
select_node_cnt, select_part_record,
1173
1292
select_node_usage);
1175
if ((rc != SLURM_SUCCESS) && cr_preemption_killing() &&
1176
preemptee_candidates) {
1294
if ((rc != SLURM_SUCCESS) && preemptee_candidates) {
1177
1295
/* Remove preemptable jobs from simulated environment */
1178
1296
future_part = _dup_part_data(select_part_record);
1179
1297
if (future_part == NULL) {
1298
FREE_NULL_BITMAP(orig_map);
1181
1299
return SLURM_ERROR;
1183
1301
future_usage = _dup_node_usage(select_node_usage);
1184
1302
if (future_usage == NULL) {
1185
1303
_destroy_part_data(future_part);
1304
FREE_NULL_BITMAP(orig_map);
1187
1305
return SLURM_ERROR;
1190
1308
job_iterator = list_iterator_create(job_list);
1309
if (job_iterator == NULL)
1310
fatal ("memory allocation failure");
1191
1311
while ((tmp_job_ptr = (struct job_record *)
1192
list_next(job_iterator))) {
1312
list_next(job_iterator))) {
1193
1313
if (!IS_JOB_RUNNING(tmp_job_ptr) &&
1194
1314
!IS_JOB_SUSPENDED(tmp_job_ptr))
1316
mode = slurm_job_preempt_mode(tmp_job_ptr);
1317
if ((mode != PREEMPT_MODE_REQUEUE) &&
1318
(mode != PREEMPT_MODE_CHECKPOINT) &&
1319
(mode != PREEMPT_MODE_CANCEL))
1320
continue; /* can't remove job */
1196
1321
if (_is_preemptable(tmp_job_ptr,
1197
1322
preemptee_candidates)) {
1198
1323
/* Remove preemptable job now */
1221
1346
fatal("list_create malloc failure");
1223
1348
preemptee_iterator = list_iterator_create(
1224
preemptee_candidates);
1349
preemptee_candidates);
1350
if (preemptee_iterator == NULL)
1351
fatal ("memory allocation failure");
1225
1352
while ((tmp_job_ptr = (struct job_record *)
1226
list_next(preemptee_iterator))) {
1353
list_next(preemptee_iterator))) {
1354
mode = slurm_job_preempt_mode(tmp_job_ptr);
1355
if ((mode != PREEMPT_MODE_REQUEUE) &&
1356
(mode != PREEMPT_MODE_CHECKPOINT) &&
1357
(mode != PREEMPT_MODE_CANCEL))
1227
1359
if (bit_overlap(bitmap,
1228
1360
tmp_job_ptr->node_bitmap) == 0)
1231
1362
list_append(*preemptee_job_list,
1364
remove_some_jobs = true;
1234
1366
list_iterator_destroy(preemptee_iterator);
1367
if (!remove_some_jobs) {
1368
list_destroy(*preemptee_job_list);
1369
*preemptee_job_list = NULL;
1237
1373
_destroy_part_data(future_part);
1238
1374
_destroy_node_data(future_usage, NULL);
1376
FREE_NULL_BITMAP(orig_map);
1328
1469
/* Remove the running jobs one at a time from exp_node_cr and try
1329
* scheduling the pending job after each one */
1470
* scheduling the pending job after each one. */
1330
1471
if (rc != SLURM_SUCCESS) {
1331
1472
list_sort(cr_job_list, _cr_job_list_sort);
1332
1473
job_iterator = list_iterator_create(cr_job_list);
1474
if (job_iterator == NULL)
1475
fatal ("memory allocation failure");
1333
1476
while ((tmp_job_ptr = list_next(job_iterator))) {
1478
bit_or(bitmap, orig_map);
1479
ovrlap = bit_overlap(bitmap, tmp_job_ptr->node_bitmap);
1480
if (ovrlap == 0) /* job has no usable nodes */
1481
continue; /* skip it */
1482
debug2("cons_res: _will_run_test, job %u: overlap=%d",
1483
tmp_job_ptr->job_id, ovrlap);
1334
1484
_rm_job_from_res(future_part, future_usage,
1335
1485
tmp_job_ptr, 0);
1336
bit_or(bitmap, orig_map);
1337
1486
rc = cr_job_test(job_ptr, bitmap, min_nodes,
1338
1487
max_nodes, req_nodes,
1339
1488
SELECT_MODE_WILL_RUN, cr_type,
1426
1577
static int _synchronize_bitmaps(struct job_record *job_ptr,
1427
1578
bitstr_t ** partially_idle_bitmap)
1429
int size, i, idlecpus = bit_set_count(avail_node_bitmap);
1430
1581
struct part_res_record *p_ptr;
1431
1582
size = bit_size(avail_node_bitmap);
1432
1583
bitstr_t *bitmap = bit_alloc(size);
1433
1585
if (bitmap == NULL)
1434
1586
return SLURM_ERROR;
1436
1588
debug3("cons_res: synch_bm: avail %d of %d set, idle %d of %d set",
1437
idlecpus, size, bit_set_count(idle_node_bitmap), size);
1589
bit_set_count(avail_node_bitmap), size,
1590
bit_set_count(idle_node_bitmap), size);
1440
1593
fatal("cons_res: error: don't know what job I'm sync'ing");
1447
1600
for (i = 0; i < select_node_cnt; i++) {
1448
if (bit_test(avail_node_bitmap, i) == 0)
1601
if (!bit_test(avail_node_bitmap, i))
1451
if (bit_test(idle_node_bitmap, i) == 1) {
1604
if (bit_test(idle_node_bitmap, i)) {
1452
1605
bit_set(bitmap, i);
1456
if(!p_ptr || _is_node_avail(p_ptr, i))
1609
if (!p_ptr || _is_node_avail(p_ptr, i))
1457
1610
bit_set(bitmap, i);
1459
idlecpus = bit_set_count(bitmap);
1461
1613
debug3("cons_res: found %d partially idle nodes in part %s",
1462
idlecpus, p_ptr->part_ptr->name);
1614
bit_set_count(bitmap), p_ptr->part_ptr->name);
1464
1616
debug3("cons_res: found %d partially idle nodes",
1617
bit_set_count(bitmap));
1467
1620
*partially_idle_bitmap = bitmap;
1468
1621
return SLURM_SUCCESS;
1539
1685
/* This is Part 1 of a 4-part procedure which can be found in
1540
1686
* src/slurmctld/read_config.c. The whole story goes like this:
1542
* Step 1: select_g_node_init : initializes the global node arrays
1543
* Step 2: select_g_state_restore : NO-OP - nothing to restore
1544
* Step 3: select_g_job_init : NO-OP - nothing to initialize
1545
* Step 4: select_g_update_nodeinfo : called from reset_job_bitmaps() with
1546
* each valid recovered job_ptr AND from
1547
* select_nodes(), this procedure adds job
1548
* data to the 'select_part_record' global
1688
* Step 1: select_g_node_init : initializes the global node arrays
1689
* Step 2: select_g_state_restore : NO-OP - nothing to restore
1690
* Step 3: select_g_job_init : NO-OP - nothing to initialize
1691
* Step 4: select_g_select_nodeinfo_set: called from reset_job_bitmaps() with
1692
* each valid recovered job_ptr AND from
1693
* select_nodes(), this procedure adds
1694
* job data to the 'select_part_record'
1551
1697
extern int select_p_node_init(struct node_record *node_ptr, int node_cnt)
1555
1701
info("cons_res: select_p_node_init");
1556
1702
if (node_ptr == NULL) {
1583
1730
select_node_record[i].cores = config_ptr->cores;
1584
1731
select_node_record[i].vpus = config_ptr->threads;
1585
1732
select_node_record[i].real_memory = config_ptr->
1588
1735
select_node_record[i].cpus = node_ptr[i].cpus;
1589
1736
select_node_record[i].sockets = node_ptr[i].sockets;
1590
1737
select_node_record[i].cores = node_ptr[i].cores;
1591
1738
select_node_record[i].vpus = node_ptr[i].threads;
1592
1739
select_node_record[i].real_memory = node_ptr[i].
1742
tot_core = select_node_record[i].sockets *
1743
select_node_record[i].cores;
1744
if (tot_core >= select_node_record[i].cpus)
1745
select_node_record[i].vpus = 1;
1595
1746
select_node_usage[i].node_state = NODE_CR_AVAILABLE;
1747
gres_plugin_node_state_dealloc_all(select_node_record[i].
1748
node_ptr->gres_list);
1597
1750
_create_part_data();
1660
1813
job_ptr->details->mc_ptr = _create_default_mc();
1661
1814
job_node_req = _get_job_node_req(job_ptr);
1663
debug3("cons_res: select_p_job_test: job %u node_req %u, mode %d",
1664
job_ptr->job_id, job_node_req, mode);
1665
debug3("cons_res: select_p_job_test: min_n %u max_n %u req_n %u nb %u",
1666
min_nodes, max_nodes, req_nodes, bit_set_count(bitmap));
1669
_dump_state(select_part_record);
1816
if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
1817
info("cons_res: select_p_job_test: job %u node_req %u mode %d",
1818
job_ptr->job_id, job_node_req, mode);
1819
info("cons_res: select_p_job_test: min_n %u max_n %u req_n %u "
1821
min_nodes, max_nodes, req_nodes, bit_set_count(bitmap));
1822
_dump_state(select_part_record);
1671
1824
if (mode == SELECT_MODE_WILL_RUN) {
1672
1825
rc = _will_run_test(job_ptr, bitmap, min_nodes, max_nodes,
1673
1826
req_nodes, job_node_req,
1701
1854
return SLURM_SUCCESS;
1857
/* Determine if allocated nodes are usable (powered up) */
1704
1858
extern int select_p_job_ready(struct job_record *job_ptr)
1860
int i, i_first, i_last;
1861
struct node_record *node_ptr;
1863
if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
1864
/* Gang scheduling might suspend job immediately */
1868
if ((job_ptr->node_bitmap == NULL) ||
1869
((i_first = bit_ffs(job_ptr->node_bitmap)) == -1))
1870
return READY_NODE_STATE;
1871
i_last = bit_fls(job_ptr->node_bitmap);
1873
for (i=i_first; i<=i_last; i++) {
1874
if (bit_test(job_ptr->node_bitmap, i) == 0)
1876
node_ptr = node_record_table_ptr + i;
1877
if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr))
1881
return READY_NODE_STATE;
1884
extern int select_p_job_resized(struct job_record *job_ptr,
1885
struct node_record *node_ptr)
1888
xassert(job_ptr->magic == JOB_MAGIC);
1890
_rm_job_from_one_node(job_ptr, node_ptr);
1706
1891
return SLURM_SUCCESS;
1801
1990
uint16_t tmp, tmp_16 = 0;
1802
1991
static time_t last_set_all = 0;
1803
1992
uint32_t node_threads, node_cpus;
1993
select_nodeinfo_t *nodeinfo = NULL;
1805
1995
/* only set this once when the last_node_update is newer than
1806
1996
the last time we set things up. */
1807
1997
if(last_set_all && (last_node_update < last_set_all)) {
1808
1998
debug2("Node select info for set all hasn't "
1999
"changed since %ld",
2000
(long)last_set_all);
1811
2001
return SLURM_NO_CHANGE_IN_DATA;
1813
2003
last_set_all = last_node_update;
1815
for (n=0; n < node_record_count; n++) {
2005
for (n=0; n < select_node_cnt; n++) {
1816
2006
node_ptr = &(node_record_table_ptr[n]);
2008
/* We have to use the '_g_' here to make sure we get
2009
the correct data to work on. i.e. cray calls this
2010
plugin from within select/cray which has its own
2013
select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
2014
SELECT_NODEDATA_PTR, 0,
2017
error("no nodeinfo returned from structure");
1817
2021
if (slurmctld_conf.fast_schedule) {
1818
2022
node_cpus = node_ptr->config_ptr->cpus;
1819
2023
node_threads = node_ptr->config_ptr->threads;
1913
2127
extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo,
1914
enum select_jobdata_type data_type, void *data)
2128
enum select_jobdata_type data_type,
1916
2131
return SLURM_SUCCESS;
1919
2134
extern int select_p_select_jobinfo_get(select_jobinfo_t *jobinfo,
1920
enum select_jobdata_type data_type, void *data)
2135
enum select_jobdata_type data_type,
1922
2138
return SLURM_ERROR;
1925
2141
extern select_jobinfo_t *select_p_select_jobinfo_copy(
1926
select_jobinfo_t *jobinfo)
2142
select_jobinfo_t *jobinfo)
1931
extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer)
2147
extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer,
2148
uint16_t protocol_version)
1933
2150
return SLURM_SUCCESS;
1936
2153
extern int select_p_select_jobinfo_unpack(select_jobinfo_t *jobinfo,
2155
uint16_t protocol_version)
1939
2157
return SLURM_SUCCESS;