8
8
* Written by Danny Auble <da@llnl.gov>
10
10
* This file is part of SLURM, a resource management program.
11
* For details, see <http://www.llnl.gov/linux/slurm/>.
11
* For details, see <https://computing.llnl.gov/linux/slurm/>.
12
* Please also read the included file: DISCLAIMER.
13
14
* SLURM is free software; you can redistribute it and/or modify it under
14
15
* the terms of the GNU General Public License as published by the Free
308
318
bg_record->geo[Z]++;
309
319
end[Z] = ba_node->coord[Z];
322
if(ba_node->coord[X]<bg_record->start[X]) {
323
bg_record->start[X] = ba_node->coord[X];
325
if(ba_node->coord[Y]<bg_record->start[Y]) {
326
bg_record->start[Y] = ba_node->coord[Y];
328
if(ba_node->coord[Z]<bg_record->start[Z]) {
329
bg_record->start[Z] = ba_node->coord[Z];
312
333
list_iterator_destroy(itr);
313
debug3("geo = %c%c%c bp count is %d\n",
334
debug3("process_nodes: "
335
"geo = %c%c%c bp count is %d start is %c%c%c\n",
314
336
alpha_num[bg_record->geo[X]],
315
337
alpha_num[bg_record->geo[Y]],
316
338
alpha_num[bg_record->geo[Z]],
317
bg_record->bp_count);
340
alpha_num[bg_record->start[X]],
341
alpha_num[bg_record->start[Y]],
342
alpha_num[bg_record->start[Z]]);
318
343
/* This check is for sub midplane systems to figure out what
319
344
the largest block can be.
325
350
&& (bg_record->geo[Z] == DIM_SIZE[Z])) {
326
351
bg_record->full_block = 1;
328
} else if(bg_record->node_cnt == bluegene_bp_node_cnt)
353
} else if(bg_record->node_cnt == bg_conf->bp_node_cnt)
329
354
bg_record->full_block = 1;
332
356
/* #ifndef HAVE_BG_FILES */
333
357
/* max_dim[X] = MAX(max_dim[X], end[X]); */
334
358
/* max_dim[Y] = MAX(max_dim[Y], end[Y]); */
670
692
int small_count = 0;
694
xassert(bg_conf->slurm_user_name);
673
697
fatal("add_bg_record: no records list given");
675
bg_record = (bg_record_t*) xmalloc(sizeof(bg_record_t));
678
bg_record->user_name = xstrdup(bg_slurm_user_name);
679
bg_record->target_name = xstrdup(bg_slurm_user_name);
681
pw_uid = uid_from_string(bg_record->user_name);
682
if(pw_uid == (uid_t) -1) {
683
error("No such user: %s", bg_record->user_name);
699
bg_record = (bg_record_t*) xmalloc(sizeof(bg_record_t));
701
bg_record->user_name = xstrdup(bg_conf->slurm_user_name);
702
bg_record->target_name = xstrdup(bg_conf->slurm_user_name);
704
if (uid_from_string (bg_record->user_name, &pw_uid) < 0)
705
error("add_bg_record: No such user: %s", bg_record->user_name);
685
707
bg_record->user_uid = pw_uid;
688
709
bg_record->bg_block_list = list_create(destroy_ba_node);
690
711
if(copy_node_path(used_nodes, &bg_record->bg_block_list)
692
error("couldn't copy the path for the allocation");
713
error("add_bg_record: "
714
"couldn't copy the path for the allocation");
693
715
bg_record->bp_count = list_count(used_nodes);
695
717
/* bg_record->boot_state = 0; Implicit */
696
718
/* bg_record->state = 0; Implicit */
698
debug2("asking for %s %d %d %s",
720
debug2("add_bg_record: asking for %s %d %d %s",
699
721
blockreq->block, blockreq->small32, blockreq->small128,
700
722
convert_conn_type(blockreq->conn_type));
702
debug2("asking for %s %d %d %d %d %d %s",
724
debug2("add_bg_record: asking for %s %d %d %d %d %d %s",
703
725
blockreq->block, blockreq->small256,
704
726
blockreq->small128, blockreq->small64,
705
727
blockreq->small32, blockreq->small16,
724
len += strlen(bg_slurm_node_prefix)+1;
746
len += strlen(bg_conf->slurm_node_prefix)+1;
725
747
bg_record->nodes = xmalloc(len);
726
748
snprintf(bg_record->nodes, len, "%s%s",
727
bg_slurm_node_prefix, blockreq->block+i);
749
bg_conf->slurm_node_prefix, blockreq->block+i);
729
fatal("BPs=%s is in a weird format", blockreq->block);
751
fatal("add_bg_record: BPs=%s is in a weird format",
731
754
process_nodes(bg_record, false);
734
757
bg_record->node_use = SELECT_COPROCESSOR_MODE;
736
759
bg_record->conn_type = blockreq->conn_type;
737
bg_record->cpu_cnt = procs_per_node * bg_record->bp_count;
738
bg_record->node_cnt = bluegene_bp_node_cnt * bg_record->bp_count;
760
bg_record->cpu_cnt = bg_conf->procs_per_bp * bg_record->bp_count;
761
bg_record->node_cnt = bg_conf->bp_node_cnt * bg_record->bp_count;
739
762
bg_record->job_running = NO_JOB_RUNNING;
742
765
if(blockreq->blrtsimage)
743
766
bg_record->blrtsimage = xstrdup(blockreq->blrtsimage);
745
bg_record->blrtsimage = xstrdup(default_blrtsimage);
768
bg_record->blrtsimage = xstrdup(bg_conf->default_blrtsimage);
747
770
if(blockreq->linuximage)
748
771
bg_record->linuximage = xstrdup(blockreq->linuximage);
750
bg_record->linuximage = xstrdup(default_linuximage);
773
bg_record->linuximage = xstrdup(bg_conf->default_linuximage);
752
775
if(blockreq->mloaderimage)
753
776
bg_record->mloaderimage = xstrdup(blockreq->mloaderimage);
755
bg_record->mloaderimage = xstrdup(default_mloaderimage);
778
bg_record->mloaderimage =
779
xstrdup(bg_conf->default_mloaderimage);
757
781
if(blockreq->ramdiskimage)
758
782
bg_record->ramdiskimage = xstrdup(blockreq->ramdiskimage);
760
bg_record->ramdiskimage = xstrdup(default_ramdiskimage);
784
bg_record->ramdiskimage =
785
xstrdup(bg_conf->default_ramdiskimage);
762
787
if(bg_record->conn_type != SELECT_SMALL) {
763
788
/* this needs to be an append so we keep things in the
766
791
/* this isn't a correct list so we need to set it later for
767
792
now we just used it to be the bp number */
768
793
if(!used_nodes) {
769
debug4("we didn't get a request list so we are "
794
debug4("add_bg_record: "
795
"we didn't get a request list so we are "
770
796
"destroying this bp list");
771
797
list_destroy(bg_record->bg_block_list);
772
798
bg_record->bg_block_list = NULL;
775
debug("adding a small block");
801
debug("add_bg_record: adding a small block");
778
804
/* if the ionode cnt for small32 is 0 then don't
779
805
allow a sub quarter allocation
781
if(bluegene_nodecard_ionode_cnt < 2) {
782
if(!bluegene_nodecard_ionode_cnt && blockreq->small32)
783
fatal("There is an error in your "
807
if(bg_conf->nodecard_ionode_cnt < 2) {
808
if(!bg_conf->nodecard_ionode_cnt && blockreq->small32)
809
fatal("add_bg_record: "
810
"There is an error in your "
784
811
"bluegene.conf file.\n"
785
812
"Can't create a 32 node block with "
786
813
"Numpsets=%u. (Try setting it "
787
814
"to at least 16)",
790
817
if(blockreq->small16)
791
fatal("There is an error in your "
818
fatal("add_bg_record: "
819
"There is an error in your "
792
820
"bluegene.conf file.\n"
793
821
"Can't create a 16 node block with "
794
822
"Numpsets=%u. (Try setting it to "
797
if((bluegene_io_ratio < 0.5) && blockreq->small64)
798
fatal("There is an error in your "
825
if((bg_conf->io_ratio < 0.5) && blockreq->small64)
826
fatal("add_bg_record: "
827
"There is an error in your "
799
828
"bluegene.conf file.\n"
800
829
"Can't create a 64 node block with "
801
830
"Numpsets=%u. (Try setting it "
802
831
"to at least 8)",
808
837
if(blockreq->small32==0 && blockreq->small128==0) {
809
info("No specs given for this small block, "
838
info("add_bg_record: "
839
"No specs given for this small block, "
810
840
"I am spliting this block into 4 128CnBlocks");
811
841
blockreq->small128=4;
814
i = (blockreq->small32*bluegene_nodecard_node_cnt) +
815
(blockreq->small128*bluegene_quarter_node_cnt);
816
if(i != bluegene_bp_node_cnt)
817
fatal("There is an error in your bluegene.conf file.\n"
844
i = (blockreq->small32*bg_conf->nodecard_node_cnt) +
845
(blockreq->small128*bg_conf->quarter_node_cnt);
846
if(i != bg_conf->bp_node_cnt)
847
fatal("add_bg_record: "
848
"There is an error in your bluegene.conf file.\n"
818
849
"I am unable to request %d nodes consisting of "
819
850
"%u 32CnBlocks and\n%u 128CnBlocks in one "
820
851
"base partition with %u nodes.",
821
852
i, blockreq->small32, blockreq->small128,
822
bluegene_bp_node_cnt);
853
bg_conf->bp_node_cnt);
823
854
small_count = blockreq->small32+blockreq->small128;
825
856
if(!blockreq->small16 && !blockreq->small32
826
857
&& !blockreq->small64 && !blockreq->small128
827
858
&& !blockreq->small256) {
828
info("No specs given for this small block, "
859
info("add_bg_record: "
860
"No specs given for this small block, "
829
861
"I am spliting this block into 2 256CnBlocks");
830
862
blockreq->small256=2;
835
867
+ (blockreq->small64*64)
836
868
+ (blockreq->small128*128)
837
869
+ (blockreq->small256*256);
838
if(i != bluegene_bp_node_cnt)
839
fatal("There is an error in your bluegene.conf file.\n"
870
if(i != bg_conf->bp_node_cnt)
871
fatal("add_bg_record: "
872
"There is an error in your bluegene.conf file.\n"
840
873
"I am unable to request %d nodes consisting of "
841
874
"%u 16CNBlocks, %u 32CNBlocks,\n"
842
875
"%u 64CNBlocks, %u 128CNBlocks, "
844
877
"in one base partition with %u nodes.",
845
878
i, blockreq->small16, blockreq->small32,
846
879
blockreq->small64, blockreq->small128,
847
blockreq->small256, bluegene_bp_node_cnt);
880
blockreq->small256, bg_conf->bp_node_cnt);
848
881
small_count = blockreq->small16
849
882
+ blockreq->small32
850
883
+ blockreq->small64
987
1020
if(io_cnt == NO_VAL) {
989
1022
/* Translate 1 nodecard count to ionode count */
990
if((io_cnt *= bluegene_io_ratio))
1023
if((io_cnt *= bg_conf->io_ratio))
992
1026
/* make sure we create something that is able to be
994
if(bluegene_smallest_block < bluegene_nodecard_node_cnt)
995
create_size = bluegene_nodecard_node_cnt;
1028
if(bg_conf->smallest_block < bg_conf->nodecard_node_cnt)
1029
create_size = bg_conf->nodecard_node_cnt;
997
create_size = bluegene_smallest_block;
1031
create_size = bg_conf->smallest_block;
1000
1034
node_ptr = find_node_record(bp_name);
1041
/* this is here for sanity check to make sure we don't core on
1042
these bits when we set them below. */
1043
if(io_start >= bg_conf->numpsets
1044
|| (io_start+io_cnt) >= bg_conf->numpsets) {
1045
debug("io %d-%d not configured on this "
1046
"system, only %d ionodes per midplane",
1047
io_start, io_start+io_cnt, bg_conf->numpsets);
1006
1050
bp_bit = (node_ptr - node_record_table_ptr);
1008
1052
memset(&blockreq, 0, sizeof(blockreq_t));
1010
1054
blockreq.conn_type = SELECT_SMALL;
1011
1055
blockreq.block = bp_name;
1013
debug3("here setting %d of %d and %d-%d of %d",
1057
debug3("here setting node %d of %d and ionodes %d-%d of %d",
1014
1058
bp_bit, node_record_count, io_start,
1015
io_start+io_cnt, bluegene_numpsets);
1059
io_start+io_cnt, bg_conf->numpsets);
1017
1061
memset(&tmp_record, 0, sizeof(bg_record_t));
1018
1062
tmp_record.bp_count = 1;
1019
tmp_record.node_cnt = bluegene_nodecard_node_cnt;
1063
tmp_record.node_cnt = bg_conf->nodecard_node_cnt;
1020
1064
tmp_record.bitmap = bit_alloc(node_record_count);
1021
1065
bit_set(tmp_record.bitmap, bp_bit);
1023
tmp_record.ionode_bitmap = bit_alloc(bluegene_numpsets);
1067
tmp_record.ionode_bitmap = bit_alloc(bg_conf->numpsets);
1024
1068
bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);
1026
1070
slurm_mutex_lock(&block_state_mutex);
1027
itr = list_iterator_create(bg_list);
1071
itr = list_iterator_create(bg_lists->main);
1028
1072
while ((bg_record = list_next(itr))) {
1029
1073
if(!bit_test(bg_record->bitmap, bp_bit))
1035
1079
if(bg_record->job_running > NO_JOB_RUNNING)
1036
1080
slurm_fail_job(bg_record->job_running);
1038
/* mark every one of these in an error state */
1039
if(bluegene_layout_mode != LAYOUT_DYNAMIC) {
1082
/* If Running Dynamic mode and the block is
1083
smaller than the create size just continue on.
1085
if((bg_conf->layout_mode == LAYOUT_DYNAMIC)
1086
&& (bg_record->node_cnt < create_size)) {
1040
1087
if(!delete_list)
1041
1088
delete_list = list_create(NULL);
1042
1089
list_append(delete_list, bg_record);
1046
/* below is only for dynamic modes since there are
1047
never overlapping blocks there */
1048
/* if the block is smaller than the create size just
1051
if(bg_record->node_cnt < create_size)
1093
/* keep track of the smallest size that is at least
1094
the size of create_size. */
1054
1095
if(!smallest_bg_record ||
1055
1096
(smallest_bg_record->node_cnt > bg_record->node_cnt))
1056
1097
smallest_bg_record = bg_record;
1058
1099
list_iterator_destroy(itr);
1059
1100
slurm_mutex_unlock(&block_state_mutex);
1061
if(bluegene_layout_mode != LAYOUT_DYNAMIC) {
1102
if(bg_conf->layout_mode != LAYOUT_DYNAMIC) {
1062
1103
debug3("running non-dynamic mode");
1065
/* don't lock here since it is handled inside
1066
the put_block_in_error_state
1068
itr = list_iterator_create(delete_list);
1069
while ((bg_record = list_next(itr))) {
1070
/* we already handled this */
1071
if(bg_record->state == RM_PARTITION_ERROR) {
1072
rc = SLURM_NO_CHANGE_IN_DATA;
1076
rc = put_block_in_error_state(
1077
bg_record, BLOCK_ERROR_STATE);
1082
list_iterator_destroy(itr);
1105
/* This should never happen, but just in case... */
1083
1107
list_destroy(delete_list);
1109
/* If we found a block that is smaller or equal to a
1110
midplane we will just mark it in an error state as
1111
opposed to draining the node.
1113
if(smallest_bg_record
1114
&& (smallest_bg_record->node_cnt < bg_conf->bp_node_cnt)){
1115
if(smallest_bg_record->state == RM_PARTITION_ERROR) {
1116
rc = SLURM_NO_CHANGE_IN_DATA;
1120
rc = put_block_in_error_state(
1121
smallest_bg_record, BLOCK_ERROR_STATE);
1087
debug("didn't get a smallest block");
1125
debug("No block under 1 midplane available for this nodecard. "
1126
"Draining the whole node.");
1088
1127
if(!node_already_down(bp_name)) {
1089
1128
time_t now = time(NULL);
1090
1129
char reason[128], time_str[32];
1142
/* below is only for Dynamic mode */
1104
if(smallest_bg_record) {
1146
bitstr_t *iobitmap = bit_alloc(bg_conf->numpsets);
1147
/* don't lock here since it is handled inside
1148
the put_block_in_error_state
1150
itr = list_iterator_create(delete_list);
1151
while ((bg_record = list_next(itr))) {
1152
debug2("combining smaller than nodecard "
1154
bg_record->bg_block_id);
1155
while(bg_record->job_running > NO_JOB_RUNNING)
1158
bit_or(iobitmap, bg_record->ionode_bitmap);
1161
list_iterator_destroy(itr);
1162
list_destroy(delete_list);
1164
FREE_NULL_BITMAP(iobitmap);
1168
/* set the start to be the same as the start of the
1169
ionode_bitmap. If no ionodes set (not a small
1170
block) set io_start = 0. */
1171
if((io_start = bit_ffs(iobitmap)) == -1) {
1173
if(create_size > bg_conf->nodecard_node_cnt)
1174
blockreq.small128 = 4;
1176
blockreq.small32 = 16;
1177
} else if(create_size <= bg_conf->nodecard_node_cnt)
1178
blockreq.small32 = 1;
1180
/* this should never happen */
1181
blockreq.small128 = 1;
1183
FREE_NULL_BITMAP(iobitmap);
1184
} else if(smallest_bg_record) {
1105
1185
debug2("smallest dynamic block is %s",
1106
1186
smallest_bg_record->bg_block_id);
1107
1187
if(smallest_bg_record->state == RM_PARTITION_ERROR) {
1149
if(create_size != bluegene_nodecard_node_cnt) {
1229
if(create_size != bg_conf->nodecard_node_cnt) {
1150
1230
blockreq.small128 = blockreq.small32 / 4;
1151
1231
blockreq.small32 = 0;
1153
/* set the start to be the same as the start of the
1154
ionode_bitmap. If no ionodes set (not a small
1155
block) set io_start = 0. */
1156
if((io_start = bit_ffs(smallest_bg_record->ionode_bitmap))
1233
} else if((io_start =
1234
bit_ffs(smallest_bg_record->ionode_bitmap)) == -1)
1235
/* set the start to be the same as the start of the
1236
ionode_bitmap. If no ionodes set (not a small
1237
block) set io_start = 0. */
1160
1240
switch(create_size) {
1208
1288
delete_list = list_create(NULL);
1209
1289
while((bg_record = list_pop(requests))) {
1210
1290
slurm_mutex_lock(&block_state_mutex);
1211
itr = list_iterator_create(bg_list);
1291
itr = list_iterator_create(bg_lists->main);
1212
1292
while((found_record = list_next(itr))) {
1213
1293
if(!blocks_overlap(bg_record, found_record))
1215
1295
list_push(delete_list, found_record);
1216
1296
list_remove(itr);
1217
num_block_to_free++;
1219
1298
list_iterator_destroy(itr);
1220
1299
slurm_mutex_unlock(&block_state_mutex);
1232
1311
bg_record->bg_block_id);
1233
1312
print_bg_record(bg_record);
1234
1313
slurm_mutex_lock(&block_state_mutex);
1235
list_append(bg_list, bg_record);
1314
list_append(bg_lists->main, bg_record);
1236
1315
slurm_mutex_unlock(&block_state_mutex);
1237
1316
if(bit_overlap(bg_record->ionode_bitmap,
1238
1317
tmp_record.ionode_bitmap)) {
1330
1409
info("Setting Block %s to ERROR state.", bg_record->bg_block_id);
1331
1410
/* we add the block to these lists so we don't try to schedule
1333
if(!block_ptr_exist_in_list(bg_job_block_list, bg_record)) {
1334
list_push(bg_job_block_list, bg_record);
1412
if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
1413
list_push(bg_lists->job_running, bg_record);
1335
1414
num_unused_cpus -= bg_record->cpu_cnt;
1337
if(!block_ptr_exist_in_list(bg_booted_block_list, bg_record))
1338
list_push(bg_booted_block_list, bg_record);
1416
if(!block_ptr_exist_in_list(bg_lists->booted, bg_record))
1417
list_push(bg_lists->booted, bg_record);
1340
1419
slurm_mutex_lock(&block_state_mutex);
1341
1420
bg_record->job_running = state;
1344
1423
xfree(bg_record->user_name);
1345
1424
xfree(bg_record->target_name);
1346
bg_record->user_name = xstrdup(bg_slurm_user_name);
1347
bg_record->target_name = xstrdup(bg_slurm_user_name);
1425
bg_record->user_name = xstrdup(bg_conf->slurm_user_name);
1426
bg_record->target_name = xstrdup(bg_conf->slurm_user_name);
1349
pw_uid = uid_from_string(bg_record->user_name);
1350
if(pw_uid == (uid_t) -1) {
1428
if (uid_from_string (bg_record->user_name, &pw_uid) < 0)
1351
1429
error("No such user: %s", bg_record->user_name);
1353
1431
bg_record->user_uid = pw_uid;
1355
1433
slurm_mutex_unlock(&block_state_mutex);
1357
1435
trigger_block_error();
1358
last_bg_update = time(NULL);
1360
1437
return SLURM_SUCCESS;
1372
1449
"being in an error state.",
1373
1450
bg_record->bg_block_id);
1375
if(remove_from_bg_list(bg_job_block_list, bg_record) == SLURM_SUCCESS)
1452
if(remove_from_bg_list(bg_lists->job_running, bg_record)
1376
1454
num_unused_cpus += bg_record->cpu_cnt;
1377
remove_from_bg_list(bg_booted_block_list, bg_record);
1455
remove_from_bg_list(bg_lists->booted, bg_record);
1379
1457
bg_record->job_running = NO_JOB_RUNNING;
1380
1458
bg_record->state = RM_PARTITION_FREE;