1
1
/*****************************************************************************\
2
2
* bg_job_place.c - blue gene job placement (e.g. base block selection)
5
* $Id: bg_job_place.c 17205 2009-04-09 17:24:11Z da $
6
4
*****************************************************************************
7
5
* Copyright (C) 2004-2007 The Regents of the University of California.
6
* Copyright (C) 2008 Lawrence Livermore National Security.
8
7
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
9
8
* Written by Dan Phung <phung4@llnl.gov> and Morris Jette <jette1@llnl.gov>
11
10
* This file is part of SLURM, a resource management program.
12
* For details, see <http://www.llnl.gov/linux/slurm/>.
11
* For details, see <https://computing.llnl.gov/linux/slurm/>.
12
* Please also read the included file: DISCLAIMER.
14
14
* SLURM is free software; you can redistribute it and/or modify it under
15
15
* the terms of the GNU General Public License as published by the Free
63
61
pthread_mutex_t job_list_test_mutex = PTHREAD_MUTEX_INITIALIZER;
65
63
/* This list is for the test_job_list function because we will be
66
* adding and removing blocks off the bg_job_block_list and don't want
67
* to ruin that list in submit_job it should = bg_job_block_list
64
* adding and removing blocks off the bg_lists->job_running and don't want
65
* to ruin that list.  In submit_job it should = bg_lists->job_running,
68
66
* otherwise it should be a copy of that list.
70
68
List job_block_test_list = NULL;
100
98
static int _dynamically_request(List block_list, int *blocks_added,
101
99
ba_request_t *request,
102
bitstr_t* slurm_block_bitmap,
103
char *user_req_nodes);
100
char *user_req_nodes,
104
102
static int _find_best_block_match(List block_list, int *blocks_added,
105
103
struct job_record* job_ptr,
106
104
bitstr_t* slurm_block_bitmap,
284
282
SELECT_DATA_BLRTS_IMAGE, blrtsimage);
286
284
if (*blrtsimage) {
287
allow = _test_image_perms(*blrtsimage, bg_blrtsimage_list,
285
allow = _test_image_perms(*blrtsimage, bg_conf->blrts_list,
290
288
error("User %u:%u is not allowed to use BlrtsImage %s",
298
296
select_g_get_jobinfo(job_ptr->select_jobinfo,
299
297
SELECT_DATA_LINUX_IMAGE, linuximage);
300
298
if (*linuximage) {
301
allow = _test_image_perms(*linuximage, bg_linuximage_list,
299
allow = _test_image_perms(*linuximage, bg_conf->linux_list,
304
302
error("User %u:%u is not allowed to use LinuxImage %s",
310
308
select_g_get_jobinfo(job_ptr->select_jobinfo,
311
309
SELECT_DATA_MLOADER_IMAGE, mloaderimage);
312
310
if (*mloaderimage) {
313
allow = _test_image_perms(*mloaderimage, bg_mloaderimage_list,
311
allow = _test_image_perms(*mloaderimage,
312
bg_conf->mloader_list,
316
315
error("User %u:%u is not allowed "
324
323
select_g_get_jobinfo(job_ptr->select_jobinfo,
325
324
SELECT_DATA_RAMDISK_IMAGE, ramdiskimage);
326
325
if (*ramdiskimage) {
327
allow = _test_image_perms(*ramdiskimage, bg_ramdiskimage_list,
326
allow = _test_image_perms(*ramdiskimage,
327
bg_conf->ramdisk_list,
330
330
error("User %u:%u is not allowed "
373
373
} else if((bg_record->job_running != NO_JOB_RUNNING)
374
374
&& (bg_record->job_running != job_ptr->job_id)
375
&& (bluegene_layout_mode == LAYOUT_DYNAMIC
375
&& (bg_conf->layout_mode == LAYOUT_DYNAMIC
377
&& bluegene_layout_mode != LAYOUT_DYNAMIC))) {
377
&& bg_conf->layout_mode != LAYOUT_DYNAMIC))) {
378
378
debug("block %s in use by %s job %d",
379
379
bg_record->bg_block_id,
380
380
bg_record->user_name,
480
480
goto good_conn_type;
481
} else if(bg_record->conn_type >= SELECT_SMALL) {
482
/* since we already checked to see if
483
the cpus were good this means we are
484
looking for a block in a range that
485
includes small and regular blocks.
486
So we can just continue on.
483
492
debug("bg block %s conn-type not usable asking for %s "
484
493
"bg_record is %s",
604
613
* overlapping that we could avoid freeing if
605
614
* we choose something else
607
if(bluegene_layout_mode == LAYOUT_OVERLAP
616
if(bg_conf->layout_mode == LAYOUT_OVERLAP
608
617
&& ((overlap_check == 0 && bg_record->state
609
618
!= RM_PARTITION_READY)
610
619
|| (overlap_check == 1 && found_record->state
636
645
found_record->job_running,
637
646
found_record->bg_block_id);
639
if(bluegene_layout_mode == LAYOUT_DYNAMIC) {
648
if(bg_conf->layout_mode == LAYOUT_DYNAMIC) {
649
List temp_list = list_create(NULL);
640
650
/* this will remove and
641
651
* destroy the memory for
650
660
bg_record->original;
651
661
remove_from_bg_list(
652
bg_list, found_record);
654
665
debug("looking for original");
656
667
find_and_remove_org_from_bg_list(
660
destroy_bg_record(bg_record);
672
debug("Removing unusable block %s "
674
bg_record->bg_block_id);
661
676
if(!found_record) {
662
/* There may be a bug
667
If that is the case we
671
instead of destroying
674
debug("This record wasn't "
675
"found in the bg_list, "
677
debug("This record %s wasn't "
676
680
"no big deal, it "
677
"probably wasn't added");
680
debug("removing the block "
687
free_block_list(temp_list);
688
list_destroy(temp_list);
681
"probably wasn't added",
682
bg_record->bg_block_id);
683
found_record = bg_record;
685
destroy_bg_record(bg_record);
687
list_push(temp_list, found_record);
688
free_block_list(temp_list);
689
list_destroy(temp_list);
690
691
slurm_mutex_unlock(&block_state_mutex);
727
728
list_append(list_of_lists, job_block_test_list);
729
730
list_append(list_of_lists, block_list);
730
if(job_block_test_list == bg_job_block_list &&
731
list_count(block_list) != list_count(bg_booted_block_list)) {
732
list_append(list_of_lists, bg_booted_block_list);
733
if(list_count(bg_booted_block_list)
731
if(job_block_test_list == bg_lists->job_running &&
732
list_count(block_list) != list_count(bg_lists->booted)) {
733
list_append(list_of_lists, bg_lists->booted);
734
if(list_count(bg_lists->booted)
734
735
!= list_count(job_block_test_list))
735
736
list_append(list_of_lists, job_block_test_list);
736
737
} else if(list_count(block_list)
756
757
while((bg_record = list_pop(new_blocks))) {
757
758
if(block_exist_in_list(block_list, bg_record))
758
759
destroy_bg_record(bg_record);
761
list_append(block_list, bg_record);
760
764
if(job_block_test_list
761
== bg_job_block_list) {
765
== bg_lists->job_running) {
762
766
if(configure_block(bg_record)
763
767
== SLURM_ERROR) {
764
768
destroy_bg_record(
775
779
list_append(block_list, bg_record);
776
780
print_bg_record(bg_record);
777
781
(*blocks_added) = 1;
780
784
list_destroy(new_blocks);
781
785
if(!*blocks_added) {
782
memcpy(request->geometry, start_geo,
786
memcpy(request->geometry, start_geo,
783
787
sizeof(int)*BA_SYSTEM_DIMENSIONS);
784
788
rc = SLURM_ERROR;
787
791
list_sort(block_list,
788
792
(ListCmpF)_bg_record_sort_aval_dec);
790
794
rc = SLURM_SUCCESS;
792
796
} else if (errno == ESLURM_INTERCONNECT_FAILURE) {
910
914
req_nodes = min_nodes;
912
if (target_size == 0) { /* no geometry specified */
913
if(job_ptr->details->req_nodes
915
bg_record_t *tmp_record = NULL;
916
char *tmp_nodes= job_ptr->details->req_nodes;
917
int len = strlen(tmp_nodes);
921
&& tmp_nodes[i] != '['
922
&& (tmp_nodes[i] < '0' || tmp_nodes[i] > 'Z'
923
|| (tmp_nodes[i] > '9'
924
&& tmp_nodes[i] < 'A')))
929
tmp_record = xmalloc(sizeof(bg_record_t));
930
tmp_record->bg_block_list =
931
list_create(destroy_ba_node);
933
len += strlen(bg_slurm_node_prefix)+1;
934
tmp_record->nodes = xmalloc(len);
936
snprintf(tmp_record->nodes,
939
bg_slurm_node_prefix,
943
process_nodes(tmp_record, false);
944
for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) {
945
req_geometry[i] = tmp_record->geo[i];
946
start[i] = tmp_record->start[i];
948
destroy_bg_record(tmp_record);
949
select_g_set_jobinfo(job_ptr->select_jobinfo,
950
SELECT_DATA_GEOMETRY,
952
select_g_set_jobinfo(job_ptr->select_jobinfo,
957
error("BPs=%s is in a weird format",
960
req_geometry[X] = (uint16_t)NO_VAL;
916
req_geometry[X] = (uint16_t)NO_VAL;
962
917
target_size = min_nodes;
1008
963
* works we can look and see the earliest
1009
964
* the job can start. This doesn't apply to Dynamic mode.
1011
if(test_only && bluegene_layout_mode != LAYOUT_DYNAMIC)
966
if(test_only && bg_conf->layout_mode != LAYOUT_DYNAMIC)
1012
967
overlapped_list = list_create(NULL);
1014
969
bg_record = _find_matching_block(block_list,
1081
1036
/* all these assume that the *bg_record is NULL */
1083
if(bluegene_layout_mode == LAYOUT_OVERLAP
1038
if(bg_conf->layout_mode == LAYOUT_OVERLAP
1084
1039
&& !test_only && overlap_check < 2) {
1085
1040
overlap_check++;
1089
if(create_try || bluegene_layout_mode != LAYOUT_DYNAMIC)
1044
if(create_try || bg_conf->layout_mode != LAYOUT_DYNAMIC)
1092
1047
if((rc = _dynamically_request(block_list, blocks_added,
1095
job_ptr->details->req_nodes))
1049
job_ptr->details->req_nodes,
1096
1051
== SLURM_SUCCESS) {
1097
1052
create_try = 1;
1104
1059
List job_list = NULL;
1105
1060
debug("trying with empty machine");
1106
1061
slurm_mutex_lock(&block_state_mutex);
1107
if(job_block_test_list == bg_job_block_list)
1062
if(job_block_test_list == bg_lists->job_running)
1108
1063
job_list = copy_bg_list(job_block_test_list);
1110
1065
job_list = job_block_test_list;
1244
1204
itr = list_iterator_create(full_list);
1245
1205
itr2 = list_iterator_create(incomp_list);
1246
1206
while((new_record = list_next(itr))) {
1207
/* Make sure we aren't adding any block that doesn't
1210
if(!new_record->bg_block_id)
1247
1212
while((bg_record = list_next(itr2))) {
1248
1213
if(bit_equal(bg_record->bitmap, new_record->bitmap)
1249
1214
&& bit_equal(bg_record->ionode_bitmap,
1254
1219
if(!bg_record) {
1255
bg_record = xmalloc(sizeof(bg_record_t));
1256
copy_bg_record(new_record, bg_record);
1257
debug4("adding %s", bg_record->bg_block_id);
1258
list_append(incomp_list, bg_record);
1221
debug4("adding %s", new_record->bg_block_id);
1222
list_append(incomp_list, new_record);
1261
1225
list_iterator_reset(itr2);
1234
static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap)
1237
int first_bit, last_bit;
1238
uint32_t node_cpus, total_cpus = 0, node_cnt;
1239
select_job_res_t select_ptr;
1241
if (job_ptr->select_job) {
1242
error("select_p_job_test: already have select_job");
1243
free_select_job_res(&job_ptr->select_job);
1247
node_cnt = bit_set_count(bitmap);
1248
job_ptr->select_job = select_ptr = create_select_job_res();
1249
select_ptr->cpu_array_reps = xmalloc(sizeof(uint32_t) * node_cnt);
1250
select_ptr->cpu_array_value = xmalloc(sizeof(uint16_t) * node_cnt);
1251
select_ptr->cpus = xmalloc(sizeof(uint16_t) * node_cnt);
1252
select_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt);
1253
select_ptr->nhosts = node_cnt;
1254
select_ptr->node_bitmap = bit_copy(bitmap);
1255
if (select_ptr->node_bitmap == NULL)
1256
fatal("bit_copy malloc failure");
1257
select_ptr->nprocs = job_ptr->num_procs;
1258
if (build_select_job_res(select_ptr, (void *)node_record_table_ptr, 1))
1259
error("select_p_job_test: build_select_job_res: %m");
1261
if (job_ptr->num_procs <= bg_conf->procs_per_bp)
1262
node_cpus = job_ptr->num_procs;
1264
node_cpus = bg_conf->procs_per_bp;
1266
first_bit = bit_ffs(bitmap);
1267
last_bit = bit_fls(bitmap);
1268
for (i=first_bit, j=0, k=-1; i<=last_bit; i++) {
1269
if (!bit_test(bitmap, i))
1272
select_ptr->cpus[j] = node_cpus;
1274
(select_ptr->cpu_array_value[k] != node_cpus)) {
1275
select_ptr->cpu_array_cnt++;
1276
select_ptr->cpu_array_reps[++k] = 1;
1277
select_ptr->cpu_array_value[k] = node_cpus;
1279
select_ptr->cpu_array_reps[k]++;
1280
total_cpus += node_cpus;
1282
if (set_select_job_res_node(select_ptr, j))
1283
error("select_p_job_test: set_select_job_res_node: %m");
1286
if (select_ptr->nprocs != total_cpus) {
1287
error("select_p_job_test: nprocs mismatch %u != %u",
1288
select_ptr->nprocs, total_cpus);
1273
1293
* Try to find resources for a given job request
1303
1322
return EINVAL; /* something not yet supported */
1305
if(bluegene_layout_mode == LAYOUT_DYNAMIC)
1324
if(bg_conf->layout_mode == LAYOUT_DYNAMIC)
1306
1325
slurm_mutex_lock(&create_dynamic_mutex);
1308
job_block_test_list = bg_job_block_list;
1327
job_block_test_list = bg_lists->job_running;
1310
1329
select_g_get_jobinfo(job_ptr->select_jobinfo,
1311
1330
SELECT_DATA_CONN_TYPE, &conn_type);
1312
1331
if(conn_type == SELECT_NAV) {
1313
1332
uint32_t max_procs = (uint32_t)NO_VAL;
1314
if(bluegene_bp_node_cnt == bluegene_nodecard_node_cnt)
1333
if(bg_conf->bp_node_cnt == bg_conf->nodecard_node_cnt)
1315
1334
conn_type = SELECT_SMALL;
1316
1335
else if(min_nodes > 1) {
1317
1336
conn_type = SELECT_TORUS;
1337
1356
select_g_sprint_jobinfo(job_ptr->select_jobinfo, buf, sizeof(buf),
1338
1357
SELECT_PRINT_MIXED);
1339
debug("bluegene:submit_job: %s nodes=%u-%u-%u",
1340
buf, min_nodes, req_nodes, max_nodes);
1358
debug("bluegene:submit_job: %d %s nodes=%u-%u-%u",
1359
mode, buf, min_nodes, req_nodes, max_nodes);
1341
1360
select_g_sprint_jobinfo(job_ptr->select_jobinfo, buf, sizeof(buf),
1342
1361
SELECT_PRINT_BLRTS_IMAGE);
1343
1362
#ifdef HAVE_BGL
1362
1381
debug2("RamDiskIoLoadImage=%s", buf);
1364
1383
slurm_mutex_lock(&block_state_mutex);
1365
block_list = copy_bg_list(bg_list);
1384
block_list = copy_bg_list(bg_lists->main);
1366
1385
slurm_mutex_unlock(&block_state_mutex);
1368
1387
list_sort(block_list, (ListCmpF)_bg_record_sort_aval_dec);
1401
1420
bg_record->ionodes);
1403
1422
if(!bg_record->bg_block_id) {
1404
uint16_t geo[BA_SYSTEM_DIMENSIONS];
1406
1423
debug2("%d can start unassigned job %u at "
1408
1425
test_only, job_ptr->job_id, starttime,
1409
1426
bg_record->nodes);
1410
1427
select_g_set_jobinfo(job_ptr->select_jobinfo,
1411
SELECT_DATA_BLOCK_ID,
1428
SELECT_DATA_BLOCK_ID,
1414
1431
min_nodes = bg_record->node_cnt;
1415
1432
select_g_set_jobinfo(job_ptr->select_jobinfo,
1416
SELECT_DATA_NODE_CNT,
1419
sizeof(uint16_t) * BA_SYSTEM_DIMENSIONS);
1420
select_g_set_jobinfo(job_ptr->select_jobinfo,
1421
SELECT_DATA_GEOMETRY,
1433
SELECT_DATA_NODE_CNT,
1423
1435
/* This is a fake record so we need to
1424
1436
* destroy it after we get the info from
1426
destroy_bg_record(bg_record);
1437
* it. If it was just testing then
1438
* we added this record to the
1439
* block_list. If this is the case
1440
* it will be set below, but set
1441
* blocks_added to 0 since we don't
1442
* want to sync this with the list. */
1444
destroy_bg_record(bg_record);
1428
1447
if((bg_record->ionodes)
1429
1448
&& (job_ptr->part_ptr->max_share <= 1))
1441
1460
select_g_set_jobinfo(job_ptr->select_jobinfo,
1442
1461
SELECT_DATA_NODE_CNT,
1443
1462
&bg_record->node_cnt);
1444
select_g_set_jobinfo(job_ptr->select_jobinfo,
1445
SELECT_DATA_GEOMETRY,
1448
1464
/* tmp16 = bg_record->conn_type; */
1449
1465
/* select_g_set_jobinfo(job_ptr->select_jobinfo, */
1450
1466
/* SELECT_DATA_CONN_TYPE, */
1469
if (mode == SELECT_MODE_RUN_NOW) {
1470
_build_select_struct(job_ptr,
1471
slurm_block_bitmap);
1454
1474
error("we got a success, but no block back");
1458
if(bluegene_layout_mode == LAYOUT_DYNAMIC) {
1478
if(bg_conf->layout_mode == LAYOUT_DYNAMIC) {
1459
1479
slurm_mutex_lock(&block_state_mutex);
1460
1480
if(blocks_added)
1461
_sync_block_lists(block_list, bg_list);
1481
_sync_block_lists(block_list, bg_lists->main);
1462
1482
slurm_mutex_unlock(&block_state_mutex);
1463
1483
slurm_mutex_unlock(&create_dynamic_mutex);
1466
1486
list_destroy(block_list);
1471
1490
extern int test_job_list(List req_list)
1473
1492
int rc = SLURM_SUCCESS;
1475
1493
bg_record_t* bg_record = NULL;
1476
1494
bg_record_t* new_record = NULL;
1485
1503
slurm_mutex_lock(&job_list_test_mutex);
1487
if(bluegene_layout_mode == LAYOUT_DYNAMIC)
1505
if(bg_conf->layout_mode == LAYOUT_DYNAMIC)
1488
1506
slurm_mutex_lock(&create_dynamic_mutex);
1490
job_block_test_list = copy_bg_list(bg_job_block_list);
1508
job_block_test_list = copy_bg_list(bg_lists->job_running);
1492
1510
slurm_mutex_lock(&block_state_mutex);
1493
block_list = copy_bg_list(bg_list);
1511
block_list = copy_bg_list(bg_lists->main);
1494
1512
slurm_mutex_unlock(&block_state_mutex);
1496
1514
itr = list_iterator_create(req_list);
1640
1658
/* SELECT_DATA_BLOCK_ID, */
1641
1659
/* "unassigned"); */
1642
1660
/* if(will_run->job_ptr->num_procs */
1643
/* < bluegene_bp_node_cnt */
1661
/* < bg_conf->bp_node_cnt */
1644
1662
/* && will_run->job_ptr->num_procs */
1646
/* i = procs_per_node/ */
1664
/* i = bg_conf->procs_per_bp/ */
1647
1665
/* will_run->job_ptr-> */
1648
1666
/* num_procs; */
1649
1667
/* debug2("divide by %d", i); */
1652
1670
/* will_run->min_nodes *= */
1653
/* bluegene_bp_node_cnt/i; */
1671
/* bg_conf->bp_node_cnt/i; */
1654
1672
/* select_g_set_jobinfo( */
1655
1673
/* will_run->job_ptr-> */
1656
1674
/* select_jobinfo, */