~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid

« back to all changes in this revision

Viewing changes to src/plugins/select/bluegene/plugin/bg_job_run.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release 
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient
* Added Default-Start for runlevel 2 and 4 and $remote_fs requirement in
  init.d scripts (Closes: #541252)
* Postinst checks for incorrect runlevel 2 and 4 links
* Upgraded to Standards-Version 3.8.3
* Add lintian overrides for missing slurm-llnl-configurator.html in doc
  base registration
* Modified postrm scripts to ignore pkill return value in order to avoid
  postrm failure when no slurm process is running
* Checking for slurmctld.pid before cancelling running and pending
  jobs during package removal 

Show diffs side-by-side

added added

removed removed

Lines of Context:
2
2
 *  bg_job_run.c - blue gene job execution (e.g. initiation and termination) 
3
3
 *  functions.
4
4
 *
5
 
 *  $Id: bg_job_run.c 17202 2009-04-09 16:56:23Z da $ 
 
5
 *  $Id: bg_job_run.c 18162 2009-07-15 23:23:06Z da $ 
6
6
 *****************************************************************************
7
7
 *  Copyright (C) 2004-2006 The Regents of the University of California.
8
8
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
9
9
 *  Written by Morris Jette <jette1@llnl.gov>
10
10
 *  
11
11
 *  This file is part of SLURM, a resource management program.
12
 
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 
12
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 
13
 *  Please also read the included file: DISCLAIMER.
13
14
 *  
14
15
 *  SLURM is free software; you can redistribute it and/or modify it under
15
16
 *  the terms of the GNU General Public License as published by the Free
160
161
                if(job_state == RM_JOB_TERMINATED)
161
162
                        return STATUS_OK;
162
163
                else if(job_state == RM_JOB_DYING) {
163
 
                        /* start sending sigkills for the last 5 tries */
164
164
                        if(count > MAX_POLL_RETRIES) 
165
165
                                error("Job %d isn't dying, trying for "
166
 
                                      "%d seconds", count*POLL_INTERVAL);
 
166
                                      "%d seconds", job_id, 
 
167
                                      count*POLL_INTERVAL);
167
168
                        continue;
168
169
                } else if(job_state == RM_JOB_ERROR) {
169
170
                        error("job %d is in a error state.", job_id);
215
216
                /* remove user from list */             
216
217
                
217
218
                if(bg_record->target_name) {
218
 
                        if(strcmp(bg_record->target_name, bg_slurm_user_name)) {
 
219
                        if(strcmp(bg_record->target_name, 
 
220
                                  bg_conf->slurm_user_name)) {
219
221
                                xfree(bg_record->target_name);
220
222
                                bg_record->target_name = 
221
 
                                        xstrdup(bg_slurm_user_name);
 
223
                                        xstrdup(bg_conf->slurm_user_name);
222
224
                        }
223
225
                        update_block_user(bg_record, 1);
224
226
                } else {
225
 
                        bg_record->target_name = xstrdup(bg_slurm_user_name);
 
227
                        bg_record->target_name = 
 
228
                                xstrdup(bg_conf->slurm_user_name);
226
229
                }       
227
230
                
228
231
                        
230
233
                bg_record->boot_count = 0;
231
234
                
232
235
                last_bg_update = time(NULL);
233
 
                if(remove_from_bg_list(bg_job_block_list, bg_record) 
234
 
                   == SLURM_SUCCESS) {
235
 
                        num_unused_cpus += bg_record->cpu_cnt;
236
 
                }
 
236
                /* Only remove from the job_running list if
 
237
                   job_running == NO_JOB_RUNNING, since blocks in
 
238
                   error state could also be in this list and we don't
 
239
                   want to remove them.
 
240
                */
 
241
                if(bg_record->job_running == NO_JOB_RUNNING)
 
242
                        if(remove_from_bg_list(bg_lists->job_running,
 
243
                                               bg_record) 
 
244
                           == SLURM_SUCCESS) {
 
245
                                num_unused_cpus += bg_record->cpu_cnt;
 
246
                        }
237
247
        } else {
238
248
                error("No block given to reset");
239
249
                rc = SLURM_ERROR;
262
272
{
263
273
        bg_record_t * bg_record = NULL;
264
274
        
265
 
        bg_record = find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
 
275
        bg_record = find_bg_record_in_list(bg_lists->main,
 
276
                                           bg_update_ptr->bg_block_id);
266
277
        if(!bg_record) {
267
278
                error("No block %s", bg_update_ptr->bg_block_id);
268
279
                return;
269
280
        }
270
281
        slurm_mutex_lock(&block_state_mutex);
271
282
 
 
283
        bg_update_ptr->job_ptr->num_procs = bg_record->cpu_cnt;
 
284
        bg_update_ptr->job_ptr->total_procs = bg_update_ptr->job_ptr->num_procs;
272
285
        bg_record->job_running = bg_update_ptr->job_ptr->job_id;
273
286
        bg_record->job_ptr = bg_update_ptr->job_ptr;
274
287
 
275
 
        if(!block_ptr_exist_in_list(bg_job_block_list, bg_record)) {
276
 
                list_push(bg_job_block_list, bg_record);
 
288
        if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
 
289
                list_push(bg_lists->job_running, bg_record);
277
290
                num_unused_cpus -= bg_record->cpu_cnt;
278
291
        }
279
 
        if(!block_ptr_exist_in_list(bg_booted_block_list, bg_record)) 
280
 
                list_push(bg_booted_block_list, bg_record);
 
292
        if(!block_ptr_exist_in_list(bg_lists->booted, bg_record)) 
 
293
                list_push(bg_lists->booted, bg_record);
281
294
        slurm_mutex_unlock(&block_state_mutex);
282
295
 
283
296
        if(bg_record->state == RM_PARTITION_READY) {
325
338
 
326
339
        slurm_mutex_lock(&job_start_mutex);
327
340
                
328
 
        bg_record = find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
 
341
        bg_record = find_bg_record_in_list(bg_lists->main, 
 
342
                                           bg_update_ptr->bg_block_id);
329
343
 
330
344
        if(!bg_record) {
331
 
                error("block %s not found in bg_list",
 
345
                error("block %s not found in bg_lists->main",
332
346
                      bg_update_ptr->bg_block_id);
333
347
                /* wait for the slurmd to begin 
334
348
                   the batch script, slurm_fail_job() 
370
384
        
371
385
        delete_list = list_create(NULL);
372
386
        slurm_mutex_lock(&block_state_mutex);
373
 
        itr = list_iterator_create(bg_list);
 
387
        itr = list_iterator_create(bg_lists->main);
374
388
        while ((found_record = list_next(itr))) {
375
389
                if ((!found_record) || (bg_record == found_record))
376
390
                        continue;
400
414
                       found_record->bg_block_id, 
401
415
                       bg_record->bg_block_id);
402
416
                list_push(delete_list, found_record);
403
 
                if(bluegene_layout_mode == LAYOUT_DYNAMIC) {
 
417
                if(bg_conf->layout_mode == LAYOUT_DYNAMIC) 
404
418
                        list_remove(itr);
405
 
                }
406
 
                num_block_to_free++;
407
419
        }               
408
420
        list_iterator_destroy(itr);
409
421
 
410
422
        if(requeue_job) {
411
 
                num_block_to_free = 0;
412
 
                num_block_freed = 0;
 
423
                num_block_to_free = num_block_freed = 0;
413
424
                list_destroy(delete_list);
414
425
 
415
426
                _reset_block(bg_record);
446
457
        }
447
458
        /* Zero out the values here because we are done with them and
448
459
           they will be ready for the next job */
449
 
        num_block_to_free = 0;
450
 
        num_block_freed = 0;
 
460
        num_block_to_free = num_block_freed = 0;
451
461
        
452
462
        slurm_mutex_lock(&block_state_mutex);
453
463
        if(bg_record->job_running <= NO_JOB_RUNNING) {
637
647
        } else if (bg_record->state == RM_PARTITION_CONFIGURING) {
638
648
                bg_record->boot_state = 1;              
639
649
        }
640
 
        
 
650
 
641
651
        if(bg_record->job_running <= NO_JOB_RUNNING) {
642
652
                slurm_mutex_unlock(&job_start_mutex);
643
653
                debug("job %u finished during the start of the boot "
672
682
                   incase the fail job isn't ran */
673
683
                (void) slurm_fail_job(bg_record->job_running);
674
684
                slurm_mutex_lock(&block_state_mutex);
675
 
                if (remove_from_bg_list(bg_job_block_list, bg_record)
 
685
                if (remove_from_bg_list(bg_lists->job_running, bg_record)
676
686
                    == SLURM_SUCCESS) {
677
687
                        num_unused_cpus += bg_record->cpu_cnt;
678
688
                }
710
720
        }
711
721
        
712
722
                        
713
 
        if ((rc = bridge_get_data(job_list, RM_JobListSize, &jobs)) != STATUS_OK) {
 
723
        if ((rc = bridge_get_data(job_list, RM_JobListSize, &jobs)) 
 
724
            != STATUS_OK) {
714
725
                error("bridge_get_data(RM_JobListSize): %s", bg_err_str(rc));
715
726
                jobs = 0;
716
727
        }
778
789
#endif
779
790
        
780
791
        /* remove the block's users */
781
 
        bg_record = find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
 
792
        bg_record = find_bg_record_in_list(bg_lists->main,
 
793
                                           bg_update_ptr->bg_block_id);
782
794
        if(bg_record) {
783
795
                debug("got the record %s user is %s",
784
796
                      bg_record->bg_block_id,
804
816
                
805
817
                slurm_mutex_unlock(&block_state_mutex);
806
818
                
807
 
        } else if (bluegene_layout_mode == LAYOUT_DYNAMIC) {
 
819
        } else if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
808
820
                debug2("Hopefully we are destroying this block %s "
809
 
                       "since it isn't in the bg_list",
 
821
                       "since it isn't in the bg_lists->main",
810
822
                       bg_update_ptr->bg_block_id);
811
823
        } else {
812
824
                error("Could not find block %s previously assigned to job.  "
878
890
            &&  ((bg_update_list = list_create(_bg_list_del)) == NULL))
879
891
                fatal("malloc failure in start_job/list_create");
880
892
 
881
 
        /* push TERM_OP on the head of the queue
882
 
         * append START_OP and SYNC_OP to the tail of the queue */
883
 
        if (bg_update_ptr->op == TERM_OP) {
 
893
        /* push SYNC_OP and TERM_OP on the head of the queue
 
894
         * append START_OP to the tail of the queue */
 
895
        if (bg_update_ptr->op == START_OP) {
 
896
                if (list_append(bg_update_list, bg_update_ptr) == NULL)
 
897
                        fatal("malloc failure in _block_op/list_append");
 
898
        } else {
884
899
                if (list_push(bg_update_list, bg_update_ptr) == NULL)
885
900
                        fatal("malloc failure in _block_op/list_push");
886
 
        } else {
887
 
                if (list_enqueue(bg_update_list, bg_update_ptr) == NULL)
888
 
                        fatal("malloc failure in _block_op/list_enqueue");
889
901
        }
890
902
                
891
903
        /* already running MAX_AGENTS we don't really need more 
894
906
                slurm_mutex_unlock(&agent_cnt_mutex);
895
907
                return;
896
908
        }
 
909
 
 
910
        slurm_mutex_unlock(&agent_cnt_mutex);
897
911
        agent_cnt++;
898
 
        slurm_mutex_unlock(&agent_cnt_mutex);
899
912
        /* spawn an agent */
900
913
        slurm_attr_init(&attr_agent);
901
914
        if (pthread_attr_setdetachstate(&attr_agent, 
925
938
        if (!ret_list)
926
939
                fatal("malloc error");
927
940
 
928
 
        if(bg_list) {
929
 
                itr = list_iterator_create(bg_list);
 
941
        if(bg_lists->main) {
 
942
                itr = list_iterator_create(bg_lists->main);
930
943
                while ((block_ptr = (bg_record_t *) list_next(itr))) {
931
944
                        if ((block_ptr->user_name == NULL)
932
945
                            ||  (block_ptr->user_name[0] == '\0')
941
954
                }
942
955
                list_iterator_destroy(itr);
943
956
        } else {
944
 
                error("_get_all_blocks: no bg_list");
 
957
                error("_get_all_blocks: no bg_lists->main");
945
958
        }
946
959
 
947
960
        return ret_list;
1033
1046
                             SELECT_DATA_BLOCK_ID, 
1034
1047
                             &(bg_update_ptr->bg_block_id));
1035
1048
        select_g_get_jobinfo(job_ptr->select_jobinfo,
 
1049
                             SELECT_DATA_REBOOT, 
 
1050
                             &(bg_update_ptr->reboot));
 
1051
#ifdef HAVE_BGL
 
1052
        select_g_get_jobinfo(job_ptr->select_jobinfo,
1036
1053
                             SELECT_DATA_BLRTS_IMAGE, 
1037
1054
                             &(bg_update_ptr->blrtsimage));
1038
 
        select_g_get_jobinfo(job_ptr->select_jobinfo,
1039
 
                             SELECT_DATA_REBOOT, 
1040
 
                             &(bg_update_ptr->reboot));
1041
 
#ifdef HAVE_BGL
1042
1055
        if(!bg_update_ptr->blrtsimage) {
1043
 
                bg_update_ptr->blrtsimage = xstrdup(default_blrtsimage);
 
1056
                bg_update_ptr->blrtsimage =
 
1057
                        xstrdup(bg_conf->default_blrtsimage);
1044
1058
                select_g_set_jobinfo(job_ptr->select_jobinfo,
1045
1059
                                     SELECT_DATA_BLRTS_IMAGE, 
1046
1060
                                     bg_update_ptr->blrtsimage);
1055
1069
                             SELECT_DATA_LINUX_IMAGE, 
1056
1070
                             &(bg_update_ptr->linuximage));
1057
1071
        if(!bg_update_ptr->linuximage) {
1058
 
                bg_update_ptr->linuximage = xstrdup(default_linuximage);
 
1072
                bg_update_ptr->linuximage =
 
1073
                        xstrdup(bg_conf->default_linuximage);
1059
1074
                select_g_set_jobinfo(job_ptr->select_jobinfo,
1060
1075
                                     SELECT_DATA_LINUX_IMAGE, 
1061
1076
                                     bg_update_ptr->linuximage);
1064
1079
                             SELECT_DATA_MLOADER_IMAGE, 
1065
1080
                             &(bg_update_ptr->mloaderimage));
1066
1081
        if(!bg_update_ptr->mloaderimage) {
1067
 
                bg_update_ptr->mloaderimage = xstrdup(default_mloaderimage);
 
1082
                bg_update_ptr->mloaderimage = 
 
1083
                        xstrdup(bg_conf->default_mloaderimage);
1068
1084
                select_g_set_jobinfo(job_ptr->select_jobinfo,
1069
1085
                                     SELECT_DATA_MLOADER_IMAGE, 
1070
1086
                                     bg_update_ptr->mloaderimage);
1073
1089
                             SELECT_DATA_RAMDISK_IMAGE, 
1074
1090
                             &(bg_update_ptr->ramdiskimage));
1075
1091
        if(!bg_update_ptr->ramdiskimage) {
1076
 
                bg_update_ptr->ramdiskimage = xstrdup(default_ramdiskimage);
 
1092
                bg_update_ptr->ramdiskimage =
 
1093
                        xstrdup(bg_conf->default_ramdiskimage);
1077
1094
                select_g_set_jobinfo(job_ptr->select_jobinfo,
1078
1095
                                     SELECT_DATA_RAMDISK_IMAGE, 
1079
1096
                                     bg_update_ptr->ramdiskimage);
1080
1097
        }
1081
1098
        bg_record = 
1082
 
                find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
 
1099
                find_bg_record_in_list(bg_lists->main, 
 
1100
                                       bg_update_ptr->bg_block_id);
1083
1101
        if (bg_record) {
1084
1102
                slurm_mutex_lock(&block_state_mutex);
1085
1103
                job_ptr->num_procs = bg_record->cpu_cnt;
 
1104
                job_ptr->total_procs = job_ptr->num_procs;
1086
1105
                bg_record->job_running = bg_update_ptr->job_ptr->job_id;
1087
1106
                bg_record->job_ptr = bg_update_ptr->job_ptr;
1088
 
                if(!block_ptr_exist_in_list(bg_job_block_list, bg_record)) {
1089
 
                        list_push(bg_job_block_list, bg_record);
 
1107
                if(!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
 
1108
                        list_push(bg_lists->job_running, bg_record);
1090
1109
                        num_unused_cpus -= bg_record->cpu_cnt;
1091
1110
                }
1092
 
                if(!block_ptr_exist_in_list(bg_booted_block_list, bg_record))
1093
 
                        list_push(bg_booted_block_list, bg_record);
 
1111
                if(!block_ptr_exist_in_list(bg_lists->booted, bg_record))
 
1112
                        list_push(bg_lists->booted, bg_record);
1094
1113
                slurm_mutex_unlock(&block_state_mutex);
1095
1114
        } else {
1096
1115
                error("bg_record %s doesn't exist, requested for job (%d)", 
1164
1183
                                continue;
1165
1184
                        
1166
1185
                        bg_update_ptr = xmalloc(sizeof(bg_update_t));
 
1186
                        bg_update_ptr->op = SYNC_OP;
 
1187
                        bg_update_ptr->job_ptr = job_ptr;
 
1188
 
1167
1189
                        select_g_get_jobinfo(job_ptr->select_jobinfo,
1168
1190
                                             SELECT_DATA_BLOCK_ID, 
1169
1191
                                             &(bg_update_ptr->bg_block_id));
1171
1193
                        select_g_get_jobinfo(job_ptr->select_jobinfo,
1172
1194
                                             SELECT_DATA_BLRTS_IMAGE, 
1173
1195
                                             &(bg_update_ptr->blrtsimage));
 
1196
#else
 
1197
                        select_g_get_jobinfo(job_ptr->select_jobinfo,
 
1198
                                             SELECT_DATA_CONN_TYPE, 
 
1199
                                             &(bg_update_ptr->conn_type));
1174
1200
#endif
1175
1201
                        select_g_get_jobinfo(job_ptr->select_jobinfo,
1176
1202
                                             SELECT_DATA_LINUX_IMAGE, 
1213
1239
                               job_ptr->job_id, 
1214
1240
                               bg_update_ptr->bg_block_id,
1215
1241
                               job_ptr->end_time);
1216
 
                        bg_update_ptr->op = SYNC_OP;
1217
 
                        bg_update_ptr->job_ptr = job_ptr;
1218
1242
                        _block_op(bg_update_ptr);
1219
1243
                }
1220
1244
                list_iterator_destroy(job_iterator);
1258
1282
        int rc; 
1259
1283
                
1260
1284
        if ((rc = bridge_set_block_owner(bg_record->bg_block_id, 
1261
 
                                         bg_slurm_user_name)) 
 
1285
                                         bg_conf->slurm_user_name)) 
1262
1286
            != STATUS_OK) {
1263
1287
                error("bridge_set_block_owner(%s,%s): %s", 
1264
1288
                      bg_record->bg_block_id, 
1265
 
                      bg_slurm_user_name,
 
1289
                      bg_conf->slurm_user_name,
1266
1290
                      bg_err_str(rc));
1267
1291
                return SLURM_ERROR;
1268
1292
        }       
1291
1315
        }
1292
1316
        
1293
1317
        slurm_mutex_lock(&block_state_mutex);
1294
 
        if(!block_ptr_exist_in_list(bg_booted_block_list, bg_record))
1295
 
                list_push(bg_booted_block_list, bg_record);
 
1318
        if(!block_ptr_exist_in_list(bg_lists->booted, bg_record))
 
1319
                list_push(bg_lists->booted, bg_record);
1296
1320
        slurm_mutex_unlock(&block_state_mutex);
1297
1321
        
1298
1322
        rc = 0;
1317
1341
        slurm_mutex_unlock(&block_state_mutex);
1318
1342
#else
1319
1343
        slurm_mutex_lock(&block_state_mutex);
1320
 
        if(!block_ptr_exist_in_list(bg_booted_block_list, bg_record))
1321
 
                list_push(bg_booted_block_list, bg_record);
 
1344
        if(!block_ptr_exist_in_list(bg_lists->booted, bg_record))
 
1345
                list_push(bg_lists->booted, bg_record);
1322
1346
        bg_record->state = RM_PARTITION_READY;
1323
1347
        last_bg_update = time(NULL);
1324
1348
        slurm_mutex_unlock(&block_state_mutex);