~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid

« back to all changes in this revision

Viewing changes to src/slurmctld/step_mgr.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release 
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient 
* Added Default-Start for runlevels 2 and 4 and $remote_fs requirement in
  init.d scripts (Closes: #541252)
* Postinst checks for wrong runlevels 2 and 4 links
* Upgraded to standards version 3.8.3
* Add lintian overrides for missing slurm-llnl-configurator.html in doc
  base registration
* Modified postrm scripts to ignore pkill return value in order to avoid
  postrm failure when no slurm process is running
* Checking for slurmctld.pid before cancelling running and pending
  jobs during package removal 

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
/*****************************************************************************\
2
2
 *  step_mgr.c - manage the job step information of slurm
3
 
 *  $Id: step_mgr.c 16584 2009-02-18 19:03:40Z jette $
4
3
 *****************************************************************************
5
4
 *  Copyright (C) 2002-2007 The Regents of the University of California.
 
5
 *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
6
6
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7
7
 *  Written by Morris Jette <jette1@llnl.gov>, et. al.
8
 
 *  LLNL-CODE-402394.
 
8
 *  CODE-OCEC-09-009. All rights reserved.
9
9
 *  
10
10
 *  This file is part of SLURM, a resource management program.
11
 
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 
11
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 
12
 *  Please also read the included file: DISCLAIMER.
12
13
 *  
13
14
 *  SLURM is free software; you can redistribute it and/or modify it under
14
15
 *  the terms of the GNU General Public License as published by the Free
55
56
 
56
57
#include "src/common/bitstring.h"
57
58
#include "src/common/checkpoint.h"
 
59
#include "src/common/forward.h"
 
60
#include "src/common/slurm_accounting_storage.h"
 
61
#include "src/common/slurm_jobacct_gather.h"
58
62
#include "src/common/slurm_protocol_interface.h"
59
63
#include "src/common/switch.h"
60
64
#include "src/common/xstring.h"
61
 
#include "src/common/forward.h"
62
 
#include "src/common/slurm_accounting_storage.h"
63
 
#include "src/common/slurm_jobacct_gather.h"
64
65
 
65
66
#include "src/slurmctld/agent.h"
66
67
#include "src/slurmctld/locks.h"
67
68
#include "src/slurmctld/node_scheduler.h"
 
69
#include "src/slurmctld/port_mgr.h"
68
70
#include "src/slurmctld/slurmctld.h"
69
71
#include "src/slurmctld/srun_comm.h"
70
72
 
71
 
#define STEP_DEBUG 0
72
73
#define MAX_RETRIES 10
73
74
 
74
75
static int  _count_cpus(bitstr_t *bitmap);
 
76
static struct step_record * _create_step_record (struct job_record *job_ptr);
 
77
static void _dump_step_layout(struct step_record *step_ptr);
 
78
static void _free_step_rec(struct step_record *step_ptr);
75
79
static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer);
76
80
static bitstr_t * _pick_step_nodes (struct job_record  *job_ptr, 
77
81
                                    job_step_create_request_msg_t *step_spec,
78
 
                                    bool batch_step, int *return_code);
 
82
                                    int cpus_per_task, bool batch_step,
 
83
                                    int *return_code);
79
84
static hostlist_t _step_range_to_hostlist(struct step_record *step_ptr,
80
85
                                uint32_t range_first, uint32_t range_last);
81
86
static int _step_hostname_to_inx(struct step_record *step_ptr,
84
89
 
85
90
 
86
91
/* 
87
 
 * create_step_record - create an empty step_record for the specified job.
 
92
 * _create_step_record - create an empty step_record for the specified job.
88
93
 * IN job_ptr - pointer to job table entry to have step record added
89
94
 * RET a pointer to the record or NULL if error
90
95
 * NOTE: allocates memory that should be xfreed with delete_step_record
91
96
 */
92
 
struct step_record * 
93
 
create_step_record (struct job_record *job_ptr) 
 
97
static struct step_record * _create_step_record(struct job_record *job_ptr)
94
98
{
95
99
        struct step_record *step_ptr;
96
100
 
97
101
        xassert(job_ptr);
 
102
        /* NOTE: Reserve highest step ID values for NO_VAL and
 
103
         * SLURM_BATCH_SCRIPT */
 
104
        if (job_ptr->next_step_id >= 0xfffffff0) {
 
105
                /* avoid step records in the accounting database */
 
106
                info("job %u has reached step id limit", job_ptr->job_id);
 
107
                return NULL;
 
108
        }
 
109
 
98
110
        step_ptr = (struct step_record *) xmalloc(sizeof (struct step_record));
99
111
 
100
112
        last_job_update = time(NULL);
101
 
        step_ptr->job_ptr = job_ptr; 
102
 
        step_ptr->step_id = (job_ptr->next_step_id)++;
 
113
        step_ptr->job_ptr = job_ptr;
103
114
        step_ptr->start_time = time(NULL) ;
104
115
        step_ptr->jobacct = jobacct_gather_g_create(NULL);
105
 
        step_ptr->ckpt_path = NULL;
 
116
        step_ptr->ckpt_dir = NULL;
106
117
        if (list_append (job_ptr->step_list, step_ptr) == NULL)
107
 
                fatal ("create_step_record: unable to allocate memory");
 
118
                fatal ("_create_step_record: unable to allocate memory");
108
119
 
109
120
        return step_ptr;
110
121
}
139
150
                        switch_free_jobinfo(step_ptr->switch_job);
140
151
                }
141
152
                checkpoint_free_jobinfo(step_ptr->check_job);
142
 
                xfree(step_ptr->host);
143
 
                xfree(step_ptr->name);
144
 
                slurm_step_layout_destroy(step_ptr->step_layout);
145
 
                jobacct_gather_g_destroy(step_ptr->jobacct);
146
 
                FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
147
 
                FREE_NULL_BITMAP(step_ptr->exit_node_bitmap);
148
 
                if (step_ptr->network)
149
 
                        xfree(step_ptr->network);
150
 
                xfree(step_ptr->ckpt_path);
151
 
                xfree(step_ptr);
 
153
                _free_step_rec(step_ptr);
152
154
        }               
153
155
 
154
156
        list_iterator_destroy (step_iterator);
155
157
}
156
158
 
 
159
/* _free_step_rec - delete a step record's data structures */
 
160
static void _free_step_rec(struct step_record *step_ptr)
 
161
{
 
162
        xfree(step_ptr->host);
 
163
        xfree(step_ptr->name);
 
164
        slurm_step_layout_destroy(step_ptr->step_layout);
 
165
        jobacct_gather_g_destroy(step_ptr->jobacct);
 
166
        FREE_NULL_BITMAP(step_ptr->core_bitmap_job);
 
167
        FREE_NULL_BITMAP(step_ptr->exit_node_bitmap);
 
168
        FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
 
169
        xfree(step_ptr->resv_port_array);
 
170
        xfree(step_ptr->resv_ports);
 
171
        xfree(step_ptr->network);
 
172
        xfree(step_ptr->ckpt_dir);
 
173
        xfree(step_ptr);
 
174
}
157
175
 
158
176
/* 
159
177
 * delete_step_record - delete record for job step for specified job_ptr 
186
204
                                        step_ptr->step_layout->node_list);
187
205
                                switch_free_jobinfo (step_ptr->switch_job);
188
206
                        }
 
207
                        resv_port_free(step_ptr);
189
208
                        checkpoint_free_jobinfo (step_ptr->check_job);
190
 
 
191
 
                        if (step_ptr->mem_per_task)
192
 
                                select_g_step_fini(step_ptr);
193
 
 
194
 
                        xfree(step_ptr->host);
195
 
                        xfree(step_ptr->name);
196
 
                        slurm_step_layout_destroy(step_ptr->step_layout);
197
 
                        jobacct_gather_g_destroy(step_ptr->jobacct);
198
 
                        FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
199
 
                        FREE_NULL_BITMAP(step_ptr->exit_node_bitmap);
200
 
                        if (step_ptr->network)
201
 
                                xfree(step_ptr->network);
202
 
                        xfree(step_ptr->ckpt_path);
203
 
                        xfree(step_ptr);
 
209
                        _free_step_rec(step_ptr);
204
210
                        error_code = 0;
205
211
                        break;
206
212
                }
218
224
void
219
225
dump_step_desc(job_step_create_request_msg_t *step_spec)
220
226
{
221
 
        if (step_spec == NULL) 
222
 
                return;
223
 
 
224
 
        debug3("StepDesc: user_id=%u job_id=%u node_count=%u, cpu_count=%u", 
225
 
                step_spec->user_id, step_spec->job_id, 
226
 
                step_spec->node_count, step_spec->cpu_count);
 
227
        debug3("StepDesc: user_id=%u job_id=%u node_count=%u cpu_count=%u", 
 
228
               step_spec->user_id, step_spec->job_id, 
 
229
               step_spec->node_count, step_spec->cpu_count);
227
230
        debug3("   num_tasks=%u relative=%u task_dist=%u node_list=%s", 
228
 
                step_spec->num_tasks, step_spec->relative, 
229
 
                step_spec->task_dist, step_spec->node_list);
230
 
        debug3("   host=%s port=%u name=%s network=%s checkpoint=%u", 
231
 
                step_spec->host, step_spec->port, step_spec->name,
232
 
                step_spec->network, step_spec->ckpt_interval);
233
 
        debug3("   checkpoint-path=%s exclusive=%u immediate=%u mem_per_task=%u",
234
 
                step_spec->ckpt_path, step_spec->exclusive, 
235
 
                step_spec->immediate, step_spec->mem_per_task);
 
231
               step_spec->num_tasks, step_spec->relative, 
 
232
               step_spec->task_dist, step_spec->node_list);
 
233
        debug3("   host=%s port=%u name=%s network=%s exclusive=%u", 
 
234
               step_spec->host, step_spec->port, step_spec->name,
 
235
               step_spec->network, step_spec->exclusive);
 
236
        debug3("   checkpoint-dir=%s checkpoint_int=%u",
 
237
               step_spec->ckpt_dir, step_spec->ckpt_interval);
 
238
        debug3("   mem_per_task=%u resv_port_cnt=%u immediate=%u no_kill=%u",
 
239
               step_spec->mem_per_task, step_spec->resv_port_cnt,
 
240
               step_spec->immediate, step_spec->no_kill);
 
241
        debug3("   overcommit=%d",
 
242
               step_spec->overcommit);
236
243
}
237
244
 
238
245
 
244
251
 * RET pointer to the job step's record, NULL on error
245
252
 */
246
253
struct step_record *
247
 
find_step_record(struct job_record *job_ptr, uint16_t step_id) 
 
254
find_step_record(struct job_record *job_ptr, uint32_t step_id) 
248
255
{
249
256
        ListIterator step_iterator;
250
257
        struct step_record *step_ptr;
254
261
 
255
262
        step_iterator = list_iterator_create (job_ptr->step_list);
256
263
        while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
257
 
                if ((step_ptr->step_id == step_id)
258
 
                ||  ((uint16_t) step_id == (uint16_t) NO_VAL)) {
 
264
                if ((step_ptr->step_id == step_id) || (step_id == NO_VAL))
259
265
                        break;
260
 
                }
261
266
        }               
262
267
        list_iterator_destroy (step_iterator);
263
268
 
424
429
 *      we satisfy the super-set of constraints.
425
430
 * IN job_ptr - pointer to job to have new step started
426
431
 * IN step_spec - job step specification
 
432
 * IN cpus_per_task - NOTE could be zero
427
433
 * IN batch_step - if set then step is a batch script
428
434
 * OUT return_code - exit code or SLURM_SUCCESS
429
435
 * global: node_record_table_ptr - pointer to global node table
433
439
static bitstr_t *
434
440
_pick_step_nodes (struct job_record  *job_ptr, 
435
441
                  job_step_create_request_msg_t *step_spec,
 
442
                  int cpus_per_task,
436
443
                  bool batch_step, int *return_code)
437
444
{
438
 
 
439
445
        bitstr_t *nodes_avail = NULL, *nodes_idle = NULL;
440
446
        bitstr_t *nodes_picked = NULL, *node_tmp = NULL;
441
447
        int error_code, nodes_picked_cnt=0, cpus_picked_cnt = 0, i;
442
448
        ListIterator step_iterator;
443
449
        struct step_record *step_p;
444
 
#if STEP_DEBUG
445
 
        char *temp;
446
 
#endif
 
450
        select_job_res_t select_ptr = job_ptr->select_job;
 
451
 
 
452
        xassert(select_ptr);
 
453
        xassert(select_ptr->cpus);
 
454
        xassert(select_ptr->cpus_used);
447
455
 
448
456
        *return_code = SLURM_SUCCESS;
449
457
        if (job_ptr->node_bitmap == NULL) {
456
464
                fatal("bit_copy malloc failure");
457
465
        bit_and (nodes_avail, up_node_bitmap);
458
466
 
459
 
        /* In exclusive mode, satisfy the processor count.
460
 
         * Do not use nodes that have no unused CPUs */
 
467
        if (step_spec->mem_per_task &&
 
468
            ((select_ptr->memory_allocated == NULL) ||
 
469
             (select_ptr->memory_used == NULL))) {
 
470
                error("_pick_step_nodes: lack memory allocation details "
 
471
                      "to enforce memory limits for job %u", job_ptr->job_id);
 
472
                step_spec->mem_per_task = 0;
 
473
        }
 
474
 
 
475
        if (job_ptr->next_step_id == 0) {
 
476
                if (job_ptr->details && job_ptr->details->prolog_running) {
 
477
                        *return_code = ESLURM_PROLOG_RUNNING;
 
478
                        return NULL;
 
479
                }
 
480
                for (i=bit_ffs(job_ptr->node_bitmap); i<node_record_count; 
 
481
                     i++) {
 
482
                        if (!bit_test(job_ptr->node_bitmap, i))
 
483
                                continue;
 
484
                        if ((node_record_table_ptr[i].node_state &
 
485
                             NODE_STATE_POWER_SAVE) ||
 
486
                            (node_record_table_ptr[i].node_state &
 
487
                             NODE_STATE_NO_RESPOND)) {
 
488
                                /* Node is/was powered down. Need to wait 
 
489
                                 * for it to start responding again. */
 
490
                                FREE_NULL_BITMAP(nodes_avail);
 
491
                                *return_code = ESLURM_NODES_BUSY;
 
492
                                /* Update job's end-time to allow for node
 
493
                                 * boot time. */
 
494
                                if (job_ptr->time_limit != INFINITE) {
 
495
                                        job_ptr->end_time = time(NULL) + 
 
496
                                                (job_ptr->time_limit * 60);
 
497
                                }
 
498
                                return NULL;
 
499
                        }
 
500
                }
 
501
        }
 
502
 
 
503
        /* In exclusive mode, just satisfy the processor count.
 
504
         * Do not use nodes that have no unused CPUs or insufficient 
 
505
         * unused memory */
461
506
        if (step_spec->exclusive) {
462
 
                int i, j=0, avail, tot_cpus = 0;
 
507
                int avail_cpus, avail_tasks, total_cpus, total_tasks, node_inx;
 
508
                int i_first, i_last;
 
509
                uint32_t avail_mem, total_mem;
 
510
                uint32_t nodes_picked_cnt = 0;
 
511
                uint32_t tasks_picked_cnt = 0, total_task_cnt = 0;
463
512
                bitstr_t *selected_nodes = NULL;
464
513
 
465
514
                if (step_spec->node_list) {
487
536
                                goto cleanup;
488
537
                        }
489
538
                }
490
 
                for (i=bit_ffs(job_ptr->node_bitmap); i<node_record_count;
491
 
                     i++) {
492
 
                        if (!bit_test(job_ptr->node_bitmap, i))
 
539
 
 
540
                node_inx = -1;
 
541
                i_first = bit_ffs(select_ptr->node_bitmap);
 
542
                i_last  = bit_fls(select_ptr->node_bitmap);
 
543
                for (i=i_first; i<=i_last; i++) {
 
544
                        if (!bit_test(select_ptr->node_bitmap, i))
493
545
                                continue;
494
 
                        if (selected_nodes && (!bit_test(selected_nodes, i)))
495
 
                                avail = 0;
 
546
                        node_inx++;
 
547
                        if (!bit_test(nodes_avail, i))
 
548
                                continue;       /* node now DOWN */
 
549
                        avail_cpus = select_ptr->cpus[node_inx] - 
 
550
                                     select_ptr->cpus_used[node_inx];
 
551
                        total_cpus = select_ptr->cpus[node_inx];
 
552
                        if (cpus_per_task > 0) {
 
553
                                avail_tasks = avail_cpus / cpus_per_task;
 
554
                                total_tasks = total_cpus / cpus_per_task;
 
555
                        } else {
 
556
                                avail_tasks = step_spec->num_tasks;
 
557
                                total_tasks = step_spec->num_tasks;
 
558
                        }
 
559
                        if (step_spec->mem_per_task) {
 
560
                                avail_mem = select_ptr->
 
561
                                            memory_allocated[node_inx] -
 
562
                                            select_ptr->memory_used[node_inx];
 
563
                                avail_mem /= step_spec->mem_per_task;
 
564
                                avail_tasks = MIN(avail_tasks, avail_mem);
 
565
                                total_mem = select_ptr->
 
566
                                            memory_allocated[node_inx];
 
567
                                total_mem /= step_spec->mem_per_task;
 
568
                                total_tasks = MIN(total_tasks, total_mem);
 
569
                        }
 
570
                        if ((avail_tasks <= 0) ||
 
571
                            ((selected_nodes == NULL) &&
 
572
                             (nodes_picked_cnt >= step_spec->node_count) &&
 
573
                             (tasks_picked_cnt > 0) &&
 
574
                             (tasks_picked_cnt >= step_spec->num_tasks)))
 
575
                                bit_clear(nodes_avail, i);
496
576
                        else {
497
 
                                avail = job_ptr->alloc_lps[j] - 
498
 
                                        job_ptr->used_lps[j];
499
 
                                tot_cpus += job_ptr->alloc_lps[j];
 
577
                                nodes_picked_cnt++;
 
578
                                tasks_picked_cnt += avail_tasks;
500
579
                        }
501
 
                        if ((avail <= 0) ||
502
 
                            ((selected_nodes == NULL) &&
503
 
                             (cpus_picked_cnt > 0) &&
504
 
                             (cpus_picked_cnt >= step_spec->cpu_count)))
505
 
                                bit_clear(nodes_avail, i);
506
 
                        else
507
 
                                cpus_picked_cnt += avail;
508
 
                        if (++j >= job_ptr->node_cnt)
509
 
                                break;
 
580
                        total_task_cnt += total_tasks;
510
581
                }
 
582
 
511
583
                if (selected_nodes) {
512
584
                        if (!bit_equal(selected_nodes, nodes_avail)) {
513
585
                                /* some required nodes have no available
514
586
                                 * processors, defer request */
515
 
                                cpus_picked_cnt = 0;
 
587
                                tasks_picked_cnt = 0;
516
588
                        }
517
589
                        bit_free(selected_nodes);
518
590
                }
519
 
                if (cpus_picked_cnt >= step_spec->cpu_count)
 
591
 
 
592
                if (tasks_picked_cnt >= step_spec->num_tasks)
520
593
                        return nodes_avail;
521
 
 
522
594
                FREE_NULL_BITMAP(nodes_avail);
523
 
                if (tot_cpus >= step_spec->cpu_count)
 
595
                if (total_task_cnt >= step_spec->num_tasks)
524
596
                        *return_code = ESLURM_NODES_BUSY;
525
597
                else
526
598
                        *return_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
527
599
                return NULL;
528
600
        }
529
601
 
530
 
        if ( step_spec->node_count == INFINITE) /* use all nodes */
 
602
        if (step_spec->mem_per_task) {
 
603
                int node_inx = 0, usable_mem;
 
604
                for (i=bit_ffs(select_ptr->node_bitmap); i<node_record_count; 
 
605
                     i++) {
 
606
                        if (!bit_test(select_ptr->node_bitmap, i))
 
607
                                continue;
 
608
                        usable_mem = select_ptr->memory_allocated[node_inx] -
 
609
                                     select_ptr->memory_used[node_inx];
 
610
                        usable_mem /= step_spec->mem_per_task;
 
611
                        if (usable_mem <= 0) {
 
612
                                if (step_spec->node_count == INFINITE) {
 
613
                                        FREE_NULL_BITMAP(nodes_avail);
 
614
                                        *return_code = 
 
615
                                                ESLURM_INVALID_TASK_MEMORY;
 
616
                                        return NULL;
 
617
                                }
 
618
                                bit_clear(nodes_avail, i);
 
619
                        }
 
620
                        if (++node_inx >= select_ptr->nhosts)
 
621
                                break;
 
622
                }
 
623
        }
 
624
 
 
625
        if (step_spec->node_count == INFINITE)  /* use all nodes */
531
626
                return nodes_avail;
532
627
 
533
628
        if (step_spec->node_list) {
534
629
                bitstr_t *selected_nodes = NULL;
535
 
#if STEP_DEBUG
536
 
                info("selected nodelist is %s", step_spec->node_list);
537
 
#endif
 
630
                if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS)
 
631
                        info("selected nodelist is %s", step_spec->node_list);
538
632
                error_code = node_name2bitmap(step_spec->node_list, false, 
539
633
                                              &selected_nodes);
540
634
                
552
646
                        goto cleanup;
553
647
                }
554
648
                if (!bit_super_set(selected_nodes, nodes_avail)) {
555
 
                        info ("_pick_step_nodes: requested nodes %s are DOWN",
556
 
                              step_spec->node_list);
 
649
                        info ("_pick_step_nodes: requested nodes %s "
 
650
                              "have inadequate memory",
 
651
                               step_spec->node_list);
557
652
                        bit_free(selected_nodes);
558
653
                        goto cleanup;
559
654
                }
560
 
                if(step_spec->task_dist == SLURM_DIST_ARBITRARY) {
 
655
                if (step_spec->task_dist == SLURM_DIST_ARBITRARY) {
561
656
                        /* if we are in arbitrary mode we need to make
562
657
                         * sure we aren't running on an elan switch.
563
658
                         * If we aren't change the number of nodes
636
731
                while ((step_p = (struct step_record *)
637
732
                        list_next(step_iterator))) {
638
733
                        bit_or(nodes_idle, step_p->step_node_bitmap);
639
 
#if STEP_DEBUG
640
 
                        temp = bitmap2node_name(step_p->step_node_bitmap);
641
 
                        info("step %d has nodes %s", step_p->step_id, temp);
642
 
                        xfree(temp);
643
 
#endif
 
734
                        if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) {
 
735
                                char *temp;
 
736
                                temp = bitmap2node_name(step_p->
 
737
                                                        step_node_bitmap);
 
738
                                info("step %u.%u has nodes %s", 
 
739
                                     job_ptr->job_id, step_p->step_id, temp);
 
740
                                xfree(temp);
 
741
                        }
644
742
                } 
645
743
                list_iterator_destroy (step_iterator);
646
744
                bit_not(nodes_idle);
647
745
                bit_and(nodes_idle, nodes_avail);
648
746
        }
649
 
#if STEP_DEBUG
650
 
        temp = bitmap2node_name(nodes_avail);
651
 
        info("can pick from %s %d", temp, step_spec->node_count);
652
 
        xfree(temp);
653
 
        temp = bitmap2node_name(nodes_idle);
654
 
        info("can pick from %s", temp);
655
 
        xfree(temp);
656
 
#endif
 
747
 
 
748
        if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) {
 
749
                char *temp1, *temp2;
 
750
                temp1 = bitmap2node_name(nodes_avail);
 
751
                temp2 = bitmap2node_name(nodes_idle);
 
752
                info("step pick %u nodes, avail:%s idle:%s", 
 
753
                     step_spec->node_count, temp1, temp2);
 
754
                xfree(temp1);
 
755
                xfree(temp2);
 
756
        }
657
757
 
658
758
        /* if user specifies step needs a specific processor count and 
659
759
         * all nodes have the same processor count, just translate this to
660
760
         * a node count */
661
 
        if (step_spec->cpu_count && (job_ptr->num_cpu_groups == 1) && 
662
 
            job_ptr->cpus_per_node[0]) {
663
 
                i = (step_spec->cpu_count + (job_ptr->cpus_per_node[0] - 1) ) 
664
 
                                / job_ptr->cpus_per_node[0];
 
761
        if (step_spec->cpu_count && job_ptr->select_job && 
 
762
            (job_ptr->select_job->cpu_array_cnt == 1) &&
 
763
            job_ptr->select_job->cpu_array_value) {
 
764
                i = (step_spec->cpu_count + 
 
765
                     (job_ptr->select_job->cpu_array_value[0] - 1)) /
 
766
                    job_ptr->select_job->cpu_array_value[0];
665
767
                step_spec->node_count = (i > step_spec->node_count) ? 
666
 
                                                i : step_spec->node_count ;
 
768
                                         i : step_spec->node_count ;
667
769
                //step_spec->cpu_count = 0;
668
770
        }
669
771
 
670
772
        if (step_spec->node_count) {
671
773
                nodes_picked_cnt = bit_set_count(nodes_picked);
672
 
#if STEP_DEBUG
673
 
                info("got %u %d", step_spec->node_count, nodes_picked_cnt);
674
 
#endif
 
774
                if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) {
 
775
                        verbose("got %u %d", step_spec->node_count, 
 
776
                                nodes_picked_cnt);
 
777
                }
675
778
                if (nodes_idle 
676
779
                    && (bit_set_count(nodes_idle) >= step_spec->node_count)
677
780
                    && (step_spec->node_count > nodes_picked_cnt)) {
708
811
                cpus_picked_cnt = _count_cpus(nodes_picked);
709
812
                /* user is requesting more cpus than we got from the
710
813
                 * picked nodes we should return with an error */
711
 
                if(step_spec->cpu_count > cpus_picked_cnt) {
 
814
                if (step_spec->cpu_count > cpus_picked_cnt) {
712
815
                        debug2("Have %d nodes with %d cpus which is less "
713
816
                               "than what the user is asking for (%d cpus) "
714
817
                               "aborting.",
753
856
        return sum;
754
857
}
755
858
 
 
859
/* Update the step's core bitmaps, create as needed.
 
860
 *      Add the specified task count for a specific node in the job's 
 
861
 *      and step's allocation */
 
862
static void _pick_step_cores(struct step_record *step_ptr, 
 
863
                             select_job_res_t select_ptr, 
 
864
                             int job_node_inx, uint16_t task_cnt)
 
865
{
 
866
        int bit_offset, core_inx, i, sock_inx;
 
867
        uint16_t sockets, cores;
 
868
        int cpu_cnt = (int) task_cnt;
 
869
        bool use_all_cores;
 
870
        static int last_core_inx;
 
871
 
 
872
        if (!step_ptr->core_bitmap_job) {
 
873
                step_ptr->core_bitmap_job = bit_alloc(bit_size(select_ptr->
 
874
                                                               core_bitmap));
 
875
        }
 
876
        if (get_select_job_res_cnt(select_ptr, job_node_inx, &sockets, &cores))
 
877
                fatal("get_select_job_res_cnt");
 
878
 
 
879
        if (task_cnt == (cores * sockets))
 
880
                use_all_cores = true;
 
881
        else
 
882
                use_all_cores = false;
 
883
        if (step_ptr->cpus_per_task > 0)
 
884
                cpu_cnt *= step_ptr->cpus_per_task;
 
885
 
 
886
        /* select idle cores first */
 
887
        for (core_inx=0; core_inx<cores; core_inx++) {
 
888
                for (sock_inx=0; sock_inx<sockets; sock_inx++) {
 
889
                        bit_offset = get_select_job_res_offset(select_ptr,
 
890
                                                               job_node_inx,
 
891
                                                               sock_inx, 
 
892
                                                               core_inx);
 
893
                        if (bit_offset < 0)
 
894
                                fatal("get_select_job_res_offset");
 
895
                        if (!bit_test(select_ptr->core_bitmap, bit_offset))
 
896
                                continue;
 
897
                        if ((use_all_cores == false) &&
 
898
                            bit_test(select_ptr->core_bitmap_used, bit_offset))
 
899
                                continue;
 
900
                        bit_set(select_ptr->core_bitmap_used, bit_offset);
 
901
                        bit_set(step_ptr->core_bitmap_job, bit_offset);
 
902
#if 0
 
903
                        info("step alloc N:%d S:%dC :%d", 
 
904
                             job_node_inx, sock_inx, core_inx);
 
905
#endif
 
906
                        if (--cpu_cnt == 0)
 
907
                                return;
 
908
                }
 
909
        }
 
910
        if (use_all_cores)
 
911
                return;
 
912
 
 
913
        /* We need to over-subscribe one or more cores.
 
914
         * Use last_core_inx to avoid putting all of the extra
 
915
         * work onto core zero */
 
916
        verbose("job step needs to over-subscribe cores");
 
917
        last_core_inx = (last_core_inx + 1) % cores;
 
918
        for (i=0; i<cores; i++) {
 
919
                core_inx = (last_core_inx + i) % cores;
 
920
                for (sock_inx=0; sock_inx<sockets; sock_inx++) {
 
921
                        bit_offset = get_select_job_res_offset(select_ptr,
 
922
                                                               job_node_inx,
 
923
                                                               sock_inx, 
 
924
                                                               core_inx);
 
925
                        if (bit_offset < 0)
 
926
                                fatal("get_select_job_res_offset");
 
927
                        if (!bit_test(select_ptr->core_bitmap, bit_offset))
 
928
                                continue;
 
929
                        if (bit_test(step_ptr->core_bitmap_job, bit_offset))
 
930
                                continue;   /* already taken by this step */
 
931
                        bit_set(step_ptr->core_bitmap_job, bit_offset);
 
932
#if 0
 
933
                        info("step alloc N:%d S:%dC :%d", 
 
934
                             job_node_inx, sock_inx, core_inx);
 
935
#endif
 
936
                        if (--cpu_cnt == 0)
 
937
                                return;
 
938
                }
 
939
        }
 
940
}
 
941
 
 
942
 
756
943
/* Update a job's record of allocated CPUs when a job step gets scheduled */
757
944
extern void step_alloc_lps(struct step_record *step_ptr)
758
945
{
759
946
        struct job_record  *job_ptr = step_ptr->job_ptr;
 
947
        select_job_res_t select_ptr = job_ptr->select_job;
 
948
        int cpus_alloc;
760
949
        int i_node, i_first, i_last;
761
950
        int job_node_inx = -1, step_node_inx = -1;
 
951
        bool pick_step_cores = true;
 
952
 
 
953
        xassert(select_ptr);
 
954
        xassert(select_ptr->core_bitmap);
 
955
        xassert(select_ptr->core_bitmap_used);
 
956
        xassert(select_ptr->cpus);
 
957
        xassert(select_ptr->cpus_used);
762
958
 
763
959
        if (step_ptr->step_layout == NULL)      /* batch step */
764
960
                return;
765
961
 
766
 
        i_first = bit_ffs(job_ptr->node_bitmap);
767
 
        i_last  = bit_fls(job_ptr->node_bitmap);
 
962
        i_first = bit_ffs(select_ptr->node_bitmap);
 
963
        i_last  = bit_fls(select_ptr->node_bitmap);
768
964
        if (i_first == -1)      /* empty bitmap */
769
965
                return;
 
966
 
 
967
        if (step_ptr->core_bitmap_job) {
 
968
                /* "scontrol reconfig" of live system */
 
969
                pick_step_cores = false;
 
970
        } else if ((step_ptr->exclusive == 0) ||
 
971
                   (step_ptr->cpu_count == job_ptr->total_procs)) {
 
972
                /* Step uses all of job's cores
 
973
                 * Just copy the bitmap to save time */
 
974
                step_ptr->core_bitmap_job = bit_copy(select_ptr->core_bitmap);
 
975
                pick_step_cores = false;
 
976
        }
 
977
 
 
978
        if (step_ptr->mem_per_task &&
 
979
            ((select_ptr->memory_allocated == NULL) ||
 
980
             (select_ptr->memory_used == NULL))) {
 
981
                error("step_alloc_lps: lack memory allocation details "
 
982
                      "to enforce memory limits for job %u", job_ptr->job_id);
 
983
                step_ptr->mem_per_task = 0;
 
984
        }
 
985
 
770
986
        for (i_node = i_first; i_node <= i_last; i_node++) {
771
 
                if (!bit_test(job_ptr->node_bitmap, i_node))
 
987
                if (!bit_test(select_ptr->node_bitmap, i_node))
772
988
                        continue;
773
989
                job_node_inx++;
774
990
                if (!bit_test(step_ptr->step_node_bitmap, i_node))
775
991
                        continue;
776
992
                step_node_inx++;
777
 
                if (step_ptr->cpus_per_task) {
778
 
                        job_ptr->used_lps[job_node_inx] +=
779
 
                                step_ptr->step_layout->tasks[step_node_inx];
780
 
                }
781
 
#if 0
782
 
                info("step alloc of %s procs: %u of %u", 
783
 
                        node_record_table_ptr[i_node].name,
784
 
                        job_ptr->used_lps[job_node_inx],
785
 
                        job_ptr->alloc_lps[job_node_inx]);
786
 
#endif
 
993
                if (job_node_inx >= select_ptr->nhosts)
 
994
                        fatal("step_alloc_lps: node index bad");
 
995
                /* NOTE: The --overcommit option can result in
 
996
                 * cpus_used[] having a higher value than cpus[] */
 
997
                cpus_alloc = step_ptr->step_layout->tasks[step_node_inx] *
 
998
                             step_ptr->cpus_per_task;
 
999
                select_ptr->cpus_used[job_node_inx] += cpus_alloc; 
 
1000
                if (step_ptr->mem_per_task) {
 
1001
                        select_ptr->memory_used[job_node_inx] += 
 
1002
                                (step_ptr->mem_per_task *
 
1003
                                 step_ptr->step_layout->tasks[step_node_inx]);
 
1004
                }
 
1005
                if (pick_step_cores) {
 
1006
                        _pick_step_cores(step_ptr, select_ptr, 
 
1007
                                         job_node_inx,
 
1008
                                         step_ptr->step_layout->
 
1009
                                         tasks[step_node_inx]);
 
1010
                }
 
1011
                if (slurm_get_debug_flags() & DEBUG_FLAG_CPU_BIND)
 
1012
                        _dump_step_layout(step_ptr);
 
1013
                if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) {
 
1014
                        info("step alloc of %s procs: %u of %u", 
 
1015
                             node_record_table_ptr[i_node].name,
 
1016
                             select_ptr->cpus_used[job_node_inx],
 
1017
                             select_ptr->cpus[job_node_inx]);
 
1018
                }
787
1019
                if (step_node_inx == (step_ptr->step_layout->node_cnt - 1))
788
1020
                        break;
789
1021
        }
790
1022
        
791
1023
}
792
1024
 
 
1025
/* Dump a job step's CPU binding information.
 
1026
 * NOTE: The core_bitmap_job and node index are based upon 
 
1027
 * the _job_ allocation */
 
1028
static void _dump_step_layout(struct step_record *step_ptr)
 
1029
{
 
1030
        struct job_record* job_ptr = step_ptr->job_ptr;
 
1031
        select_job_res_t select_ptr = job_ptr->select_job;
 
1032
        int i, bit_inx, core_inx, node_inx, rep, sock_inx;
 
1033
 
 
1034
        if ((step_ptr->core_bitmap_job == NULL) ||
 
1035
            (select_ptr == NULL) || (select_ptr->cores_per_socket == NULL))
 
1036
                return;
 
1037
 
 
1038
        info("====================");
 
1039
        info("step_id:%u.%u", job_ptr->job_id, step_ptr->step_id);
 
1040
        for (i=0, bit_inx= 0, node_inx=0; node_inx<select_ptr->nhosts; i++) {
 
1041
                for (rep=0; rep<select_ptr->sock_core_rep_count[i]; rep++) {
 
1042
                        for (sock_inx=0; 
 
1043
                             sock_inx<select_ptr->sockets_per_node[i]; 
 
1044
                             sock_inx++) {
 
1045
                                for (core_inx=0; 
 
1046
                                     core_inx<select_ptr->cores_per_socket[i]; 
 
1047
                                     core_inx++) {
 
1048
                                        if (bit_test(step_ptr->
 
1049
                                                     core_bitmap_job, 
 
1050
                                                     bit_inx++)) {
 
1051
                                                info("JobNode[%d] Socket[%d] "
 
1052
                                                     "Core[%d] is allocated",
 
1053
                                                     node_inx, sock_inx, 
 
1054
                                                     core_inx);
 
1055
                                        }
 
1056
                                }
 
1057
                        }
 
1058
                        node_inx++;
 
1059
                }
 
1060
        }
 
1061
        info("====================");
 
1062
}
 
1063
 
793
1064
static void _step_dealloc_lps(struct step_record *step_ptr)
794
1065
{
795
1066
        struct job_record  *job_ptr = step_ptr->job_ptr;
 
1067
        select_job_res_t select_ptr = job_ptr->select_job;
 
1068
        int cpus_alloc;
796
1069
        int i_node, i_first, i_last;
797
1070
        int job_node_inx = -1, step_node_inx = -1;
798
1071
 
 
1072
        xassert(select_ptr);
 
1073
        xassert(select_ptr->core_bitmap);
 
1074
        xassert(select_ptr->core_bitmap_used);
 
1075
        xassert(select_ptr->cpus);
 
1076
        xassert(select_ptr->cpus_used);
 
1077
 
799
1078
        if (step_ptr->step_layout == NULL)      /* batch step */
800
1079
                return;
801
1080
 
802
 
        i_first = bit_ffs(job_ptr->node_bitmap);
803
 
        i_last  = bit_fls(job_ptr->node_bitmap);
 
1081
        i_first = bit_ffs(select_ptr->node_bitmap);
 
1082
        i_last  = bit_fls(select_ptr->node_bitmap);
804
1083
        if (i_first == -1)      /* empty bitmap */
805
1084
                return;
 
1085
 
 
1086
        if (step_ptr->mem_per_task &&
 
1087
            ((select_ptr->memory_allocated == NULL) ||
 
1088
             (select_ptr->memory_used == NULL))) {
 
1089
                error("_step_dealloc_lps: lack memory allocation details "
 
1090
                      "to enforce memory limits for job %u", job_ptr->job_id);
 
1091
                step_ptr->mem_per_task = 0;
 
1092
        }
 
1093
 
806
1094
        for (i_node = i_first; i_node <= i_last; i_node++) {
807
 
                if (!bit_test(job_ptr->node_bitmap, i_node))
 
1095
                if (!bit_test(select_ptr->node_bitmap, i_node))
808
1096
                        continue;
809
1097
                job_node_inx++;
810
1098
                if (!bit_test(step_ptr->step_node_bitmap, i_node))
811
1099
                        continue;
812
1100
                step_node_inx++;
813
 
                if (step_ptr->cpus_per_task == 0)
814
 
                        ;       /* no CPUs allocated */
815
 
                else if (job_ptr->used_lps[job_node_inx] >=
816
 
                    step_ptr->step_layout->tasks[step_node_inx]) {
817
 
                        job_ptr->used_lps[job_node_inx] -= 
818
 
                                step_ptr->step_layout->tasks[step_node_inx];
819
 
                } else {
820
 
                        error("_step_dealloc_lps: underflow for %u.%u",
 
1101
                if (job_node_inx >= select_ptr->nhosts)
 
1102
                        fatal("_step_dealloc_lps: node index bad");
 
1103
                cpus_alloc = step_ptr->step_layout->tasks[step_node_inx] *
 
1104
                             step_ptr->cpus_per_task;
 
1105
                if (select_ptr->cpus_used[job_node_inx] >= cpus_alloc)
 
1106
                        select_ptr->cpus_used[job_node_inx] -= cpus_alloc;
 
1107
                else {
 
1108
                        error("_step_dealloc_lps: cpu underflow for %u.%u",
821
1109
                                job_ptr->job_id, step_ptr->step_id);
822
 
                        job_ptr->used_lps[job_node_inx] = 0;
823
 
                }
824
 
#if 0
825
 
                info("step dealloc of %s procs: %u of %u", 
826
 
                        node_record_table_ptr[i_node].name,
827
 
                        job_ptr->used_lps[job_node_inx],
828
 
                        job_ptr->alloc_lps[job_node_inx]);
829
 
#endif
 
1110
                        select_ptr->cpus_used[job_node_inx] = 0;
 
1111
                }
 
1112
                if (step_ptr->mem_per_task) {
 
1113
                        uint32_t mem_use = step_ptr->mem_per_task *
 
1114
                                           step_ptr->step_layout->
 
1115
                                           tasks[step_node_inx];
 
1116
                        if (select_ptr->memory_used[job_node_inx] >= mem_use) {
 
1117
                                select_ptr->memory_used[job_node_inx] -= 
 
1118
                                                mem_use;
 
1119
                        } else {
 
1120
                                error("_step_dealloc_lps: "
 
1121
                                      "mem underflow for %u.%u",
 
1122
                                      job_ptr->job_id, step_ptr->step_id);
 
1123
                                select_ptr->memory_used[job_node_inx] = 0;
 
1124
                        }
 
1125
                }
 
1126
                if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) {
 
1127
                        info("step dealloc of %s procs: %u of %u", 
 
1128
                             node_record_table_ptr[i_node].name,
 
1129
                             select_ptr->cpus_used[job_node_inx],
 
1130
                             select_ptr->cpus[job_node_inx]);
 
1131
                }
830
1132
                if (step_node_inx == (step_ptr->step_layout->node_cnt - 1))
831
1133
                        break;
832
1134
        }
833
 
        
 
1135
        if (step_ptr->core_bitmap_job) {
 
1136
                /* Mark the job's cores as no longer in use */
 
1137
                bit_not(step_ptr->core_bitmap_job);
 
1138
                bit_and(select_ptr->core_bitmap_used,
 
1139
                        step_ptr->core_bitmap_job);
 
1140
                /* no need for bit_not(step_ptr->core_bitmap_job); */
 
1141
                FREE_NULL_BITMAP(step_ptr->core_bitmap_job);
 
1142
        }
834
1143
}
835
1144
 
836
1145
/*
852
1161
        struct step_record *step_ptr;
853
1162
        struct job_record  *job_ptr;
854
1163
        bitstr_t *nodeset;
855
 
        int cpus_per_task, node_count, ret_code;
 
1164
        int cpus_per_task, node_count, ret_code, i;
856
1165
        time_t now = time(NULL);
857
1166
        char *step_node_list = NULL;
 
1167
        uint32_t orig_cpu_count;
858
1168
 
859
1169
        *new_step_record = NULL;
860
1170
        job_ptr = find_job_record (step_specs->job_id);
890
1200
            (job_ptr->end_time <= time(NULL)))
891
1201
                return ESLURM_ALREADY_DONE;
892
1202
 
893
 
        if (job_ptr->details->job_min_memory) {
894
 
                /* use memory reserved by job, no limit on steps */
895
 
                step_specs->mem_per_task = 0;
896
 
        } else if (step_specs->mem_per_task) {
897
 
                if (slurmctld_conf.max_mem_per_task &&
898
 
                    (step_specs->mem_per_task > 
899
 
                     slurmctld_conf.max_mem_per_task))
900
 
                        return ESLURM_INVALID_TASK_MEMORY;
901
 
        } else
902
 
                step_specs->mem_per_task = slurmctld_conf.def_mem_per_task;
903
 
 
904
1203
        if ((step_specs->task_dist != SLURM_DIST_CYCLIC) &&
905
1204
            (step_specs->task_dist != SLURM_DIST_BLOCK) &&
906
1205
            (step_specs->task_dist != SLURM_DIST_CYCLIC_CYCLIC) &&
911
1210
            (step_specs->task_dist != SLURM_DIST_ARBITRARY))
912
1211
                return ESLURM_BAD_DIST;
913
1212
 
914
 
        if (step_specs->task_dist == SLURM_DIST_ARBITRARY
915
 
            && (!strcmp(slurmctld_conf.switch_type, "switch/elan"))) {
 
1213
        if ((step_specs->task_dist == SLURM_DIST_ARBITRARY) &&
 
1214
            (!strcmp(slurmctld_conf.switch_type, "switch/elan"))) {
916
1215
                return ESLURM_TASKDIST_ARBITRARY_UNSUPPORTED;
917
1216
        }
918
1217
 
924
1223
             (strlen(step_specs->network)   > MAX_STR_LEN)) ||
925
1224
            (step_specs->name      && 
926
1225
             (strlen(step_specs->name)      > MAX_STR_LEN)) ||
927
 
            (step_specs->ckpt_path && 
928
 
             (strlen(step_specs->ckpt_path) > MAX_STR_LEN)))
 
1226
            (step_specs->ckpt_dir && 
 
1227
             (strlen(step_specs->ckpt_dir) > MAX_STR_LEN)))
929
1228
                return ESLURM_PATHNAME_TOO_LONG;
930
1229
 
931
 
        /* we can figure out the cpus_per_task here by reversing what happens
932
 
         * in srun, record argument, plus save/restore in slurm v1.4 */
 
1230
        /* if the overcommit flag is checked, we 0 set cpu_count=0
 
1231
         * which makes it so we don't check to see the available cpus
 
1232
         */
 
1233
        orig_cpu_count =  step_specs->cpu_count;
 
1234
        if (step_specs->overcommit) {
 
1235
                if (step_specs->exclusive) {
 
1236
                        /* Not really a legitimate combination, try to 
 
1237
                         * exclusively allocate one CPU per task */
 
1238
                        step_specs->overcommit = 0;
 
1239
                        step_specs->cpu_count = step_specs->num_tasks;
 
1240
                } else
 
1241
                        step_specs->cpu_count = 0;
 
1242
        }
 
1243
 
 
1244
        /* determine cpus_per_task value by reversing what srun does */
 
1245
        if (step_specs->num_tasks < 1)
 
1246
                return ESLURM_BAD_TASK_COUNT;
933
1247
        if (step_specs->cpu_count == 0)
934
1248
                cpus_per_task = 0;
935
 
        else if (step_specs->num_tasks < 1)
936
 
                cpus_per_task = 1;
937
1249
        else {
938
1250
                cpus_per_task = step_specs->cpu_count / step_specs->num_tasks;
939
1251
                if (cpus_per_task < 1)
940
1252
                        cpus_per_task = 1;
941
1253
        }
942
1254
 
943
 
        /* if the overcommit flag is checked we 0 out the cpu_count
944
 
         * which makes it so we don't check to see the available cpus
945
 
         */      
946
 
        if (step_specs->overcommit)
947
 
                step_specs->cpu_count = 0;
 
1255
        if (step_specs->no_kill > 1)
 
1256
                step_specs->no_kill = 1;
948
1257
 
949
1258
        if (job_ptr->kill_on_step_done)
950
1259
                /* Don't start more steps, job already being cancelled */
952
1261
        job_ptr->kill_on_step_done = kill_job_when_step_done;
953
1262
 
954
1263
        job_ptr->time_last_active = now;
955
 
        nodeset = _pick_step_nodes(job_ptr, step_specs, batch_step, &ret_code);
 
1264
        nodeset = _pick_step_nodes(job_ptr, step_specs,
 
1265
                                   cpus_per_task, batch_step, &ret_code);
956
1266
        if (nodeset == NULL)
957
1267
                return ret_code;
958
1268
        node_count = bit_set_count(nodeset);
964
1274
                        step_specs->num_tasks = node_count;
965
1275
        }
966
1276
        
967
 
        if ((step_specs->num_tasks < 1)
968
 
        ||  (step_specs->num_tasks > (node_count*MAX_TASKS_PER_NODE))) {
 
1277
        if (step_specs->num_tasks > (node_count*MAX_TASKS_PER_NODE)) {
969
1278
                error("step has invalid task count: %u", 
970
1279
                      step_specs->num_tasks);
971
1280
                bit_free(nodeset);
972
1281
                return ESLURM_BAD_TASK_COUNT;
973
1282
        }
974
1283
 
975
 
        step_ptr = create_step_record (job_ptr);
976
 
        if (step_ptr == NULL)
977
 
                fatal ("create_step_record failed with no memory");
 
1284
        step_ptr = _create_step_record (job_ptr);
 
1285
        if (step_ptr == NULL) {
 
1286
                bit_free(nodeset);
 
1287
                return ESLURMD_TOOMANYSTEPS;
 
1288
        }
 
1289
        step_ptr->step_id = job_ptr->next_step_id++;
978
1290
 
979
1291
        /* set the step_record values */
980
1292
 
989
1301
                xfree(step_specs->node_list);
990
1302
                step_specs->node_list = xstrdup(step_node_list);
991
1303
        }
992
 
#if STEP_DEBUG
993
 
        info("got %s and %s looking for %d nodes", step_node_list,
994
 
             step_specs->node_list, step_specs->node_count);
995
 
#endif
 
1304
        if (slurm_get_debug_flags() & DEBUG_FLAG_STEPS) {
 
1305
                verbose("got %s and %s looking for %d nodes", step_node_list,
 
1306
                        step_specs->node_list, step_specs->node_count);
 
1307
        }
996
1308
        step_ptr->step_node_bitmap = nodeset;
997
1309
        
998
1310
        switch(step_specs->task_dist) {
1013
1325
        step_ptr->mem_per_task = step_specs->mem_per_task;
1014
1326
        step_ptr->ckpt_interval = step_specs->ckpt_interval;
1015
1327
        step_ptr->ckpt_time = now;
 
1328
        step_ptr->cpu_count = orig_cpu_count;
1016
1329
        step_ptr->exit_code = NO_VAL;
1017
1330
        step_ptr->exclusive = step_specs->exclusive;
1018
 
        step_ptr->ckpt_path = xstrdup(step_specs->ckpt_path);
 
1331
        step_ptr->ckpt_dir  = xstrdup(step_specs->ckpt_dir);
 
1332
        step_ptr->no_kill   = step_specs->no_kill;
1019
1333
 
1020
1334
        /* step's name and network default to job's values if not 
1021
1335
         * specified in the step specification */
1038
1352
                                           (uint16_t)cpus_per_task,
1039
1353
                                           step_specs->task_dist,
1040
1354
                                           step_specs->plane_size);
1041
 
                if (!step_ptr->step_layout)
 
1355
                xfree(step_node_list);
 
1356
                if (!step_ptr->step_layout) {
 
1357
                        delete_step_record (job_ptr, step_ptr->step_id);
1042
1358
                        return SLURM_ERROR;
 
1359
                }
 
1360
 
 
1361
                if ((step_specs->resv_port_cnt != (uint16_t) NO_VAL) &&
 
1362
                    (step_specs->resv_port_cnt == 0)) {
 
1363
                        /* reserved port count set to maximum task count on 
 
1364
                         * any node plus one */
 
1365
                        for (i=0; i<step_ptr->step_layout->node_cnt; i++) {
 
1366
                                step_specs->resv_port_cnt = 
 
1367
                                        MAX(step_specs->resv_port_cnt,
 
1368
                                            step_ptr->step_layout->tasks[i]);
 
1369
                        }
 
1370
                        step_specs->resv_port_cnt++;
 
1371
                }
 
1372
                if (step_specs->resv_port_cnt != (uint16_t) NO_VAL) {
 
1373
                        step_ptr->resv_port_cnt = step_specs->resv_port_cnt;
 
1374
                        i = resv_port_alloc(step_ptr);
 
1375
                        if (i != SLURM_SUCCESS) {
 
1376
                                delete_step_record (job_ptr, step_ptr->step_id);
 
1377
                                return i;
 
1378
                        }
 
1379
                }
 
1380
 
1043
1381
                if (switch_alloc_jobinfo (&step_ptr->switch_job) < 0)
1044
1382
                        fatal ("step_create: switch_alloc_jobinfo error");
1045
1383
                
1053
1391
                        return ESLURM_INTERCONNECT_FAILURE;
1054
1392
                }
1055
1393
                step_alloc_lps(step_ptr);
1056
 
        }
 
1394
        } else
 
1395
                xfree(step_node_list);
1057
1396
        if (checkpoint_alloc_jobinfo (&step_ptr->check_job) < 0)
1058
1397
                fatal ("step_create: checkpoint_alloc_jobinfo error");
1059
 
        xfree(step_node_list);
1060
 
        if (step_ptr->mem_per_task &&
1061
 
            (select_g_step_begin(step_ptr) != SLURM_SUCCESS)) {
1062
 
                error("No memory to allocate step for job %u", job_ptr->job_id);
1063
 
                step_ptr->mem_per_task = 0;     /* no memory to be freed */
1064
 
                delete_step_record (job_ptr, step_ptr->step_id);
1065
 
                return ESLURM_INVALID_TASK_MEMORY;
1066
 
        }
1067
1398
        *new_step_record = step_ptr;
1068
1399
        jobacct_storage_g_step_start(acct_db_conn, step_ptr);
1069
1400
        return SLURM_SUCCESS;
1071
1402
 
1072
1403
extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr,
1073
1404
                                               char *step_node_list,
1074
 
                                               uint16_t node_count,
 
1405
                                               uint32_t node_count,
1075
1406
                                               uint32_t num_tasks,
1076
1407
                                               uint16_t cpus_per_task,
1077
1408
                                               uint16_t task_dist,
1078
1409
                                               uint32_t plane_size)
1079
1410
{
1080
 
        uint32_t cpus_per_node[node_count];
 
1411
        uint16_t cpus_per_node[node_count];
1081
1412
        uint32_t cpu_count_reps[node_count];
1082
1413
        int cpu_inx = -1;
1083
 
        int usable_cpus = 0, i;
1084
 
        int set_nodes = 0, set_cpus = 0;
 
1414
        int i, usable_cpus, usable_mem;
 
1415
        int set_cpus = 0, set_nodes = 0, set_tasks = 0;
1085
1416
        int pos = -1;
 
1417
        int first_bit, last_bit;
1086
1418
        struct job_record *job_ptr = step_ptr->job_ptr;
 
1419
        select_job_res_t select_ptr = job_ptr->select_job;
 
1420
 
 
1421
        xassert(select_ptr);
 
1422
        xassert(select_ptr->cpus);
 
1423
        xassert(select_ptr->cpus_used);
 
1424
 
 
1425
        if (step_ptr->mem_per_task &&
 
1426
            ((select_ptr->memory_allocated == NULL) ||
 
1427
             (select_ptr->memory_used == NULL))) {
 
1428
                error("step_layout_create: lack memory allocation details "
 
1429
                      "to enforce memory limits for job %u", job_ptr->job_id);
 
1430
                step_ptr->mem_per_task = 0;
 
1431
        }
1087
1432
 
1088
1433
        /* build the cpus-per-node arrays for the subset of nodes
1089
 
           used by this job step */
1090
 
        for (i = 0; i < node_record_count; i++) {
 
1434
         * used by this job step */
 
1435
        first_bit = bit_ffs(step_ptr->step_node_bitmap);
 
1436
        last_bit  = bit_fls(step_ptr->step_node_bitmap);
 
1437
        for (i = first_bit; i <= last_bit; i++) {
1091
1438
                if (bit_test(step_ptr->step_node_bitmap, i)) {
1092
1439
                        /* find out the position in the job */
1093
 
                        pos = bit_get_pos_num(job_ptr->node_bitmap, i);
 
1440
                        pos = bit_get_pos_num(select_ptr->node_bitmap, i);
1094
1441
                        if (pos == -1)
1095
1442
                                return NULL;
 
1443
                        if (pos >= select_ptr->nhosts)
 
1444
                                fatal("step_layout_create: node index bad");
1096
1445
                        if (step_ptr->exclusive) {
1097
 
                                usable_cpus = job_ptr->alloc_lps[pos] -
1098
 
                                              job_ptr->used_lps[pos];
1099
 
                                if (usable_cpus < 0) {
1100
 
                                        error("step_layout_create exclusive");
1101
 
                                        return NULL;
1102
 
                                }
1103
 
                                usable_cpus = MIN(usable_cpus, 
1104
 
                                                  (num_tasks - set_cpus));
 
1446
                                usable_cpus = select_ptr->cpus[pos] -
 
1447
                                              select_ptr->cpus_used[pos];
1105
1448
                        } else
1106
 
                                usable_cpus = job_ptr->alloc_lps[pos];
 
1449
                                usable_cpus = select_ptr->cpus[pos];
 
1450
                        if (step_ptr->mem_per_task) {
 
1451
                                usable_mem = select_ptr->memory_allocated[pos] -
 
1452
                                             select_ptr->memory_used[pos];
 
1453
                                usable_mem /= step_ptr->mem_per_task;
 
1454
                                if (cpus_per_task > 0)
 
1455
                                        usable_mem *= cpus_per_task;
 
1456
                                usable_cpus = MIN(usable_cpus, usable_mem);
 
1457
                        }
 
1458
                        if (usable_cpus <= 0) {
 
1459
                                error("step_layout_create no usable cpus");
 
1460
                                return NULL;
 
1461
                        }
1107
1462
                        debug3("step_layout cpus = %d pos = %d", 
1108
1463
                               usable_cpus, pos);
1109
1464
                        
1117
1472
                                cpu_count_reps[cpu_inx]++;
1118
1473
                        set_nodes++;
1119
1474
                        set_cpus += usable_cpus;
 
1475
                        if (cpus_per_task > 0)
 
1476
                                set_tasks += usable_cpus / cpus_per_task;
 
1477
                        else
 
1478
                                set_tasks = num_tasks;
1120
1479
                        if (set_nodes == node_count)
1121
1480
                                break;
1122
1481
                }
1123
1482
        }
1124
 
        
 
1483
 
1125
1484
        /* layout the tasks on the nodes */
1126
1485
        return slurm_step_layout_create(step_node_list,
1127
1486
                                        cpus_per_node, cpu_count_reps, 
1149
1508
                node_list = step_ptr->job_ptr->nodes;   
1150
1509
        }
1151
1510
        pack32(step_ptr->job_ptr->job_id, buffer);
1152
 
        pack16(step_ptr->step_id, buffer);
 
1511
        pack32(step_ptr->step_id, buffer);
1153
1512
        pack16(step_ptr->ckpt_interval, buffer);
1154
1513
        pack32(step_ptr->job_ptr->user_id, buffer);
1155
1514
        pack32(task_cnt, buffer);
1165
1524
        }
1166
1525
        pack_time(run_time, buffer);
1167
1526
        packstr(step_ptr->job_ptr->partition, buffer);
 
1527
        packstr(step_ptr->resv_ports, buffer);
1168
1528
        packstr(node_list, buffer);
1169
1529
        packstr(step_ptr->name, buffer);
1170
1530
        packstr(step_ptr->network, buffer);
1171
1531
        pack_bit_fmt(step_ptr->step_node_bitmap, buffer);
1172
 
        packstr(step_ptr->ckpt_path, buffer);
 
1532
        packstr(step_ptr->ckpt_dir, buffer);
1173
1533
        
1174
1534
}
1175
1535
 
1205
1565
                while ((job_ptr = 
1206
1566
                                (struct job_record *) 
1207
1567
                                list_next(job_iterator))) {
1208
 
                        if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
 
1568
                        if (((show_flags & SHOW_ALL) == 0) &&
1209
1569
                            (job_ptr->part_ptr) && 
1210
1570
                            (job_ptr->part_ptr->hidden))
1211
1571
                                continue;
1227
1587
                }
1228
1588
                list_iterator_destroy(job_iterator);
1229
1589
 
1230
 
        } else if (step_id == 0) {
1231
 
                /* Return all steps for specific job_id */
 
1590
        } else {
1232
1591
                job_ptr = find_job_record(job_id);
1233
 
                if (((show_flags & SHOW_ALL) == 0) && 
1234
 
                    (job_ptr->part_ptr) && 
1235
 
                    (job_ptr->part_ptr->hidden))
1236
 
                        job_ptr = NULL;
1237
 
                else if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS)
1238
 
                &&  (job_ptr->user_id != uid) && !validate_super_user(uid))
1239
 
                        job_ptr = NULL;
 
1592
                /* first lets filter this step based on permission and
 
1593
                   request if not allowable set job_ptr = NULL */
 
1594
                if(job_ptr) {
 
1595
                        if (((show_flags & SHOW_ALL) == 0) 
 
1596
                            &&  (job_ptr->part_ptr) 
 
1597
                            &&  (job_ptr->part_ptr->hidden))
 
1598
                                job_ptr = NULL;
 
1599
                        else if ((slurmctld_conf.private_data 
 
1600
                                  & PRIVATE_DATA_JOBS)
 
1601
                                 && (job_ptr->user_id != uid)
 
1602
                                 && !validate_super_user(uid))
 
1603
                                job_ptr = NULL;
 
1604
                }
1240
1605
 
 
1606
                /* now send the requested steps */
1241
1607
                if (job_ptr) {
1242
1608
                        step_iterator = 
1243
1609
                                list_iterator_create(job_ptr->step_list);
1244
 
                        while ((step_ptr =
1245
 
                                        (struct step_record *)
1246
 
                                        list_next(step_iterator))) {
1247
 
                                _pack_ctld_job_step_info(step_ptr, buffer);
1248
 
                                steps_packed++;
 
1610
                        /* If step_id is 0 that means to send all
 
1611
                           steps (We understand this is incorrect
 
1612
                           since 0 is a valid job step,
 
1613
                           but changing it would need to be done in
 
1614
                           the api and so we wait until 2.1 */
 
1615
                        while ((step_ptr = list_next(step_iterator))) {
 
1616
                                if ((step_id == 0) 
 
1617
                                    || (step_ptr->step_id == step_id)) {
 
1618
                                        _pack_ctld_job_step_info(
 
1619
                                                step_ptr, buffer);
 
1620
                                        steps_packed++;
 
1621
                                }
1249
1622
                        }
1250
1623
                        list_iterator_destroy(step_iterator);
 
1624
 
 
1625
                        if(!steps_packed)
 
1626
                                error_code = ESLURM_INVALID_JOB_ID;
1251
1627
                } else
1252
1628
                        error_code = ESLURM_INVALID_JOB_ID;
1253
 
        } else {
1254
 
                /* Return data for specific job_id.step_id */
1255
 
                job_ptr = find_job_record(job_id);
1256
 
                if (((show_flags & SHOW_ALL) == 0) 
1257
 
                &&  (job_ptr != NULL)
1258
 
                &&  (job_ptr->part_ptr) 
1259
 
                &&  (job_ptr->part_ptr->hidden))
1260
 
                        job_ptr = NULL;
1261
 
                else if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS)
1262
 
                &&  (job_ptr->user_id != uid) && !validate_super_user(uid))
1263
 
                        job_ptr = NULL;
1264
 
 
1265
 
                step_ptr = find_step_record(job_ptr, step_id);
1266
 
                if (step_ptr == NULL)
1267
 
                        error_code = ESLURM_INVALID_JOB_ID;
1268
 
                else {
1269
 
                        _pack_ctld_job_step_info(step_ptr, buffer);
1270
 
                        steps_packed++;
1271
 
                }
1272
 
        }
 
1629
        } 
1273
1630
        part_filter_clear();
1274
1631
 
1275
1632
        /* put the real record count in the message body header */
1283
1640
}
1284
1641
 
1285
1642
/* 
1286
 
 * step_on_node - determine if the specified job has any job steps allocated to 
1287
 
 *      the specified node 
 
1643
 * kill_step_on_node - determine if the specified job has any job steps
 
1644
 *      allocated to the specified node and kill them unless no_kill flag
 
1645
 *      is set on the step
1288
1646
 * IN job_ptr - pointer to an active job record
1289
1647
 * IN node_ptr - pointer to a node record
1290
 
 * RET true of job has step on the node, false otherwise 
 
1648
 * RET count of killed job steps
1291
1649
 */
1292
 
bool step_on_node(struct job_record  *job_ptr, struct node_record *node_ptr)
 
1650
extern int kill_step_on_node(struct job_record  *job_ptr, 
 
1651
                             struct node_record *node_ptr)
1293
1652
{
1294
1653
        ListIterator step_iterator;
1295
1654
        struct step_record *step_ptr;
1296
 
        bool found = false;
 
1655
        int found = 0;
1297
1656
        int bit_position;
1298
1657
 
1299
1658
        if ((job_ptr == NULL) || (node_ptr == NULL))
1300
 
                return false;
 
1659
                return found;
1301
1660
 
1302
1661
        bit_position = node_ptr - node_record_table_ptr;
1303
1662
        step_iterator = list_iterator_create (job_ptr->step_list);      
1304
1663
        while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
1305
 
                if (bit_test(step_ptr->step_node_bitmap, bit_position)) {
1306
 
                        found = true;
1307
 
                        break;
1308
 
                }
 
1664
                if (step_ptr->no_kill ||
 
1665
                    (bit_test(step_ptr->step_node_bitmap, bit_position) == 0))
 
1666
                        continue;
 
1667
                info("killing step %u.%u on down node %s", 
 
1668
                     job_ptr->job_id, step_ptr->step_id, node_ptr->name);
 
1669
                srun_step_complete(step_ptr);
 
1670
                signal_step_tasks(step_ptr, SIGKILL);
 
1671
                found++;
1309
1672
        }               
1310
1673
 
1311
1674
        list_iterator_destroy (step_iterator);
1353
1716
                goto reply;
1354
1717
        }
1355
1718
 
1356
 
        bzero((void *)&resp_data, sizeof(checkpoint_resp_msg_t));
1357
 
        /* find the individual job step */
1358
 
        if (ckpt_ptr->step_id != NO_VAL) {
1359
 
                step_ptr = find_step_record(job_ptr, ckpt_ptr->step_id);
1360
 
                if (step_ptr == NULL) {
1361
 
                        rc = ESLURM_INVALID_JOB_ID;
1362
 
                        goto reply;
1363
 
                } else {
1364
 
                        rc = checkpoint_op(ckpt_ptr->op, ckpt_ptr->data, 
1365
 
                                (void *)step_ptr, &resp_data.event_time, 
1366
 
                                &resp_data.error_code, &resp_data.error_msg);
1367
 
                        last_job_update = time(NULL);
1368
 
                }
1369
 
        }
1370
 
 
1371
 
        /* operate on all of a job's steps */
1372
 
        else {
1373
 
                int update_rc = -2;
1374
 
                ListIterator step_iterator;
1375
 
 
1376
 
                step_iterator = list_iterator_create (job_ptr->step_list);
1377
 
                while ((step_ptr = (struct step_record *) 
1378
 
                                        list_next (step_iterator))) {
1379
 
                        update_rc = checkpoint_op(ckpt_ptr->op, 
1380
 
                                                  ckpt_ptr->data,
1381
 
                                                  (void *)step_ptr,
1382
 
                                                  &resp_data.event_time,
1383
 
                                                  &resp_data.error_code,
1384
 
                                                  &resp_data.error_msg);
1385
 
                        rc = MAX(rc, update_rc);
1386
 
                }
1387
 
                if (update_rc != -2)    /* some work done */
1388
 
                        last_job_update = time(NULL);
1389
 
                list_iterator_destroy (step_iterator);
 
1719
        memset((void *)&resp_data, 0, sizeof(checkpoint_resp_msg_t));
 
1720
        step_ptr = find_step_record(job_ptr, ckpt_ptr->step_id);
 
1721
        if (step_ptr == NULL) {
 
1722
                rc = ESLURM_INVALID_JOB_ID;
 
1723
        } else {
 
1724
                if (ckpt_ptr->image_dir == NULL) {
 
1725
                        ckpt_ptr->image_dir = xstrdup(step_ptr->ckpt_dir);
 
1726
                }
 
1727
                xstrfmtcat(ckpt_ptr->image_dir, "/%u.%u", job_ptr->job_id, 
 
1728
                           step_ptr->step_id);
 
1729
 
 
1730
                rc = checkpoint_op(ckpt_ptr->job_id, ckpt_ptr->step_id, 
 
1731
                                   step_ptr, ckpt_ptr->op, ckpt_ptr->data,
 
1732
                                   ckpt_ptr->image_dir, &resp_data.event_time, 
 
1733
                                   &resp_data.error_code, 
 
1734
                                   &resp_data.error_msg);
 
1735
                last_job_update = time(NULL);
1390
1736
        }
1391
1737
 
1392
1738
    reply:
1520
1866
 * step_partial_comp - Note the completion of a job step on at least
1521
1867
 *      some of its nodes
1522
1868
 * IN req     - step_completion_msg RPC from slurmstepd
 
1869
 * IN uid     - UID issuing the request
1523
1870
 * OUT rem    - count of nodes for which responses are still pending
1524
1871
 * OUT max_rc - highest return code for any step thus far
1525
1872
 * RET 0 on success, otherwise ESLURM error code
1526
1873
 */
1527
 
extern int step_partial_comp(step_complete_msg_t *req, int *rem, 
1528
 
                             uint32_t *max_rc)
 
1874
extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, 
 
1875
                             int *rem, uint32_t *max_rc)
1529
1876
{
1530
1877
        struct job_record *job_ptr;
1531
1878
        struct step_record *step_ptr;
1533
1880
 
1534
1881
        /* find the job, step, and validate input */
1535
1882
        job_ptr = find_job_record (req->job_id);
1536
 
        if (job_ptr == NULL)
 
1883
        if (job_ptr == NULL) {
 
1884
                info("step_partial_comp: JobID=%u invalid", req->job_id);
1537
1885
                return ESLURM_INVALID_JOB_ID;
1538
 
        if (job_ptr->job_state == JOB_PENDING)
 
1886
        }
 
1887
        if (job_ptr->job_state == JOB_PENDING) {
 
1888
                info("step_partial_comp: JobID=%u pending", req->job_id);
1539
1889
                return ESLURM_JOB_PENDING;
 
1890
        }
 
1891
 
 
1892
        if ((!validate_super_user(uid)) && (uid != job_ptr->user_id)) {
 
1893
                /* Normally from slurmstepd, from srun on some failures */
 
1894
                error("Security violation: "
 
1895
                      "REQUEST_STEP_COMPLETE RPC for job %u from uid=%u",
 
1896
                      job_ptr->job_id, (unsigned int) uid);
 
1897
                return ESLURM_USER_ID_MISSING;
 
1898
        }
 
1899
 
1540
1900
        step_ptr = find_step_record(job_ptr, req->job_step_id);
1541
 
        if (step_ptr == NULL)
 
1901
        if (step_ptr == NULL) {
 
1902
                info("step_partial_comp: StepID=%u.%u invalid", 
 
1903
                     req->job_id, req->job_step_id);
1542
1904
                return ESLURM_INVALID_JOB_ID;
 
1905
        }
1543
1906
        if (step_ptr->batch_step) {
1544
1907
                if(rem)
1545
1908
                        *rem = 0;
1555
1918
                return SLURM_SUCCESS;
1556
1919
        }
1557
1920
        if (req->range_last < req->range_first) {
1558
 
                error("step_partial_comp: range: %u-%u", req->range_first, 
1559
 
                        req->range_last);
 
1921
                error("step_partial_comp: JobID=%u range=%u-%u", 
 
1922
                      req->job_id, req->range_first, req->range_last);
1560
1923
                return EINVAL;
1561
1924
        }
1562
1925
 
1566
1929
                /* initialize the node bitmap for exited nodes */
1567
1930
                nodes = bit_set_count(step_ptr->step_node_bitmap);
1568
1931
                if (req->range_last >= nodes) { /* range is zero origin */
1569
 
                        error("step_partial_comp: last=%u, nodes=%d",
1570
 
                                req->range_last, nodes);
 
1932
                        error("step_partial_comp: JobID=%u last=%u, nodes=%d",
 
1933
                              req->job_id, req->range_last, nodes);
1571
1934
                        return EINVAL;
1572
1935
                }
1573
1936
                step_ptr->exit_node_bitmap = bit_alloc(nodes);
1577
1940
        } else {
1578
1941
                nodes = _bitstr_bits(step_ptr->exit_node_bitmap);
1579
1942
                if (req->range_last >= nodes) { /* range is zero origin */
1580
 
                        error("step_partial_comp: last=%u, nodes=%d",
1581
 
                                req->range_last, nodes);
 
1943
                        error("step_partial_comp: JobID=%u last=%u, nodes=%d",
 
1944
                              req->job_id, req->range_last, nodes);
1582
1945
                        return EINVAL;
1583
1946
                }
1584
1947
                step_ptr->exit_code = MAX(step_ptr->exit_code, req->step_rc);
1784
2147
 */
1785
2148
extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer)
1786
2149
{
1787
 
        pack16(step_ptr->step_id, buffer);
 
2150
        pack32(step_ptr->step_id, buffer);
1788
2151
        pack16(step_ptr->cyclic_alloc, buffer);
1789
2152
        pack16(step_ptr->port, buffer);
1790
2153
        pack16(step_ptr->ckpt_interval, buffer);
1791
 
        pack16(step_ptr->mem_per_task, buffer);
1792
 
 
 
2154
        pack16(step_ptr->cpus_per_task, buffer);
 
2155
        pack16(step_ptr->resv_port_cnt, buffer);
 
2156
 
 
2157
        pack8(step_ptr->no_kill, buffer);
 
2158
 
 
2159
        pack32(step_ptr->cpu_count, buffer);
 
2160
        pack32(step_ptr->mem_per_task, buffer);
1793
2161
        pack32(step_ptr->exit_code, buffer);
1794
2162
        if (step_ptr->exit_code != NO_VAL) {
1795
2163
                pack_bit_fmt(step_ptr->exit_node_bitmap, buffer);
1796
2164
                pack16((uint16_t) _bitstr_bits(step_ptr->exit_node_bitmap), 
1797
2165
                        buffer);
1798
2166
        }
 
2167
        if (step_ptr->core_bitmap_job) {
 
2168
                uint32_t core_size = bit_size(step_ptr->core_bitmap_job);
 
2169
                pack32(core_size, buffer);
 
2170
                pack_bit_fmt(step_ptr->core_bitmap_job, buffer);
 
2171
        } else
 
2172
                pack32((uint32_t) 0, buffer);
1799
2173
 
1800
2174
        pack_time(step_ptr->start_time, buffer);
1801
2175
        pack_time(step_ptr->pre_sus_time, buffer);
1803
2177
        pack_time(step_ptr->ckpt_time, buffer);
1804
2178
 
1805
2179
        packstr(step_ptr->host,  buffer);
 
2180
        packstr(step_ptr->resv_ports, buffer);
1806
2181
        packstr(step_ptr->name, buffer);
1807
2182
        packstr(step_ptr->network, buffer);
1808
 
        packstr(step_ptr->ckpt_path, buffer);
 
2183
        packstr(step_ptr->ckpt_dir, buffer);
1809
2184
        pack16(step_ptr->batch_step, buffer);
1810
2185
        if (!step_ptr->batch_step) {
1811
2186
                pack_slurm_step_layout(step_ptr->step_layout, buffer);
1815
2190
}
1816
2191
 
1817
2192
/*
1818
 
 * Create a new job step from data in a buffer (as created by dump_job_step_state)
 
2193
 * Create a new job step from data in a buffer (as created by 
 
2194
 *      dump_job_step_state)
1819
2195
 * IN/OUT - job_ptr - point to a job for which the step is to be loaded.
1820
 
 * IN/OUT buffer - location from which to get data, pointers automatically advanced
 
2196
 * IN/OUT buffer - location to get data from, pointers advanced
1821
2197
 */
1822
2198
extern int load_step_state(struct job_record *job_ptr, Buf buffer)
1823
2199
{
1824
2200
        struct step_record *step_ptr = NULL;
1825
 
        uint16_t step_id, cyclic_alloc, port, batch_step, bit_cnt;
1826
 
        uint16_t ckpt_interval, mem_per_task;
1827
 
        uint32_t exit_code, name_len;
 
2201
        uint8_t no_kill;
 
2202
        uint16_t cyclic_alloc, port, batch_step, bit_cnt;
 
2203
        uint16_t ckpt_interval, cpus_per_task, resv_port_cnt;
 
2204
        uint32_t core_size, cpu_count, exit_code, mem_per_task, name_len;
 
2205
        uint32_t step_id;
1828
2206
        time_t start_time, pre_sus_time, tot_sus_time, ckpt_time;
1829
 
        char *host = NULL, *ckpt_path = NULL;
1830
 
        char *name = NULL, *network = NULL, *bit_fmt = NULL;
 
2207
        char *host = NULL, *ckpt_dir = NULL, *core_job = NULL;
 
2208
        char *resv_ports = NULL, *name = NULL, *network = NULL, *bit_fmt = NULL;
1831
2209
        switch_jobinfo_t switch_tmp = NULL;
1832
2210
        check_jobinfo_t check_tmp = NULL;
1833
2211
        slurm_step_layout_t *step_layout = NULL;
1834
2212
        
1835
 
        safe_unpack16(&step_id, buffer);
 
2213
        safe_unpack32(&step_id, buffer);
1836
2214
        safe_unpack16(&cyclic_alloc, buffer);
1837
2215
        safe_unpack16(&port, buffer);
1838
2216
        safe_unpack16(&ckpt_interval, buffer);
1839
 
        safe_unpack16(&mem_per_task, buffer);
1840
 
 
 
2217
        safe_unpack16(&cpus_per_task, buffer);
 
2218
        safe_unpack16(&resv_port_cnt, buffer);
 
2219
 
 
2220
        safe_unpack8(&no_kill, buffer);
 
2221
 
 
2222
        safe_unpack32(&cpu_count, buffer);
 
2223
        safe_unpack32(&mem_per_task, buffer);
1841
2224
        safe_unpack32(&exit_code, buffer);
1842
2225
        if (exit_code != NO_VAL) {
1843
2226
                safe_unpackstr_xmalloc(&bit_fmt, &name_len, buffer);
1844
2227
                safe_unpack16(&bit_cnt, buffer);
1845
2228
        }
1846
 
        
 
2229
        safe_unpack32(&core_size, buffer);
 
2230
        if (core_size)
 
2231
                safe_unpackstr_xmalloc(&core_job, &name_len, buffer);
 
2232
 
1847
2233
        safe_unpack_time(&start_time, buffer);
1848
2234
        safe_unpack_time(&pre_sus_time, buffer);
1849
2235
        safe_unpack_time(&tot_sus_time, buffer);
1850
2236
        safe_unpack_time(&ckpt_time, buffer);
1851
2237
 
1852
2238
        safe_unpackstr_xmalloc(&host, &name_len, buffer);
 
2239
        safe_unpackstr_xmalloc(&resv_ports, &name_len, buffer);
1853
2240
        safe_unpackstr_xmalloc(&name, &name_len, buffer);
1854
2241
        safe_unpackstr_xmalloc(&network, &name_len, buffer);
1855
 
        safe_unpackstr_xmalloc(&ckpt_path, &name_len, buffer);
 
2242
        safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer);
1856
2243
        safe_unpack16(&batch_step, buffer);
1857
2244
        if (!batch_step) {
1858
2245
                if (unpack_slurm_step_layout(&step_layout, buffer))
1871
2258
                      job_ptr->job_id, step_id, cyclic_alloc);
1872
2259
                goto unpack_error;
1873
2260
        }
 
2261
        if (no_kill > 1) {
 
2262
                error("Invalid data for job %u.%u: no_kill=%u",
 
2263
                      job_ptr->job_id, step_id, no_kill);
 
2264
                goto unpack_error;
 
2265
        }
1874
2266
 
1875
2267
        step_ptr = find_step_record(job_ptr, step_id);
1876
2268
        if (step_ptr == NULL)
1877
 
                step_ptr = create_step_record(job_ptr);
 
2269
                step_ptr = _create_step_record(job_ptr);
1878
2270
        if (step_ptr == NULL)
1879
2271
                goto unpack_error;
1880
2272
 
1881
2273
        /* set new values */
1882
2274
        step_ptr->step_id      = step_id;
 
2275
        step_ptr->cpu_count    = cpu_count;
 
2276
        step_ptr->cpus_per_task= cpus_per_task;
1883
2277
        step_ptr->cyclic_alloc = cyclic_alloc;
 
2278
        step_ptr->resv_port_cnt= resv_port_cnt;
 
2279
        step_ptr->resv_ports   = resv_ports;
1884
2280
        step_ptr->name         = name;
1885
2281
        step_ptr->network      = network;
1886
 
        step_ptr->ckpt_path    = ckpt_path;
 
2282
        step_ptr->no_kill      = no_kill;
 
2283
        step_ptr->ckpt_dir     = ckpt_dir;
1887
2284
        step_ptr->port         = port;
1888
2285
        step_ptr->ckpt_interval= ckpt_interval;
1889
2286
        step_ptr->mem_per_task = mem_per_task;
1890
2287
        step_ptr->host         = host;
 
2288
        host                   = NULL;  /* re-used, nothing left to free */
1891
2289
        step_ptr->batch_step   = batch_step;
1892
 
        host                   = NULL;  /* re-used, nothing left to free */
1893
2290
        step_ptr->start_time   = start_time;
1894
2291
        step_ptr->pre_sus_time = pre_sus_time;
1895
2292
        step_ptr->tot_sus_time = tot_sus_time;
1896
2293
        step_ptr->ckpt_time    = ckpt_time;
1897
 
        step_ptr->cpus_per_task = 1;    /* Need to save/restore in v1.4 */
1898
2294
 
1899
2295
        slurm_step_layout_destroy(step_ptr->step_layout);
1900
2296
        step_ptr->step_layout  = step_layout;
1916
2312
                }
1917
2313
                xfree(bit_fmt);
1918
2314
        }
 
2315
        if (core_size) {
 
2316
                step_ptr->core_bitmap_job = bit_alloc(core_size);
 
2317
                if (bit_unfmt(step_ptr->core_bitmap_job, core_job)) {
 
2318
                        error("error recovering core_bitmap_job from %s",
 
2319
                              core_job);
 
2320
                }
 
2321
                xfree(core_job);
 
2322
        }
1919
2323
 
1920
2324
        if (step_ptr->step_layout && step_ptr->step_layout->node_list) {
1921
2325
                switch_g_job_step_allocated(switch_tmp, 
1928
2332
 
1929
2333
      unpack_error:
1930
2334
        xfree(host);
 
2335
        xfree(resv_ports);
1931
2336
        xfree(name);
1932
2337
        xfree(network);
1933
 
        xfree(ckpt_path);
 
2338
        xfree(ckpt_dir);
1934
2339
        xfree(bit_fmt);
 
2340
        xfree(core_job);
1935
2341
        if (switch_tmp)
1936
2342
                switch_free_jobinfo(switch_tmp);
1937
2343
        slurm_step_layout_destroy(step_layout);
1950
2356
        time_t event_time;
1951
2357
        uint32_t error_code;
1952
2358
        char *error_msg;
 
2359
        checkpoint_msg_t ckpt_req;
1953
2360
 
1954
2361
        /* Exit if "checkpoint/none" is configured */
1955
2362
        if (ckpt_run == -1) {
1967
2374
        while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
1968
2375
                if (job_ptr->job_state != JOB_RUNNING)
1969
2376
                        continue;
 
2377
                if (job_ptr->batch_flag &&
 
2378
                    (job_ptr->ckpt_interval != 0)) { /* periodic job ckpt */
 
2379
                        ckpt_due = job_ptr->ckpt_time +
 
2380
                                   (job_ptr->ckpt_interval * 60);
 
2381
                        if (ckpt_due > now)
 
2382
                                continue;
 
2383
                        /* 
 
2384
                         * DO NOT initiate a checkpoint request if the job is
 
2385
                         * started just now, in case it is restarting from checkpoint.
 
2386
                         */
 
2387
                        ckpt_due = job_ptr->start_time +
 
2388
                                   (job_ptr->ckpt_interval * 60);
 
2389
                        if (ckpt_due > now)
 
2390
                                continue;
 
2391
 
 
2392
                        ckpt_req.op = CHECK_CREATE;
 
2393
                        ckpt_req.data = 0;
 
2394
                        ckpt_req.job_id = job_ptr->job_id;
 
2395
                        ckpt_req.step_id = SLURM_BATCH_SCRIPT;
 
2396
                        ckpt_req.image_dir = NULL;
 
2397
                        job_checkpoint(&ckpt_req, getuid(), -1);
 
2398
                        job_ptr->ckpt_time = now;
 
2399
                        last_job_update = now;
 
2400
                        continue; /* ignore periodic step ckpt */
 
2401
                }
1970
2402
                step_iterator = list_iterator_create (job_ptr->step_list);
1971
2403
                while ((step_ptr = (struct step_record *) 
1972
2404
                                list_next (step_iterator))) {
 
2405
                        char *image_dir = NULL;
1973
2406
                        if (step_ptr->ckpt_interval == 0)
1974
2407
                                continue;
1975
2408
                        ckpt_due = step_ptr->ckpt_time +
1976
 
                                (step_ptr->ckpt_interval * 60);
 
2409
                                   (step_ptr->ckpt_interval * 60);
1977
2410
                        if (ckpt_due > now) 
1978
2411
                                continue;
 
2412
                        /* 
 
2413
                         * DO NOT initiate a checkpoint request if the step is
 
2414
                         * started just now, in case it is restarting from 
 
2415
                         * checkpoint.
 
2416
                         */
 
2417
                        ckpt_due = step_ptr->start_time + 
 
2418
                                   (step_ptr->ckpt_interval * 60);
 
2419
                        if (ckpt_due > now)
 
2420
                                continue;
 
2421
 
1979
2422
                        step_ptr->ckpt_time = now;
1980
2423
                        last_job_update = now;
1981
 
                        (void) checkpoint_op(CHECK_CREATE, 0, 
1982
 
                                (void *)step_ptr, &event_time, 
1983
 
                                &error_code, &error_msg);
 
2424
                        image_dir = xstrdup(step_ptr->ckpt_dir);
 
2425
                        xstrfmtcat(image_dir, "/%u.%u", job_ptr->job_id, 
 
2426
                                   step_ptr->step_id);
 
2427
                        (void) checkpoint_op(job_ptr->job_id, 
 
2428
                                             step_ptr->step_id,
 
2429
                                             step_ptr, CHECK_CREATE, 0, 
 
2430
                                             image_dir, &event_time,
 
2431
                                             &error_code, &error_msg);
 
2432
                        xfree(image_dir);
1984
2433
                }
1985
2434
                list_iterator_destroy (step_iterator);
1986
2435
        }