~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid

« back to all changes in this revision

Viewing changes to src/scancel/scancel.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release 
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient 
* Added Default-Start for runlevel 2 and 4 and $remote_fs requirement in
  init.d scripts (Closes: #541252)
* Postinst checks for wrong runlevels 2 and 4 links
* Upgraded to standard version 3.8.3
* Add lintian overrides for missing slurm-llnl-configurator.html in doc
  base registration
* modified postrm scripts to ignore pkill return value in order to avoid
  postrm failure when no slurm process is running
* Checking for slurmctld.pid before cancelling running and pending
  jobs during package removal 

Show diffs side-by-side

added added

removed removed

Lines of Context:
2
2
 *  scancel - cancel specified job(s) and/or job step(s)
3
3
 *****************************************************************************
4
4
 *  Copyright (C) 2002-2007 The Regents of the University of California.
 
5
 *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
5
6
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
6
7
 *  Written by Morris Jette <jette1@llnl.gov>
7
 
 *  LLNL-CODE-402394.
 
8
 *  CODE-OCEC-09-009. All rights reserved.
8
9
 *  
9
10
 *  This file is part of SLURM, a resource management program.
10
 
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 
11
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 
12
 *  Please also read the included file: DISCLAIMER.
11
13
 *  
12
14
 *  SLURM is free software; you can redistribute it and/or modify it under
13
15
 *  the terms of the GNU General Public License as published by the Free
45
47
#include <stdlib.h>
46
48
#include <string.h>
47
49
#include <signal.h>
 
50
#include <pthread.h>
48
51
 
49
52
#if HAVE_INTTYPES_H
50
53
#  include <inttypes.h>
59
62
#include "src/common/log.h"
60
63
#include "src/common/xstring.h"
61
64
#include "src/common/xmalloc.h"
 
65
#include "src/common/hostlist.h"
62
66
#include "src/scancel/scancel.h"
63
67
 
64
68
#define MAX_CANCEL_RETRY 10
 
69
#define MAX_THREADS 20
 
70
 
65
71
 
66
72
static void _cancel_jobs (void);
67
 
static void _cancel_job_id (uint32_t job_id, uint16_t sig);
68
 
static void _cancel_step_id (uint32_t job_id, uint32_t step_id, 
69
 
                             uint16_t sig);
 
73
static void *_cancel_job_id (void *cancel_info);
 
74
static void *_cancel_step_id (void *cancel_info);
 
75
 
70
76
static int  _confirmation (int i, uint32_t step_id);
71
77
static void _filter_job_records (void);
72
78
static void _load_job_records (void);
 
79
static void _verify_job_ids (void);
73
80
 
74
81
static job_info_msg_t * job_buffer_ptr = NULL;
75
82
 
 
83
typedef struct job_cancel_info {
 
84
        uint32_t job_id;
 
85
        uint32_t step_id;
 
86
        uint16_t sig;
 
87
        int             *num_active_threads;
 
88
        pthread_mutex_t *num_active_threads_lock;
 
89
        pthread_cond_t  *num_active_threads_cond;
 
90
} job_cancel_info_t;
 
91
 
 
92
static  pthread_attr_t  attr;
 
93
static  int num_active_threads = 0;
 
94
static  pthread_mutex_t  num_active_threads_lock;
 
95
static  pthread_cond_t   num_active_threads_cond;
 
96
 
76
97
int
77
98
main (int argc, char *argv[]) 
78
99
{
85
106
                log_alter (log_opts, SYSLOG_FACILITY_DAEMON, NULL);
86
107
        } 
87
108
        
 
109
        _load_job_records();
 
110
        _verify_job_ids();
 
111
 
88
112
        if ((opt.interactive) ||
89
113
            (opt.job_name) ||
90
114
            (opt.partition) ||
91
115
            (opt.state != JOB_END) ||
92
 
            (opt.user_name)) {
93
 
                _load_job_records ();
 
116
            (opt.user_name) ||
 
117
            (opt.nodelist)) {
94
118
                _filter_job_records ();
95
119
        }
96
120
        _cancel_jobs ();
114
138
}
115
139
 
116
140
 
 
141
static void
 
142
_verify_job_ids (void)
 
143
{
 
144
        /* If a list of jobs was given, make sure each job is actually in
 
145
         * our list of job records. */
 
146
        int i, j;
 
147
        job_info_t *job_ptr = job_buffer_ptr->job_array;
 
148
 
 
149
        for (j = 0; j < opt.job_cnt; j++ ) {
 
150
                for (i = 0; i < job_buffer_ptr->record_count; i++) {
 
151
                        if (job_ptr[i].job_id == opt.job_id[j])
 
152
                                break;
 
153
                }
 
154
                if (((job_ptr[i].job_state >= JOB_COMPLETE) ||
 
155
                     (i >= job_buffer_ptr->record_count)) &&
 
156
                     (opt.verbose >= 0)) {
 
157
                        if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
 
158
                                error("Kill job error on job id %u: %s", 
 
159
                                      opt.job_id[j], 
 
160
                                      slurm_strerror(ESLURM_INVALID_JOB_ID));
 
161
                        else
 
162
                                error("Kill job error on job step id %u.%u: %s",
 
163
                                      opt.job_id[j], opt.step_id[j],
 
164
                                      slurm_strerror(ESLURM_INVALID_JOB_ID));
 
165
                }
 
166
        }
 
167
}
 
168
 
 
169
 
117
170
/* _filter_job_records - filtering job information per user specification */
118
171
static void 
119
172
_filter_job_records (void)
133
186
                        continue;
134
187
                }
135
188
 
136
 
                if (opt.job_name != NULL) {
137
 
                        char *quote = NULL;
138
 
                        int set = 0;
139
 
                        if ((quote = strchr(job_ptr[i].name, (int) '\"'))) 
140
 
                                /* take out the wckey */
141
 
                                *quote = '\0';
142
 
                        else 
143
 
                                quote = job_ptr[i].name;
144
 
                        
145
 
                        if(strcmp(job_ptr[i].name, opt.job_name) != 0) {
146
 
                                job_ptr[i].job_id = 0;
147
 
                                if(set)
148
 
                                        *quote = '\"';  
149
 
                                continue;
150
 
                        }
151
 
                        if(set)
152
 
                                *quote = '\"';                                  
 
189
                if (opt.job_name != NULL &&
 
190
                    (strcmp(job_ptr[i].name, opt.job_name) != 0)) {
 
191
                        job_ptr[i].job_id = 0;
 
192
                        continue;
 
193
                }
 
194
 
 
195
                if (opt.wckey != NULL &&
 
196
                    (strcmp(job_ptr[i].wckey, opt.wckey) != 0)) {
 
197
                        job_ptr[i].job_id = 0;
 
198
                        continue;
153
199
                }
154
200
 
155
201
                if ((opt.partition != NULL) &&
170
216
                        continue;
171
217
                }
172
218
 
 
219
                if (opt.nodelist != NULL) {
 
220
                        /* If nodelist contains a '/', treat it as a file name */
 
221
                        if (strchr(opt.nodelist, '/') != NULL) {
 
222
                                char *reallist;
 
223
                                reallist = slurm_read_hostfile(opt.nodelist,
 
224
                                                               NO_VAL);
 
225
                                if (reallist) {
 
226
                                        xfree(opt.nodelist);
 
227
                                        opt.nodelist = reallist;
 
228
                                }
 
229
                        }
 
230
 
 
231
                        hostset_t hs = hostset_create(job_ptr[i].nodes);
 
232
                        if (!hostset_intersects(hs, opt.nodelist)) {
 
233
                                job_ptr[i].job_id = 0;
 
234
                                hostset_destroy(hs);
 
235
                                continue;
 
236
                        } else {
 
237
                                hostset_destroy(hs);
 
238
                        }
 
239
                }
 
240
 
173
241
                if (opt.job_cnt == 0)
174
242
                        continue;
175
243
                for (j = 0; j < opt.job_cnt; j++) {
183
251
        }
184
252
}
185
253
 
186
 
 
187
 
/* _cancel_jobs - filter then cancel jobs or job steps per request */
188
254
static void
189
 
_cancel_jobs (void)
 
255
_cancel_jobs_by_state(uint16_t job_state)
190
256
{
191
 
        int i, j;
192
 
        job_info_t *job_ptr = NULL;
193
 
 
194
 
        if (opt.job_cnt && opt.interactive) {   /* confirm cancel */
195
 
                job_ptr = job_buffer_ptr->job_array ;
196
 
                for (j = 0; j < opt.job_cnt; j++ ) {
197
 
                        for (i = 0; i < job_buffer_ptr->record_count; i++) {
198
 
                                if (job_ptr[i].job_id != opt.job_id[j]) 
 
257
        int i, j, err;
 
258
        job_cancel_info_t *cancel_info;
 
259
        job_info_t *job_ptr = job_buffer_ptr->job_array;
 
260
        pthread_t  dummy;
 
261
 
 
262
        /* Spawn a thread to cancel each job or job step marked for
 
263
         * cancellation */
 
264
        for (i = 0; i < job_buffer_ptr->record_count; i++) {
 
265
                if (job_ptr[i].job_id == 0)
 
266
                        continue;
 
267
 
 
268
                if ((job_state < JOB_END) &&
 
269
                    (job_ptr[i].job_state != job_state))
 
270
                        continue;
 
271
 
 
272
                /* If cancelling a list of jobs, see if the current job
 
273
                 * included a step id */
 
274
                if (opt.job_cnt) {
 
275
                        for (j = 0; j < opt.job_cnt; j++ ) {
 
276
                                if (job_ptr[i].job_id != opt.job_id[j])
199
277
                                        continue;
200
 
                                if (opt.interactive && 
 
278
 
 
279
                                if (opt.interactive &&
201
280
                                    (_confirmation(i, opt.step_id[j]) == 0))
 
281
                                        continue;
 
282
 
 
283
                                cancel_info =
 
284
                                        (job_cancel_info_t *)
 
285
                                        xmalloc(sizeof(job_cancel_info_t));
 
286
                                cancel_info->job_id  = job_ptr[i].job_id;
 
287
                                cancel_info->sig     = opt.signal;
 
288
                                cancel_info->num_active_threads =
 
289
                                        &num_active_threads;
 
290
                                cancel_info->num_active_threads_lock =
 
291
                                        &num_active_threads_lock;
 
292
                                cancel_info->num_active_threads_cond =
 
293
                                        &num_active_threads_cond;
 
294
 
 
295
                                pthread_mutex_lock(&num_active_threads_lock);
 
296
                                num_active_threads++;
 
297
                                while (num_active_threads > MAX_THREADS) {
 
298
                                        pthread_cond_wait(
 
299
                                                &num_active_threads_cond,
 
300
                                                &num_active_threads_lock);
 
301
                                }
 
302
                                pthread_mutex_unlock(&num_active_threads_lock);
 
303
 
 
304
                                if (opt.step_id[j] == SLURM_BATCH_SCRIPT) {
 
305
                                        err = pthread_create(&dummy, &attr,
 
306
                                                             _cancel_job_id,
 
307
                                                             cancel_info);
 
308
                                        if (err)
 
309
                                                _cancel_job_id(cancel_info);
202
310
                                        break;
203
 
                                if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
204
 
                                        _cancel_job_id (opt.job_id[j], 
205
 
                                                        opt.signal);
206
 
                                else
207
 
                                        _cancel_step_id (opt.job_id[j], 
208
 
                                                        opt.step_id[j],
209
 
                                                        opt.signal);
210
 
                                break;
 
311
                                } else {
 
312
                                        cancel_info->step_id = opt.step_id[j];
 
313
                                        err = pthread_create(&dummy, &attr,
 
314
                                                             _cancel_step_id,
 
315
                                                             cancel_info);
 
316
                                        if (err)
 
317
                                                _cancel_step_id(cancel_info);
 
318
                                        /* Don't break here.  Keep looping in
 
319
                                         * case other steps from the same job
 
320
                                         * are cancelled. */
 
321
                                }
211
322
                        }
212
 
                        if (i >= job_buffer_ptr->record_count)
213
 
                                fprintf (stderr, "Job %u not found\n", 
214
 
                                         opt.job_id[j]);
215
 
                }
216
 
 
217
 
        } else if (opt.job_cnt) {       /* delete specific jobs */
218
 
                for (j = 0; j < opt.job_cnt; j++ ) {
219
 
                        if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
220
 
                                _cancel_job_id (opt.job_id[j], 
221
 
                                                opt.signal);
222
 
                        else
223
 
                                _cancel_step_id (opt.job_id[j], 
224
 
                                                opt.step_id[j], 
225
 
                                                opt.signal);
226
 
                }
227
 
 
228
 
        } else {                /* delete all jobs per filtering */
229
 
                job_ptr = job_buffer_ptr->job_array ;
230
 
                for (i = 0; i < job_buffer_ptr->record_count; i++) {
231
 
                        if (job_ptr[i].job_id == 0) 
232
 
                                continue;
233
 
                        if (opt.interactive && 
 
323
                } else {
 
324
                        if (opt.interactive &&
234
325
                            (_confirmation(i, SLURM_BATCH_SCRIPT) == 0))
235
326
                                continue;
236
 
                        _cancel_job_id (job_ptr[i].job_id, opt.signal);
 
327
 
 
328
                        cancel_info = (job_cancel_info_t *)
 
329
                                xmalloc(sizeof(job_cancel_info_t));
 
330
                        cancel_info->job_id  = job_ptr[i].job_id;
 
331
                        cancel_info->sig     = opt.signal;
 
332
                        cancel_info->num_active_threads = &num_active_threads;
 
333
                        cancel_info->num_active_threads_lock =
 
334
                                &num_active_threads_lock;
 
335
                        cancel_info->num_active_threads_cond =
 
336
                                &num_active_threads_cond;
 
337
 
 
338
                        pthread_mutex_lock( &num_active_threads_lock );
 
339
                        num_active_threads++;
 
340
                        while (num_active_threads > MAX_THREADS) {
 
341
                                pthread_cond_wait(&num_active_threads_cond,
 
342
                                                  &num_active_threads_lock);
 
343
                        }
 
344
                        pthread_mutex_unlock(&num_active_threads_lock);
 
345
 
 
346
                        err = pthread_create(&dummy, &attr, _cancel_job_id,
 
347
                                             cancel_info);
 
348
                        if (err)
 
349
                                _cancel_job_id(cancel_info);
237
350
                }
 
351
                job_ptr[i].job_id = 0;
238
352
        }
239
353
}
240
354
 
 
355
/* _cancel_jobs - filter then cancel jobs or job steps per request */
241
356
static void
242
 
_cancel_job_id (uint32_t job_id, uint16_t sig)
 
357
_cancel_jobs (void)
 
358
{
 
359
        slurm_attr_init(&attr);
 
360
        if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
 
361
                error("pthread_attr_setdetachstate error %m");
 
362
 
 
363
        slurm_mutex_init(&num_active_threads_lock);
 
364
 
 
365
        if (pthread_cond_init(&num_active_threads_cond, NULL))
 
366
                error("pthread_cond_init error %m");
 
367
 
 
368
        _cancel_jobs_by_state(JOB_PENDING);
 
369
        _cancel_jobs_by_state(JOB_END);
 
370
 
 
371
        /* Wait for any spawned threads that have not finished */
 
372
        pthread_mutex_lock( &num_active_threads_lock );
 
373
        while (num_active_threads > 0) {
 
374
                pthread_cond_wait( &num_active_threads_cond,
 
375
                                   &num_active_threads_lock );
 
376
        }
 
377
        pthread_mutex_unlock( &num_active_threads_lock );
 
378
 
 
379
        slurm_attr_destroy(&attr);
 
380
        slurm_mutex_destroy(&num_active_threads_lock);
 
381
        if (pthread_cond_destroy(&num_active_threads_cond))
 
382
                error("pthread_cond_destroy error %m");
 
383
}
 
384
 
 
385
static void *
 
386
_cancel_job_id (void *ci)
243
387
{
244
388
        int error_code = SLURM_SUCCESS, i;
245
389
        bool sig_set = true;
246
390
 
 
391
        job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci;
 
392
        uint32_t job_id = cancel_info->job_id;
 
393
        uint16_t sig    = cancel_info->sig;
 
394
 
247
395
        if (sig == (uint16_t)-1) {
248
396
                sig = SIGKILL;
249
397
                sig_set = false;
281
429
                        error("Kill job error on job id %u: %s", 
282
430
                                job_id, slurm_strerror(slurm_get_errno()));
283
431
        }
 
432
 
 
433
        /* Purposely free the struct passed in here, so the caller doesn't have
 
434
         * to keep track of it, but don't destroy the mutex and condition 
 
435
         * variables contained. */ 
 
436
        pthread_mutex_lock(   cancel_info->num_active_threads_lock );
 
437
        (*(cancel_info->num_active_threads))--;
 
438
        pthread_cond_signal(  cancel_info->num_active_threads_cond );
 
439
        pthread_mutex_unlock( cancel_info->num_active_threads_lock );
 
440
 
 
441
        xfree(cancel_info);
 
442
        return NULL;
284
443
}
285
444
 
286
 
static void
287
 
_cancel_step_id (uint32_t job_id, uint32_t step_id, uint16_t sig)
 
445
static void *
 
446
_cancel_step_id (void *ci)
288
447
{
289
448
        int error_code = SLURM_SUCCESS, i;
 
449
        job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci;
 
450
        uint32_t job_id  = cancel_info->job_id;
 
451
        uint32_t step_id = cancel_info->step_id;
 
452
        uint16_t sig     = cancel_info->sig;
290
453
 
291
454
        if (sig == (uint16_t)-1)
292
455
                sig = SIGKILL;
320
483
                                job_id, step_id, 
321
484
                                slurm_strerror(slurm_get_errno()));
322
485
        }
 
486
 
 
487
        /* Purposely free the struct passed in here, so the caller doesn't have
 
488
         * to keep track of it, but don't destroy the mutex and condition 
 
489
         * variables contained. */ 
 
490
        pthread_mutex_lock(   cancel_info->num_active_threads_lock );
 
491
        (*(cancel_info->num_active_threads))--;
 
492
        pthread_cond_signal(  cancel_info->num_active_threads_cond );
 
493
        pthread_mutex_unlock( cancel_info->num_active_threads_lock );
 
494
 
 
495
        xfree(cancel_info);
 
496
        return NULL;
323
497
}
324
498
 
325
499
/* _confirmation - Confirm job cancel request interactively */
328
502
{
329
503
        char in_line[128];
330
504
        job_info_t *job_ptr = NULL;
 
505
        char *line = NULL;
331
506
 
332
507
        job_ptr = job_buffer_ptr->job_array ;
333
508
        while (1) {
341
516
                                job_ptr[i].partition);
342
517
                }
343
518
 
344
 
                fgets (in_line, sizeof (in_line), stdin);
 
519
                /* we only set this here to avoid a warning.  We throw it away
 
520
                   later. */
 
521
                line = fgets (in_line, sizeof (in_line), stdin);
345
522
                if ((in_line[0] == 'y') || (in_line[0] == 'Y'))
346
523
                        return 1;
347
524
                if ((in_line[0] == 'n') || (in_line[0] == 'N'))