59
62
#include "src/common/log.h"
60
63
#include "src/common/xstring.h"
61
64
#include "src/common/xmalloc.h"
65
#include "src/common/hostlist.h"
62
66
#include "src/scancel/scancel.h"
64
68
#define MAX_CANCEL_RETRY 10
69
#define MAX_THREADS 20
66
72
static void _cancel_jobs (void);
67
static void _cancel_job_id (uint32_t job_id, uint16_t sig);
68
static void _cancel_step_id (uint32_t job_id, uint32_t step_id,
73
static void *_cancel_job_id (void *cancel_info);
74
static void *_cancel_step_id (void *cancel_info);
70
76
static int _confirmation (int i, uint32_t step_id);
71
77
static void _filter_job_records (void);
72
78
static void _load_job_records (void);
79
static void _verify_job_ids (void);
74
81
static job_info_msg_t * job_buffer_ptr = NULL;
83
typedef struct job_cancel_info {
87
int *num_active_threads;
88
pthread_mutex_t *num_active_threads_lock;
89
pthread_cond_t *num_active_threads_cond;
92
static pthread_attr_t attr;
93
static int num_active_threads = 0;
94
static pthread_mutex_t num_active_threads_lock;
95
static pthread_cond_t num_active_threads_cond;
77
98
main (int argc, char *argv[])
142
_verify_job_ids (void)
144
/* If a list of jobs was given, make sure each job is actually in
145
* our list of job records. */
147
job_info_t *job_ptr = job_buffer_ptr->job_array;
149
for (j = 0; j < opt.job_cnt; j++ ) {
150
for (i = 0; i < job_buffer_ptr->record_count; i++) {
151
if (job_ptr[i].job_id == opt.job_id[j])
154
if (((job_ptr[i].job_state >= JOB_COMPLETE) ||
155
(i >= job_buffer_ptr->record_count)) &&
156
(opt.verbose >= 0)) {
157
if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
158
error("Kill job error on job id %u: %s",
160
slurm_strerror(ESLURM_INVALID_JOB_ID));
162
error("Kill job error on job step id %u.%u: %s",
163
opt.job_id[j], opt.step_id[j],
164
slurm_strerror(ESLURM_INVALID_JOB_ID));
117
170
/* _filter_job_records - filtering job information per user specification */
119
172
_filter_job_records (void)
187
/* _cancel_jobs - filter then cancel jobs or job steps per request */
255
_cancel_jobs_by_state(uint16_t job_state)
192
job_info_t *job_ptr = NULL;
194
if (opt.job_cnt && opt.interactive) { /* confirm cancel */
195
job_ptr = job_buffer_ptr->job_array ;
196
for (j = 0; j < opt.job_cnt; j++ ) {
197
for (i = 0; i < job_buffer_ptr->record_count; i++) {
198
if (job_ptr[i].job_id != opt.job_id[j])
258
job_cancel_info_t *cancel_info;
259
job_info_t *job_ptr = job_buffer_ptr->job_array;
262
/* Spawn a thread to cancel each job or job step marked for
264
for (i = 0; i < job_buffer_ptr->record_count; i++) {
265
if (job_ptr[i].job_id == 0)
268
if ((job_state < JOB_END) &&
269
(job_ptr[i].job_state != job_state))
272
/* If cancelling a list of jobs, see if the current job
273
* included a step id */
275
for (j = 0; j < opt.job_cnt; j++ ) {
276
if (job_ptr[i].job_id != opt.job_id[j])
200
if (opt.interactive &&
279
if (opt.interactive &&
201
280
(_confirmation(i, opt.step_id[j]) == 0))
284
(job_cancel_info_t *)
285
xmalloc(sizeof(job_cancel_info_t));
286
cancel_info->job_id = job_ptr[i].job_id;
287
cancel_info->sig = opt.signal;
288
cancel_info->num_active_threads =
290
cancel_info->num_active_threads_lock =
291
&num_active_threads_lock;
292
cancel_info->num_active_threads_cond =
293
&num_active_threads_cond;
295
pthread_mutex_lock(&num_active_threads_lock);
296
num_active_threads++;
297
while (num_active_threads > MAX_THREADS) {
299
&num_active_threads_cond,
300
&num_active_threads_lock);
302
pthread_mutex_unlock(&num_active_threads_lock);
304
if (opt.step_id[j] == SLURM_BATCH_SCRIPT) {
305
err = pthread_create(&dummy, &attr,
309
_cancel_job_id(cancel_info);
203
if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
204
_cancel_job_id (opt.job_id[j],
207
_cancel_step_id (opt.job_id[j],
312
cancel_info->step_id = opt.step_id[j];
313
err = pthread_create(&dummy, &attr,
317
_cancel_step_id(cancel_info);
318
/* Don't break here. Keep looping in
319
* case other steps from the same job
212
if (i >= job_buffer_ptr->record_count)
213
fprintf (stderr, "Job %u not found\n",
217
} else if (opt.job_cnt) { /* delete specific jobs */
218
for (j = 0; j < opt.job_cnt; j++ ) {
219
if (opt.step_id[j] == SLURM_BATCH_SCRIPT)
220
_cancel_job_id (opt.job_id[j],
223
_cancel_step_id (opt.job_id[j],
228
} else { /* delete all jobs per filtering */
229
job_ptr = job_buffer_ptr->job_array ;
230
for (i = 0; i < job_buffer_ptr->record_count; i++) {
231
if (job_ptr[i].job_id == 0)
233
if (opt.interactive &&
324
if (opt.interactive &&
234
325
(_confirmation(i, SLURM_BATCH_SCRIPT) == 0))
236
_cancel_job_id (job_ptr[i].job_id, opt.signal);
328
cancel_info = (job_cancel_info_t *)
329
xmalloc(sizeof(job_cancel_info_t));
330
cancel_info->job_id = job_ptr[i].job_id;
331
cancel_info->sig = opt.signal;
332
cancel_info->num_active_threads = &num_active_threads;
333
cancel_info->num_active_threads_lock =
334
&num_active_threads_lock;
335
cancel_info->num_active_threads_cond =
336
&num_active_threads_cond;
338
pthread_mutex_lock( &num_active_threads_lock );
339
num_active_threads++;
340
while (num_active_threads > MAX_THREADS) {
341
pthread_cond_wait(&num_active_threads_cond,
342
&num_active_threads_lock);
344
pthread_mutex_unlock(&num_active_threads_lock);
346
err = pthread_create(&dummy, &attr, _cancel_job_id,
349
_cancel_job_id(cancel_info);
351
job_ptr[i].job_id = 0;
355
/* _cancel_jobs - filter then cancel jobs or job steps per request */
242
_cancel_job_id (uint32_t job_id, uint16_t sig)
359
slurm_attr_init(&attr);
360
if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
361
error("pthread_attr_setdetachstate error %m");
363
slurm_mutex_init(&num_active_threads_lock);
365
if (pthread_cond_init(&num_active_threads_cond, NULL))
366
error("pthread_cond_init error %m");
368
_cancel_jobs_by_state(JOB_PENDING);
369
_cancel_jobs_by_state(JOB_END);
371
/* Wait for any spawned threads that have not finished */
372
pthread_mutex_lock( &num_active_threads_lock );
373
while (num_active_threads > 0) {
374
pthread_cond_wait( &num_active_threads_cond,
375
&num_active_threads_lock );
377
pthread_mutex_unlock( &num_active_threads_lock );
379
slurm_attr_destroy(&attr);
380
slurm_mutex_destroy(&num_active_threads_lock);
381
if (pthread_cond_destroy(&num_active_threads_cond))
382
error("pthread_cond_destroy error %m");
386
_cancel_job_id (void *ci)
244
388
int error_code = SLURM_SUCCESS, i;
245
389
bool sig_set = true;
391
job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci;
392
uint32_t job_id = cancel_info->job_id;
393
uint16_t sig = cancel_info->sig;
247
395
if (sig == (uint16_t)-1) {
281
429
error("Kill job error on job id %u: %s",
282
430
job_id, slurm_strerror(slurm_get_errno()));
433
/* Purposely free the struct passed in here, so the caller doesn't have
434
* to keep track of it, but don't destroy the mutex and condition
435
* variables contained. */
436
pthread_mutex_lock( cancel_info->num_active_threads_lock );
437
(*(cancel_info->num_active_threads))--;
438
pthread_cond_signal( cancel_info->num_active_threads_cond );
439
pthread_mutex_unlock( cancel_info->num_active_threads_lock );
287
_cancel_step_id (uint32_t job_id, uint32_t step_id, uint16_t sig)
446
_cancel_step_id (void *ci)
289
448
int error_code = SLURM_SUCCESS, i;
449
job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci;
450
uint32_t job_id = cancel_info->job_id;
451
uint32_t step_id = cancel_info->step_id;
452
uint16_t sig = cancel_info->sig;
291
454
if (sig == (uint16_t)-1)