1
/*****************************************************************************\
2
* jobacct_gather_linux.c - slurm job accounting gather plugin for linux.
3
*****************************************************************************
4
* Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
5
* Written by Andy Riebs, <andy.riebs@hp.com>, who borrowed heavily
6
* from other parts of SLURM, and Danny Auble, <da@llnl.gov>
9
* This file is part of SLURM, a resource management program.
10
* For details, see <http://www.llnl.gov/linux/slurm/>.
12
* SLURM is free software; you can redistribute it and/or modify it under
13
* the terms of the GNU General Public License as published by the Free
14
* Software Foundation; either version 2 of the License, or (at your option)
17
* In addition, as a special exception, the copyright holders give permission
18
* to link the code of portions of this program with the OpenSSL library under
19
* certain conditions as described in each individual source file, and
20
* distribute linked combinations including the two. You must obey the GNU
21
* General Public License in all respects for all of the code used other than
22
* OpenSSL. If you modify file(s) with this exception, you may extend this
23
* exception to your version of the file(s), but you are not obligated to do
24
* so. If you do not wish to do so, delete this exception statement from your
25
* version. If you delete this exception statement from all source files in
26
* the program, then also delete it here.
28
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
29
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
30
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
33
* You should have received a copy of the GNU General Public License along
34
* with SLURM; if not, write to the Free Software Foundation, Inc.,
35
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
37
* This file is patterned after jobcomp_linux.c, written by Morris Jette and
38
* Copyright (C) 2002 The Regents of the University of California.
39
\*****************************************************************************/
43
#include "src/common/jobacct_common.h"
44
#include "src/common/slurm_protocol_api.h"
45
#include "src/common/slurm_protocol_defs.h"
46
#include "src/slurmd/common/proctrack.h"
51
/*
 * These variables are required by the generic plugin interface.  If they
 * are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - a string giving a human-readable description of the
 * plugin.  There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - a string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything.  SLURM uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *	<application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "jobacct_gather" for SLURM job accounting gathering)
 * and <method> is a description of how this plugin satisfies that
 * application.  SLURM will only load job accounting gather plugins if the
 * plugin_type string has a prefix of "jobacct_gather/".
 *
 * plugin_version - an unsigned 32-bit integer giving the version number
 * of the plugin.  If major and minor revisions are desired, the major
 * version number may be multiplied by a suitable magnitude constant such
 * as 100 or 1000.  Various SLURM versions will likely require a certain
 * minimum version for their plugins as the job accounting API evolves.
 */
79
/* Human-readable description of this plugin; also used in its log messages. */
const char plugin_name[] = "Job accounting gather LINUX plugin";
/* Plugin selector in <application>/<method> form. */
const char plugin_type[] = "jobacct_gather/linux";
/* Version number of this plugin. */
const uint32_t plugin_version = 100;
83
/* Other useful declarations */
85
/*
 * Process record: the figures kept for one process after parsing its
 * /proc/<pid>/stat entry.
 *
 * NOTE(review): the pid, ppid and rss members and the closing "} prec_t;"
 * were missing from the damaged listing.  They are restored from the way
 * the rest of this file uses prec_t; confirm the member types.
 */
typedef struct prec {	/* process record */
	pid_t	pid;	/* process id */
	pid_t	ppid;	/* parent process id */
	int	usec;	/* user cpu time */
	int	ssec;	/* system cpu time */
	int	pages;	/* pages (filled from the major fault count) */
	int	rss;	/* resident set size, in KB */
	int	vsize;	/* virtual size, in KB */
} prec_t;
96
static DIR *slash_proc = NULL;
97
static pthread_mutex_t reading_mutex = PTHREAD_MUTEX_INITIALIZER;
99
/* Finally, pre-define all local routines. */
101
static void _acct_kill_job(void);
102
static void _get_offspring_data(List prec_list, prec_t *ancestor, pid_t pid);
103
static void _get_process_data();
104
static int _get_process_data_line(int in, prec_t *prec);
105
static void *_watch_tasks(void *arg);
106
static void _destroy_prec(void *object);
109
* _get_offspring_data() -- collect memory usage data for the offspring
111
* For each process that lists <pid> as its parent, add its memory
112
* usage data to the ancestor's <prec> record. Recurse to gather data
113
* for *all* subsequent generations.
115
* IN: prec_list list of prec's
116
* ancestor The entry in precTable[] to which the data
117
* should be added. Even as we recurse, this will
118
* always be the prec for the base of the family
120
* pid The process for which we are currently looking
127
* THREADSAFE! Only one thread ever gets here.
130
_get_offspring_data(List prec_list, prec_t *ancestor, pid_t pid) {
135
itr = list_iterator_create(prec_list);
136
while((prec = list_next(itr))) {
137
if (prec->ppid == pid) {
139
info("pid:%u ppid:%u rss:%d KB",
140
prec->pid, prec->ppid, prec->rss);
142
_get_offspring_data(prec_list, ancestor, prec->pid);
143
ancestor->usec += prec->usec;
144
ancestor->ssec += prec->ssec;
145
ancestor->pages += prec->pages;
146
ancestor->rss += prec->rss;
147
ancestor->vsize += prec->vsize;
150
list_iterator_destroy(itr);
155
* _get_process_data() - Build a table of all current processes
161
* THREADSAFE! Only one thread ever gets here.
164
* Any file with a name of the form "/proc/[0-9]+/stat"
165
* is a Linux-style stat entry. We disregard the data if they look
168
static void _get_process_data() {
169
static int slash_proc_open = 0;
171
struct dirent *slash_proc_entry;
172
char *iptr = NULL, *optr = NULL;
173
FILE *stat_fp = NULL;
174
char proc_stat_file[256]; /* Allow ~20x extra length */
175
List prec_list = NULL;
178
uint32_t total_job_mem = 0;
183
struct jobacctinfo *jobacct = NULL;
184
static int processing = 0;
186
if(!pgid_plugin && cont_id == (uint32_t)NO_VAL) {
187
debug("cont_id hasn't been set yet not running poll");
192
debug("already running, returning");
196
prec_list = list_create(_destroy_prec);
199
/* get only the processes in the proctrack container */
200
slurm_container_get_pids(cont_id, &pids, &npids);
202
debug4("no pids in this container %d", cont_id);
205
for (i = 0; i < npids; i++) {
206
snprintf(proc_stat_file, 256,
207
"/proc/%d/stat", pids[i]);
208
if ((stat_fp = fopen(proc_stat_file, "r"))==NULL)
209
continue; /* Assume the process went away */
211
* Close the file on exec() of user tasks.
213
* NOTE: If we fork() slurmstepd after the
214
* fopen() above and before the fcntl() below,
215
* then the user task may have this extra file
216
* open, which can cause problems for
217
* checkpoint/restart, but this should be a very rare
218
* problem in practice.
220
fd = fileno(stat_fp);
221
fcntl(fd, F_SETFD, FD_CLOEXEC);
223
prec = xmalloc(sizeof(prec_t));
224
if (_get_process_data_line(fd, prec))
225
list_append(prec_list, prec);
231
slurm_mutex_lock(&reading_mutex);
233
if (slash_proc_open) {
234
rewinddir(slash_proc);
236
slash_proc=opendir("/proc");
237
if (slash_proc == NULL) {
238
perror("opening /proc");
239
slurm_mutex_unlock(&reading_mutex);
244
strcpy(proc_stat_file, "/proc/");
246
while ((slash_proc_entry = readdir(slash_proc))) {
248
/* Save a few cyles by simulating
249
strcat(statFileName, slash_proc_entry->d_name);
250
strcat(statFileName, "/stat");
251
while checking for a numeric filename (which really
254
optr = proc_stat_file + sizeof("/proc");
255
iptr = slash_proc_entry->d_name;
259
|| ((*optr++ = *iptr++) > '9')) {
267
iptr = (char*)"/stat";
274
if ((stat_fp = fopen(proc_stat_file,"r"))==NULL)
275
continue; /* Assume the process went away */
277
* Close the file on exec() of user tasks.
279
* NOTE: If we fork() slurmstepd after the
280
* fopen() above and before the fcntl() below,
281
* then the user task may have this extra file
282
* open, which can cause problems for
283
* checkpoint/restart, but this should be a very rare
284
* problem in practice.
286
fd = fileno(stat_fp);
287
fcntl(fd, F_SETFD, FD_CLOEXEC);
289
prec = xmalloc(sizeof(prec_t));
290
if (_get_process_data_line(fd, prec))
291
list_append(prec_list, prec);
296
slurm_mutex_unlock(&reading_mutex);
300
if (!list_count(prec_list)) {
301
goto finished; /* We have no business being here! */
304
slurm_mutex_lock(&jobacct_lock);
305
if(!task_list || !list_count(task_list)) {
306
slurm_mutex_unlock(&jobacct_lock);
310
itr = list_iterator_create(task_list);
311
while((jobacct = list_next(itr))) {
312
itr2 = list_iterator_create(prec_list);
313
while((prec = list_next(itr2))) {
314
if (prec->pid == jobacct->pid) {
316
info("pid:%u ppid:%u rss:%d KB",
317
prec->pid, prec->ppid, prec->rss);
319
/* find all my descendents */
320
_get_offspring_data(prec_list,
322
/* tally their usage */
323
jobacct->max_rss = jobacct->tot_rss =
324
MAX(jobacct->max_rss, prec->rss);
325
total_job_mem += prec->rss;
326
jobacct->max_vsize = jobacct->tot_vsize =
327
MAX(jobacct->max_vsize, prec->vsize);
328
jobacct->max_pages = jobacct->tot_pages =
329
MAX(jobacct->max_pages, prec->pages);
330
jobacct->min_cpu = jobacct->tot_cpu =
331
MAX(jobacct->min_cpu,
332
(prec->usec + prec->ssec));
333
debug2("%d mem size %u %u time %u",
334
jobacct->pid, jobacct->max_rss,
335
jobacct->max_vsize, jobacct->tot_cpu);
339
list_iterator_destroy(itr2);
341
list_iterator_destroy(itr);
342
slurm_mutex_unlock(&jobacct_lock);
345
debug("Job %u memory used:%u limit:%u KB",
346
acct_job_id, total_job_mem, job_mem_limit);
348
if (acct_job_id && job_mem_limit &&
349
(total_job_mem > job_mem_limit)) {
350
error("Job %u exceeded %u KB memory limit, being killed",
351
acct_job_id, job_mem_limit);
356
list_destroy(prec_list);
361
/* _acct_kill_job() issue RPC to kill a slurm job */
362
static void _acct_kill_job(void)
365
job_step_kill_msg_t req;
367
slurm_msg_t_init(&msg);
371
req.job_id = acct_job_id;
372
req.job_step_id = NO_VAL;
373
req.signal = SIGKILL;
375
msg.msg_type = REQUEST_CANCEL_JOB_STEP;
378
slurm_send_only_controller_msg(&msg);
381
/* _get_process_data_line() - get line of data from /proc/<pid>/stat
383
* IN: in - input file descriptor
384
* OUT: prec - the destination for the data
386
* RETVAL: ==0 - no valid data
387
* !=0 - data are valid
389
* Based upon stat2proc() from the ps command. It can handle arbitrary executable
390
* file basenames for `cmd', i.e. those with embedded whitespace or embedded ')'s.
391
* Such names confuse %s (see scanf(3)), so the string is split and %39c is used
392
* instead. (except for embedded ')' "(%[^)]c)" would work.
394
static int _get_process_data_line(int in, prec_t *prec) {
395
char sbuf[256], *tmp;
397
char cmd[40], state[1];
398
int ppid, pgrp, session, tty_nr, tpgid;
399
long unsigned flags, minflt, cminflt, majflt, cmajflt;
400
long unsigned utime, stime, starttime, vsize;
401
long int cutime, cstime, priority, nice, timeout, itrealvalue, rss;
403
num_read = read(in, sbuf, (sizeof(sbuf) - 1));
406
sbuf[num_read] = '\0';
408
tmp = strrchr(sbuf, ')'); /* split into "PID (cmd" and "<rest>" */
409
*tmp = '\0'; /* replace trailing ')' with NUL */
410
/* parse these two strings separately, skipping the leading "(". */
411
nvals = sscanf(sbuf, "%d (%39c", &prec->pid, cmd);
415
nvals = sscanf(tmp + 2, /* skip space after ')' too */
417
"%lu %lu %lu %lu %lu "
418
"%lu %lu %ld %ld %ld %ld "
419
"%ld %ld %lu %lu %ld",
420
state, &ppid, &pgrp, &session, &tty_nr, &tpgid,
421
&flags, &minflt, &cminflt, &majflt, &cmajflt,
422
&utime, &stime, &cutime, &cstime, &priority, &nice,
423
&timeout, &itrealvalue, &starttime, &vsize, &rss);
424
/* There are some additional fields, which we do not scan or use */
425
if ((nvals < 22) || (rss < 0))
428
/* Copy the values that slurm records into our data structure */
430
prec->pages = majflt;
433
prec->vsize = vsize / 1024; /* convert from bytes to KB */
434
prec->rss = rss * getpagesize() / 1024; /* convert from pages to KB */
438
/*
 * _task_sleep() - sleep for rem seconds, resuming after an interruption
 * until the whole interval has elapsed.
 *
 * IN: rem - seconds to sleep; zero or negative values return at once
 *           (a negative value would otherwise turn into a huge unsigned
 *           sleep request).
 *
 * NOTE(review): the loop header was missing from the damaged listing and
 * is restored from the "rem = sleep(rem)" statement; confirm.
 */
static void _task_sleep(int rem)
{
	while (rem > 0)
		rem = sleep(rem);	/* subject to interrupt */
}
444
/* _watch_tasks() -- monitor slurm jobs and track their memory usage
 *
 * IN, OUT:	Irrelevant; this is invoked by pthread_create()
 */
449
/*
 * Polling-thread body; jobacct_gather_p_startpoll() hands its address to
 * pthread_create().  Until jobacct_shutdown is set it calls
 * _get_process_data() on each pass during which jobacct_suspended is clear.
 *
 * NOTE(review): this listing is damaged -- the bare numbers are line-number
 * residue and the short lines (braces, presumably the sleep calls that pace
 * the loop, the return) are missing, so the polling interval cannot be read
 * from here.  Restore them from the project history before building.
 */
static void *_watch_tasks(void *arg)
451
/* Give chance for processes to spawn before starting
452
* the polling. This should largely eliminate the
453
* chance of having /proc open when the tasks are
454
* spawned, which would prevent a valid checkpoint/restart
455
* with some systems */
458
while(!jobacct_shutdown) { /* Do this until shutdown is requested */
459
if(!jobacct_suspended) {
460
_get_process_data(); /* Update the data */
468
static void _destroy_prec(void *object)
470
prec_t *prec = (prec_t *)object;
476
* init() is called when the plugin is loaded, before any other functions
477
* are called. Put global initialization here.
479
extern int init ( void )
481
char *temp = slurm_get_proctrack_type();
482
if(!strcasecmp(temp, "proctrack/pgid")) {
483
info("WARNING: We will use a much slower algorithm with "
484
"proctrack/pgid, use Proctracktype=proctrack/linuxproc "
485
"or Proctracktype=proctrack/rms with %s",
490
temp = slurm_get_accounting_storage_type();
491
if(!strcasecmp(temp, ACCOUNTING_STORAGE_TYPE_NONE)) {
492
error("WARNING: Even though we are collecting accounting "
493
"information you have asked for it not to be stored "
494
"(%s) if this is not what you have in mind you will "
495
"need to change it.", ACCOUNTING_STORAGE_TYPE_NONE);
498
verbose("%s loaded", plugin_name);
499
return SLURM_SUCCESS;
502
extern int fini ( void )
504
return SLURM_SUCCESS;
507
/* jobacct_gather_p_create() - forwards jobacct_id to
 * jobacct_common_alloc_jobacct() and returns the record it produces. */
extern struct jobacctinfo *jobacct_gather_p_create(jobacct_id_t *jobacct_id)
{
	return jobacct_common_alloc_jobacct(jobacct_id);
}
512
/* jobacct_gather_p_destroy() - hands jobacct to
 * jobacct_common_free_jobacct(). */
extern void jobacct_gather_p_destroy(struct jobacctinfo *jobacct)
{
	jobacct_common_free_jobacct(jobacct);
}
517
extern int jobacct_gather_p_setinfo(struct jobacctinfo *jobacct,
518
enum jobacct_data_type type, void *data)
520
return jobacct_common_setinfo(jobacct, type, data);
524
extern int jobacct_gather_p_getinfo(struct jobacctinfo *jobacct,
525
enum jobacct_data_type type, void *data)
527
return jobacct_common_getinfo(jobacct, type, data);
530
/* jobacct_gather_p_pack() - forwards jobacct and buffer to
 * jobacct_common_pack(). */
extern void jobacct_gather_p_pack(struct jobacctinfo *jobacct, Buf buffer)
{
	jobacct_common_pack(jobacct, buffer);
}
535
/* jobacct_gather_p_unpack() - forwards to jobacct_common_unpack() and
 * returns its result unchanged. */
extern int jobacct_gather_p_unpack(struct jobacctinfo **jobacct, Buf buffer)
{
	return jobacct_common_unpack(jobacct, buffer);
}
540
/* jobacct_gather_p_aggregate() - forwards dest and from to
 * jobacct_common_aggregate(). */
extern void jobacct_gather_p_aggregate(struct jobacctinfo *dest,
				       struct jobacctinfo *from)
{
	jobacct_common_aggregate(dest, from);
}
547
/*
 * jobacct_gather_p_startpoll() is called when the plugin is loaded by
 * slurmd, before any other functions are called.  Put global
 * initialization here.
 */
552
/*
 * Start dynamic monitoring: clears jobacct_shutdown, creates task_list and
 * launches _watch_tasks() in a detached thread.  A frequency of 0 takes the
 * "dynamic logging disabled" branch instead.
 *
 * IN: frequency - requested polling frequency (logged below).
 *
 * NOTE(review): this listing is damaged -- the bare numbers are line-number
 * residue and the short lines are missing (braces, the "attr" declaration,
 * the returns, the tail of the thread-creation failure message, and
 * whatever stores the frequency for the polling thread).  Restore them
 * from the project history before building.
 */
extern int jobacct_gather_p_startpoll(uint16_t frequency)
554
int rc = SLURM_SUCCESS;
557
pthread_t _watch_tasks_thread_id;
559
debug("%s loaded", plugin_name);
561
debug("jobacct-gather: frequency = %d", frequency);
563
jobacct_shutdown = false;
565
if (frequency == 0) { /* don't want dynamic monitoring? */
566
debug2("jobacct-gather LINUX dynamic logging disabled");
571
/* Task records are released through jobacct_common_free_jobacct(). */
task_list = list_create(jobacct_common_free_jobacct);
573
/* create polling thread */
574
slurm_attr_init(&attr);
575
if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
576
error("pthread_attr_setdetachstate error %m");
578
if (pthread_create(&_watch_tasks_thread_id, &attr,
579
&_watch_tasks, NULL)) {
580
debug("jobacct-gather failed to create _watch_tasks "
585
debug3("jobacct-gather LINUX dynamic logging enabled");
586
slurm_attr_destroy(&attr);
591
extern int jobacct_gather_p_endpoll()
593
slurm_mutex_lock(&jobacct_lock);
595
list_destroy(task_list);
597
slurm_mutex_unlock(&jobacct_lock);
600
slurm_mutex_lock(&reading_mutex);
601
(void) closedir(slash_proc);
602
slurm_mutex_unlock(&reading_mutex);
605
jobacct_shutdown = true;
607
return SLURM_SUCCESS;
610
/*
 * Reacts to a changed polling frequency.
 *
 * NOTE(review): this listing is damaged -- "614" is line-number residue and
 * the lines around the assignment are missing.  Only the statement that
 * requests shutdown survives; the condition that guards it (presumably a
 * test for a frequency of 0) is not visible.  Restore it from the project
 * history before building.
 */
extern void jobacct_gather_p_change_poll(uint16_t frequency)
614
jobacct_shutdown = true;
618
/* jobacct_gather_p_suspend_poll() - forwards to
 * jobacct_common_suspend_poll(). */
extern void jobacct_gather_p_suspend_poll()
{
	jobacct_common_suspend_poll();
}
623
/* jobacct_gather_p_resume_poll() - forwards to
 * jobacct_common_resume_poll(). */
extern void jobacct_gather_p_resume_poll()
{
	jobacct_common_resume_poll();
}
628
/* jobacct_gather_p_set_proctrack_container_id() - forwards id to
 * jobacct_common_set_proctrack_container_id() and returns its result. */
extern int jobacct_gather_p_set_proctrack_container_id(uint32_t id)
{
	return jobacct_common_set_proctrack_container_id(id);
}
633
/* jobacct_gather_p_add_task() - forwards pid and jobacct_id to
 * jobacct_common_add_task() and returns its result. */
extern int jobacct_gather_p_add_task(pid_t pid, jobacct_id_t *jobacct_id)
{
	return jobacct_common_add_task(pid, jobacct_id);
}
639
/*
 * Returns whatever jobacct_common_stat_task() reports for pid.
 *
 * NOTE(review): this listing is damaged -- "642" is line-number residue, the
 * braces are missing, and the numbering shows one further statement was
 * lost between the header and the return (presumably a data refresh before
 * the lookup).  Restore it from the project history before building.
 */
extern struct jobacctinfo *jobacct_gather_p_stat_task(pid_t pid)
642
return jobacct_common_stat_task(pid);
645
/* jobacct_gather_p_remove_task() - forwards pid to
 * jobacct_common_remove_task() and returns the record it hands back. */
extern struct jobacctinfo *jobacct_gather_p_remove_task(pid_t pid)
{
	return jobacct_common_remove_task(pid);
}
650
/* jobacct_gather_p_2_sacct() - forwards sacct and jobacct to
 * jobacct_common_2_sacct(). */
extern void jobacct_gather_p_2_sacct(sacct_t *sacct,
				     struct jobacctinfo *jobacct)
{
	jobacct_common_2_sacct(sacct, jobacct);
}