/*****************************************************************************\
 *  proctrack_cgroup.c - process tracking via linux cgroup containers
 *****************************************************************************
 *  Copyright (C) 2009 CEA/DAM/DIF
 *  Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/
45
# include <inttypes.h>
48
#include <slurm/slurm.h>
49
#include <slurm/slurm_errno.h>
50
#include "src/common/log.h"
51
#include "src/slurmd/slurmd/slurmd.h"
53
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
55
#include <sys/types.h>
60
#include "read_config.h"
65
* These variables are required by the generic plugin interface. If they
66
* are not found in the plugin, the plugin loader will ignore it.
68
* plugin_name - a string giving a human-readable description of the
69
* plugin. There is no maximum length, but the symbol must refer to
72
* plugin_type - a string suggesting the type of the plugin or its
73
* applicability to a particular form of data or method of data handling.
74
* If the low-level plugin API is used, the contents of this string are
75
* unimportant and may be anything. SLURM uses the higher-level plugin
76
* interface which requires this string to be of the form
78
* <application>/<method>
80
* where <application> is a description of the intended application of
81
* the plugin (e.g., "jobcomp" for SLURM job completion logging) and <method>
82
* is a description of how this plugin satisfies that application. SLURM will
83
* only load job completion logging plugins if the plugin_type string has a
84
* prefix of "jobcomp/".
86
* plugin_version - an unsigned 32-bit integer giving the version number
87
* of the plugin. If major and minor revisions are desired, the major
88
* version number may be multiplied by a suitable magnitude constant such
89
* as 100 or 1000. Various SLURM versions will likely require a certain
90
* minimum version for their plugins as the job completion logging API
93
const char plugin_name[] = "Process tracking via linux cgroup";
94
const char plugin_type[] = "proctrack/cgroup";
95
const uint32_t plugin_version = 10;
101
#define CGROUP_SLURMDIR CGROUP_BASEDIR "/slurm"
103
char user_cgroup_path[PATH_MAX];
104
char job_cgroup_path[PATH_MAX];
105
char jobstep_cgroup_path[PATH_MAX];
107
int _slurm_cgroup_init()
112
/* initialize job/jobstep cgroup path */
113
user_cgroup_path[0]='\0';
114
job_cgroup_path[0]='\0';
115
jobstep_cgroup_path[0]='\0';
117
/* we first check that cgroup is mounted */
118
if ( ! xcgroup_is_available() ) {
119
if ( slurm_cgroup_conf->cgroup_automount ) {
120
if ( xcgroup_mount(slurm_cgroup_conf->
121
cgroup_mount_opts) ) {
122
error("unable to mount cgroup");
125
info("cgroup system is now mounted");
126
/* we then set the release_agent if necessary */
127
if ( slurm_cgroup_conf->cgroup_release_agent ) {
128
xcgroup_set_release_agent(slurm_cgroup_conf->
129
cgroup_release_agent);
133
error("cgroup is not mounted. aborting");
138
/* create a non releasable root cgroup for slurm usage */
143
fstatus = xcgroup_create(CGROUP_SLURMDIR,&opts);
144
if ( fstatus != SLURM_SUCCESS ) {
145
error("unable to create SLURM cgroup directory '%s'. aborting",
150
return SLURM_SUCCESS;
153
int _slurm_cgroup_create(slurmd_job_t *job,uint32_t id,uid_t uid,gid_t gid)
158
uint32_t cur_memlimit,cur_memswlimit;
160
/* build user cgroup path if no set (should not be) */
161
if ( *user_cgroup_path == '\0' ) {
162
if ( snprintf(user_cgroup_path,PATH_MAX,CGROUP_SLURMDIR
163
"/uid_%u",uid) >= PATH_MAX ) {
164
error("unable to build uid %u cgroup filepath : %m",
170
/* build job cgroup path if no set (should not be) */
171
if ( *job_cgroup_path == '\0' ) {
172
if ( snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
173
user_cgroup_path,job->jobid) >= PATH_MAX ) {
174
error("unable to build job %u cgroup filepath : %m",
180
/* build job step cgroup path (should not be) */
181
if ( *jobstep_cgroup_path == '\0' ) {
182
if ( snprintf(jobstep_cgroup_path,PATH_MAX,"%s/step_%u",
183
job_cgroup_path,job->stepid) >= PATH_MAX ) {
184
error("unable to build job step %u cgroup filepath "
190
/* create user cgroup (it could already exist) */
195
if ( xcgroup_create(user_cgroup_path,&opts)
198
if ( slurm_cgroup_conf->user_cgroup_params )
199
xcgroup_set_params(user_cgroup_path,
200
slurm_cgroup_conf->user_cgroup_params);
	/*
	 * if memory constraints have to be added to uid cgroup
	 * use_hierachy=1 must be set here, but this would result
	 * in impossibility to configure some job memory parameters
	 * differently, so skip this stage for now
	 */
/* create job cgroup (it could already exist) */
214
if ( xcgroup_create(job_cgroup_path,&opts)
218
/* job cgroup parameters must be set before any sub cgroups
220
xcgroup_set_mem_use_hierarchy(job_cgroup_path,1);
221
if ( slurm_cgroup_conf->job_cgroup_params )
222
xcgroup_set_params(job_cgroup_path,
223
slurm_cgroup_conf->job_cgroup_params);
	/*
	 * Warning: OOM Killer must be disabled for slurmstepd
	 * or it would be destroyed if the application use
	 * more memory than permitted
	 *
	 * If an env value is already set for slurmstepd
	 * OOM killer behavior, keep it, otherwise set the
	 * -17 value, which means do not let OOM killer kill it
	 *
	 * FYI, setting "export SLURMSTEPD_OOM_ADJ=-17"
	 * in /etc/sysconfig/slurm would be the same
	 */
setenv("SLURMSTEPD_OOM_ADJ","-17",0);
	/*
	 * Warning, with slurm-2.1.0 job_mem more corresponds to the
	 * missing field jobstep_mem and thus must not be
	 * trusted to set the job mem limit constraint
	 * Due to the lack of jobstep_mem field in slurm-2.1.0
	 * we only allow to extend the amount of allowed memory
	 * as a step requiring less than the max allowed amount
	 * for the job could otherwise reduce the allowed amount of other
	 * already running steps
	 * Thus, as long as a step comes with a value that is higher
	 * than the current value, we use it as it means that the
	 * job is at least authorized to use this amount
	 * In the future, a jobstep_mem field should be added
	 * to avoid this workaround and be more deterministic
	 *
	 * Unfortunately with this workaround comes a collateral problem !
	 * As we propose to alter already fixed limits for both mem and
	 * mem+swap, we have to respect a certain order while doing the
	 * modification to respect the kernel cgroup implementation
	 * requirements : when set, memory limit must be lower or equal
	 * to memory+swap limit
	 *
	 * Notes : a limit value of -1 means that the limit was not
	 * set
	 * Notes : this whole part should be much simpler when
	 * the jobstep_mem field will be added
	 */
270
* Get current limits for both mem and mem+swap
272
xcgroup_get_memlimit(job_cgroup_path,&cur_memlimit);
273
xcgroup_get_memswlimit(job_cgroup_path,&cur_memswlimit);
276
* set memory constraints according to cgroup conf
278
if ( slurm_cgroup_conf->constrain_ram_space &&
279
cur_memlimit == -1 ) {
281
limit = (uint32_t) job->job_mem ;
282
limit = (uint32_t) limit *
283
( slurm_cgroup_conf->allowed_ram_space / 100.0 ) ;
284
xcgroup_set_memlimit(job_cgroup_path,limit);
286
if ( slurm_cgroup_conf->constrain_swap_space ) {
287
uint32_t limit,memlimit,swaplimit;
288
memlimit = (uint32_t) job->job_mem ;
289
swaplimit = memlimit ;
290
memlimit = (uint32_t) memlimit *
291
( slurm_cgroup_conf->allowed_ram_space / 100.0 ) ;
292
swaplimit = (uint32_t) swaplimit *
293
( slurm_cgroup_conf->allowed_swap_space / 100.0 ) ;
294
limit = memlimit + swaplimit ;
		/*
		 * if memlimit was not set in the previous block,
		 * we have to set it here or it will not be possible
		 * to set mem+swap limit as the mem limit value could be
		 * higher
		 *
		 * However, due to the restriction mentioned in the previous
		 * block (job_mem...) if a step already set it, we will
		 * have to skip this as if the new amount is bigger
		 * we will not be allowed by the kernel to set it as
		 * the mem+swap value will certainly be lower. In such
		 * scenario, we will have to set memlimit after mem+swap limit
		 * to still be clean regarding to cgroup kernel implementation
		 * ( memlimit must be lower or equal to mem+swap limit when
		 * set ). See stage 2 below...
		 */
if ( !slurm_cgroup_conf->constrain_ram_space &&
313
xcgroup_set_memlimit(job_cgroup_path,limit);
316
* for the reason why we do this, see the previous block too
319
if ( cur_memswlimit == -1 || cur_memswlimit < limit )
320
xcgroup_set_memswlimit(job_cgroup_path,limit);
322
debug3("keeping previously set mem+swap limit of %uMB"
323
" for '%s'",cur_memswlimit,job_cgroup_path);
328
if ( !slurm_cgroup_conf->constrain_ram_space &&
329
cur_memlimit != -1 ) {
332
* for the reason why we do this, see the previous
335
if ( cur_memlimit == -1 || cur_memlimit < limit )
336
xcgroup_set_memlimit(job_cgroup_path,limit);
338
debug3("keeping previously set mem limit of "
339
"%uMB for '%s'",cur_memlimit,
345
* yet an other stage 2 due to jobstep_mem lack...
346
* only used when ram_space constraint is enforced
348
if ( slurm_cgroup_conf->constrain_ram_space &&
349
cur_memlimit != -1 ) {
351
limit = (uint32_t) job->job_mem ;
352
limit = (uint32_t) limit *
353
( slurm_cgroup_conf->allowed_ram_space / 100.0 ) ;
354
if ( cur_memlimit == -1 || cur_memlimit < limit )
355
xcgroup_set_memlimit(job_cgroup_path,limit);
357
debug3("keeping previously set mem limit of "
358
"%uMB for '%s'",cur_memlimit,job_cgroup_path);
361
/* set cores constraints if required by conf */
362
if ( slurm_cgroup_conf->constrain_cores &&
363
job->job_alloc_cores ) {
365
* abstract mapping of cores in slurm must
366
* first be mapped into the machine one
369
if ( xcpuinfo_abs_to_mac(job->job_alloc_cores,&mach) !=
371
error("unable to convert abstract slurm allocated "
372
"cores '%s' into a valid machine map",
373
job->job_alloc_cores);
376
debug3("allocated cores conversion done : "
377
"%s (abstract) -> %s (machine)",
378
job->job_alloc_cores,mach);
379
xcgroup_set_cpuset_cpus(job_cgroup_path,
384
else if ( ! job->job_alloc_cores ) {
385
error("job_alloc_cores not defined for this job! ancestor's conf"
386
" will be used instead");
389
/* create the step sub cgroup (it sould not already exists) */
394
fstatus = xcgroup_create(jobstep_cgroup_path,&opts);
395
if ( fstatus != XCGROUP_SUCCESS ) {
396
rmdir(job_cgroup_path);
400
/* set jobstep cgroup parameters */
401
if ( slurm_cgroup_conf->jobstep_cgroup_params )
402
xcgroup_set_params(jobstep_cgroup_path,
403
slurm_cgroup_conf->jobstep_cgroup_params);
408
int _slurm_cgroup_destroy(void)
410
if ( jobstep_cgroup_path[0] != '\0' )
411
xcgroup_destroy(jobstep_cgroup_path);
413
if ( job_cgroup_path[0] != '\0' )
414
xcgroup_destroy(job_cgroup_path);
416
if ( user_cgroup_path[0] != '\0' )
417
xcgroup_destroy(user_cgroup_path);
419
return SLURM_SUCCESS;
422
/*
 * Attach the given pids to the current job step cgroup.
 *
 * Returns SLURM_ERROR if the step cgroup has not been created yet
 * (empty jobstep_cgroup_path), otherwise the result of xcgroup_add_pids.
 *
 * NOTE(review): the guard's error return was lost in extraction and is
 * reconstructed here to match the sibling _slurm_cgroup_* wrappers —
 * confirm against upstream.
 */
int _slurm_cgroup_add_pids(uint32_t id,pid_t* pids,int npids)
{
	if ( *jobstep_cgroup_path == '\0' )
		return SLURM_ERROR;

	return xcgroup_add_pids(jobstep_cgroup_path,pids,npids);
}
431
/*
 * Retrieve the list of pids contained in the current job step cgroup.
 * On success, *pids points to an allocated array of *npids entries
 * (ownership per xcgroup_get_pids contract).
 *
 * Returns SLURM_ERROR if the step cgroup has not been created yet
 * (empty jobstep_cgroup_path), otherwise the result of xcgroup_get_pids.
 *
 * NOTE(review): return type line and guard's error return reconstructed
 * after extraction loss, matching the sibling wrappers — confirm upstream.
 */
int
_slurm_cgroup_get_pids(uint32_t id, pid_t **pids, int *npids)
{
	if ( *jobstep_cgroup_path == '\0' )
		return SLURM_ERROR;

	return xcgroup_get_pids(jobstep_cgroup_path,pids,npids);
}
439
/*
 * Set the memory limit (in MB, per xcgroup_set_memlimit contract —
 * TODO confirm units) on the current job step cgroup.
 *
 * Returns SLURM_ERROR if the step cgroup has not been created yet
 * (empty jobstep_cgroup_path), otherwise the result of
 * xcgroup_set_memlimit.
 *
 * NOTE(review): guard's error return reconstructed after extraction
 * loss, matching the sibling wrappers.
 */
int _slurm_cgroup_set_memlimit(uint32_t id,uint32_t memlimit)
{
	if ( *jobstep_cgroup_path == '\0' )
		return SLURM_ERROR;

	return xcgroup_set_memlimit(jobstep_cgroup_path,memlimit);
}
447
/*
 * Set the memory+swap limit on the current job step cgroup.
 *
 * Returns SLURM_ERROR if the step cgroup has not been created yet
 * (empty jobstep_cgroup_path), otherwise the result of
 * xcgroup_set_memswlimit.
 *
 * NOTE(review): guard's error return reconstructed after extraction
 * loss, matching the sibling wrappers.
 */
int _slurm_cgroup_set_memswlimit(uint32_t id,uint32_t memlimit)
{
	if ( *jobstep_cgroup_path == '\0' )
		return SLURM_ERROR;

	return xcgroup_set_memswlimit(jobstep_cgroup_path,memlimit);
}
456
_slurm_cgroup_find_by_pid(uint32_t* pcont_id, pid_t pid)
461
char cpath[PATH_MAX];
464
fstatus = xcgroup_find_by_pid(cpath,pid);
465
if ( fstatus != SLURM_SUCCESS )
468
token = rindex(cpath,'/');
469
if ( token == NULL ) {
470
debug3("pid %u cgroup '%s' does not match %s cgroup pattern",
471
pid,cpath,plugin_type);
475
rc = sscanf(token,"/%u",&cont_id);
477
if ( pcont_id != NULL )
479
fstatus = SLURM_SUCCESS;
482
fstatus = SLURM_ERROR;
489
* init() is called when the plugin is loaded, before any other functions
490
* are called. Put global initialization here.
492
extern int init ( void )
494
/* read cgroup configuration */
495
if ( read_slurm_cgroup_conf() )
498
/* initialize cpuinfo internal data */
499
if ( xcpuinfo_init() != XCPUINFO_SUCCESS ) {
500
free_slurm_cgroup_conf();
504
/* initialize cgroup internal data */
505
if ( _slurm_cgroup_init() != SLURM_SUCCESS ) {
507
free_slurm_cgroup_conf();
511
return SLURM_SUCCESS;
514
extern int fini ( void )
516
_slurm_cgroup_destroy();
518
free_slurm_cgroup_conf();
519
return SLURM_SUCCESS;
523
* Uses slurmd job-step manager's pid as the unique container id.
525
extern int slurm_container_create ( slurmd_job_t *job )
529
/* create a new cgroup for that container */
530
fstatus = _slurm_cgroup_create(job,(uint32_t)job->jmgr_pid,
535
/* set the cgroup paths to adhoc env variables */
536
env_array_overwrite(&job->env,"SLURM_JOB_CGROUP",
538
env_array_overwrite(&job->env,"SLURM_STEP_CGROUP",
539
jobstep_cgroup_path);
541
/* add slurmstepd pid to this newly created container */
542
fstatus = _slurm_cgroup_add_pids((uint32_t)job->jmgr_pid,
545
_slurm_cgroup_destroy();
549
/* we use slurmstepd pid as the identifier of the container
550
* the corresponding cgroup could be found using
551
* _slurm_cgroup_find_by_pid */
552
job->cont_id = (uint32_t)job->jmgr_pid;
554
return SLURM_SUCCESS;
557
/*
 * Add a single process to the job step container identified by
 * job->cont_id (thin wrapper over _slurm_cgroup_add_pids).
 */
extern int slurm_container_add ( slurmd_job_t *job, pid_t pid )
{
	return _slurm_cgroup_add_pids(job->cont_id,&pid,1);
}
562
extern int slurm_container_signal ( uint32_t id, int signal )
568
if ( _slurm_cgroup_get_pids(id,&pids,&npids) !=
570
error("unable to get pids list for cont_id=%u",id);
574
for ( i = 0 ; i<npids ; i++ ) {
575
/* do not kill slurmstepd */
576
if ( pids[i] != id ) {
577
debug2("killing process %d with signal %d",
579
kill(pids[i],signal);
585
return SLURM_SUCCESS;
588
/*
 * Destroy the container: remove the step/job/user cgroup directories.
 * The return value of _slurm_cgroup_destroy is deliberately ignored
 * here; this entry point always reports SLURM_SUCCESS.
 */
extern int slurm_container_destroy ( uint32_t id )
{
	_slurm_cgroup_destroy();
	return SLURM_SUCCESS;
}
594
extern uint32_t slurm_container_find(pid_t pid)
597
_slurm_cgroup_find_by_pid(&cont_id,pid);
601
extern bool slurm_container_has_pid(uint32_t cont_id, pid_t pid)
606
fstatus = _slurm_cgroup_find_by_pid(&lid,pid);
607
if ( fstatus != SLURM_SUCCESS )
610
if ( lid == cont_id )
617
extern int slurm_container_wait(uint32_t cont_id)
621
if (cont_id == 0 || cont_id == 1) {
626
/* Spin until the container is successfully destroyed */
627
while (slurm_container_destroy(cont_id) != SLURM_SUCCESS) {
628
slurm_container_signal(cont_id, SIGKILL);
633
error("Unable to destroy container %u", cont_id);
637
return SLURM_SUCCESS;
640
/*
 * Return the pids belonging to the given container (thin wrapper over
 * _slurm_cgroup_get_pids; see that function for the *pids ownership
 * contract).
 */
extern int slurm_container_get_pids(uint32_t cont_id, pid_t **pids, int *npids)
{
	return _slurm_cgroup_get_pids(cont_id,pids,npids);
}