2
2
* src/slurmd/slurmstepd/req.c - slurmstepd domain socket request handling
3
3
*****************************************************************************
4
4
* Copyright (C) 2005-2007 The Regents of the University of California.
5
* Copyright (C) 2008 Lawrence Livermore National Security.
5
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
6
6
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7
7
* Written by Christopher Morrone <morrone2@llnl.gov>
8
* CODE-OCEC-09-009. All rights reserved.
10
10
* This file is part of SLURM, a resource management program.
11
* For details, see <http://www.llnl.gov/linux/slurm/>.
11
* For details, see <https://computing.llnl.gov/linux/slurm/>.
12
* Please also read the included file: DISCLAIMER.
13
14
* SLURM is free software; you can redistribute it and/or modify it under
14
15
* the terms of the GNU General Public License as published by the Free
227
228
int rc = SLURM_SUCCESS, retries = 0;
229
230
fd = _domain_socket_create(conf->spooldir, conf->node_name,
230
job->jobid, job->stepid);
231
job->jobid, job->stepid);
232
233
return SLURM_ERROR;
799
800
_handle_checkpoint_tasks(int fd, slurmd_job_t *job, uid_t uid)
801
static time_t last_timestamp = 0;
802
802
int rc = SLURM_SUCCESS;
804
803
time_t timestamp;
805
char *image_dir = NULL;
806
807
debug3("_handle_checkpoint_tasks for job %u.%u",
807
808
job->jobid, job->stepid);
809
safe_read(fd, &signal, sizeof(int));
810
810
safe_read(fd, &timestamp, sizeof(time_t));
811
safe_read(fd, &len, sizeof(int));
813
image_dir = xmalloc (len);
814
safe_read(fd, image_dir, len); /* '\0' terminated */
812
817
debug3(" uid = %d", uid);
813
818
if (uid != job->uid && !_slurm_authorized_user(uid)) {
814
debug("checkpoint req from uid %ld for job %u.%u owned by uid %ld",
819
debug("checkpoint req from uid %ld for job %u.%u "
815
821
(long)uid, job->jobid, job->stepid, (long)job->uid);
820
if (timestamp == last_timestamp) {
821
debug("duplicate checkpoint req for job %u.%u, timestamp %ld. discarded.",
826
if (job->ckpt_timestamp &&
827
timestamp == job->ckpt_timestamp) {
828
debug("duplicate checkpoint req for job %u.%u, "
829
"timestamp %ld. discarded.",
822
830
job->jobid, job->stepid, (long)timestamp);
823
831
rc = ESLURM_ALREADY_DONE; /* EINPROGRESS? */
847
/* TODO: send timestamp with signal */
848
if (killpg(job->pgid, signal) == -1) {
849
rc = -1; /* Most probable ESRCH, resulting in ESLURMD_JOB_NOTRUNNING */
850
verbose("Error sending signal %d to %u.%u, pgid %d, errno: %d: %s",
851
signal, job->jobid, job->stepid, job->pgid,
852
errno, slurm_strerror(rc));
855
/* set timestamp in case another request comes */
856
job->ckpt_timestamp = timestamp;
858
/* TODO: do we need job->ckpt_dir any more, except for checkpoint/xlch? */
859
/* if (! image_dir) { */
860
/* image_dir = xstrdup(job->ckpt_dir); */
863
/* call the plugin to send the request */
864
if (checkpoint_signal_tasks(job, image_dir) != SLURM_SUCCESS) {
866
verbose("Error sending checkpoint request to %u.%u: %s",
867
job->jobid, job->stepid, slurm_strerror(rc));
854
last_timestamp = timestamp;
855
verbose("Sent signal %d to %u.%u, pgid %d",
856
signal, job->jobid, job->stepid, job->pgid);
869
verbose("Sent checkpoint request to %u.%u",
870
job->jobid, job->stepid);
858
873
pthread_mutex_unlock(&suspend_mutex);