~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid

« back to all changes in this revision

Viewing changes to src/slurmd/slurmstepd/req.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release 
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient
* Added Default-Start for runlevel 2 and 4 and $remote_fs requirement in
  init.d scripts (Closes: #541252)
* Postinst checks for wrong runlevels 2 and 4 links
* Upgraded to Standards-Version 3.8.3
* Add lintian overrides for missing slurm-llnl-configurator.html in doc
  base registration
* Modified postrm scripts to ignore pkill return value in order to avoid
  postrm failure when no slurm process is running
* Checking for slurmctld.pid before cancelling running and pending
  jobs during package removal 

Show diffs side-by-side

added added

removed removed

Lines of Context:
2
2
 *  src/slurmd/slurmstepd/req.c - slurmstepd domain socket request handling
3
3
 *****************************************************************************
4
4
 *  Copyright (C) 2005-2007 The Regents of the University of California.
5
 
 *  Copyright (C) 2008 Lawrence Livermore National Security.
 
5
 *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
6
6
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7
7
 *  Written by Christopher Morrone <morrone2@llnl.gov>
8
 
 *  LLNL-CODE-402394.
 
8
 *  CODE-OCEC-09-009. All rights reserved.
9
9
 *  
10
10
 *  This file is part of SLURM, a resource management program.
11
 
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 
11
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 
12
 *  Please also read the included file: DISCLAIMER.
12
13
 *  
13
14
 *  SLURM is free software; you can redistribute it and/or modify it under
14
15
 *  the terms of the GNU General Public License as published by the Free
227
228
        int rc = SLURM_SUCCESS, retries = 0;
228
229
        errno = 0;
229
230
        fd = _domain_socket_create(conf->spooldir, conf->node_name,
230
 
                                  job->jobid, job->stepid);
 
231
                                   job->jobid, job->stepid);
231
232
        if (fd == -1)
232
233
                return SLURM_ERROR;
233
234
 
798
799
static int
799
800
_handle_checkpoint_tasks(int fd, slurmd_job_t *job, uid_t uid)
800
801
{
801
 
        static time_t last_timestamp = 0;
802
802
        int rc = SLURM_SUCCESS;
803
 
        int signal;
804
803
        time_t timestamp;
 
804
        int len;
 
805
        char *image_dir = NULL;
805
806
 
806
807
        debug3("_handle_checkpoint_tasks for job %u.%u",
807
808
               job->jobid, job->stepid);
808
809
 
809
 
        safe_read(fd, &signal, sizeof(int));
810
810
        safe_read(fd, &timestamp, sizeof(time_t));
 
811
        safe_read(fd, &len, sizeof(int));
 
812
        if (len) {
 
813
                image_dir = xmalloc (len);
 
814
                safe_read(fd, image_dir, len); /* '\0' terminated */
 
815
        }
811
816
 
812
817
        debug3("  uid = %d", uid);
813
818
        if (uid != job->uid && !_slurm_authorized_user(uid)) {
814
 
                debug("checkpoint req from uid %ld for job %u.%u owned by uid %ld",
 
819
                debug("checkpoint req from uid %ld for job %u.%u "
 
820
                      "owned by uid %ld",
815
821
                      (long)uid, job->jobid, job->stepid, (long)job->uid);
816
822
                rc = EPERM;
817
823
                goto done;
818
824
        }
819
825
 
820
 
        if (timestamp == last_timestamp) {
821
 
                debug("duplicate checkpoint req for job %u.%u, timestamp %ld. discarded.",
 
826
        if (job->ckpt_timestamp &&
 
827
            timestamp == job->ckpt_timestamp) {
 
828
                debug("duplicate checkpoint req for job %u.%u, "
 
829
                      "timestamp %ld. discarded.",
822
830
                      job->jobid, job->stepid, (long)timestamp);
823
831
                rc = ESLURM_ALREADY_DONE; /* EINPROGRESS? */
824
832
                goto done;
844
852
               goto done;
845
853
       }
846
854
 
847
 
       /* TODO: send timestamp with signal */
848
 
       if (killpg(job->pgid, signal) == -1) {
849
 
               rc = -1;        /* Most probable ESRCH, resulting in ESLURMD_JOB_NOTRUNNING */
850
 
               verbose("Error sending signal %d to %u.%u, pgid %d, errno: %d: %s",
851
 
                       signal, job->jobid, job->stepid, job->pgid,
852
 
                       errno, slurm_strerror(rc));
 
855
       /* set timestamp in case another request comes */
 
856
       job->ckpt_timestamp = timestamp;
 
857
 
 
858
       /* TODO: do we need job->ckpt_dir any more, except for checkpoint/xlch? */
 
859
/*      if (! image_dir) { */
 
860
/*              image_dir = xstrdup(job->ckpt_dir); */
 
861
/*      } */
 
862
       
 
863
       /* call the plugin to send the request */
 
864
       if (checkpoint_signal_tasks(job, image_dir) != SLURM_SUCCESS) {
 
865
               rc = -1;
 
866
               verbose("Error sending checkpoint request to %u.%u: %s",
 
867
                     job->jobid, job->stepid, slurm_strerror(rc));
853
868
       } else {
854
 
               last_timestamp = timestamp;
855
 
               verbose("Sent signal %d to %u.%u, pgid %d",
856
 
                       signal, job->jobid, job->stepid, job->pgid);
 
869
               verbose("Sent checkpoint request to %u.%u",
 
870
                       job->jobid, job->stepid);
857
871
       }
 
872
 
858
873
       pthread_mutex_unlock(&suspend_mutex);
859
874
 
860
875
done: