~ubuntu-branches/ubuntu/vivid/slurm-llnl/vivid

« back to all changes in this revision

Viewing changes to src/slurmctld/trigger_mgr.c

  • Committer: Bazaar Package Importer
  • Author(s): Gennaro Oliva
  • Date: 2009-09-24 23:28:15 UTC
  • mfrom: (1.1.11 upstream) (3.2.4 sid)
  • Revision ID: james.westby@ubuntu.com-20090924232815-enh65jn32q1ebg07
Tags: 2.0.5-1
* New upstream release 
* Changed dependency from lib-mysqlclient15 to lib-mysqlclient
* Added Default-Start for runlevel 2 and 4 and $remote_fs requirement in
  init.d scripts (Closes: #541252)
* Postinst checks for incorrect links in runlevels 2 and 4
* Upgraded to standard version 3.8.3
* Add lintian overrides for missing slurm-llnl-configurator.html in doc
  base registration
* Modified postrm scripts to ignore pkill return value in order to avoid
  postrm failure when no slurm process is running
* Checking for slurmctld.pid before cancelling running and pending
  jobs during package removal 

Show diffs side-by-side

added added

removed removed

Lines of Context:
5
5
 *  Copyright (C) 2008 Lawrence Livermore National Security.
6
6
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
7
7
 *  Written by Morris Jette <jette1@llnl.gov> et. al.
8
 
 *  LLNL-CODE-402394.
 
8
 *  CODE-OCEC-09-009. All rights reserved.
9
9
 *  
10
10
 *  This file is part of SLURM, a resource management program.
11
 
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 
11
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 
12
 *  Please also read the included file: DISCLAIMER.
12
13
 *  
13
14
 *  SLURM is free software; you can redistribute it and/or modify it under
14
15
 *  the terms of the GNU General Public License as published by the Free
61
62
#include "src/slurmctld/state_save.h"
62
63
#include "src/slurmctld/trigger_mgr.h"
63
64
 
64
 
#define _DEBUG 0
65
65
#define MAX_PROG_TIME 300       /* maximum run time for program */
66
66
 
67
67
/* Change TRIGGER_STATE_VERSION value when changing the state save format */
68
68
#define TRIGGER_STATE_VERSION      "VER002"
69
69
 
70
 
/* TRIG_IS_JOB_FINI differs from IS_JOB_FINISHED by considering 
71
 
 * completing jobs as not really finished */
72
 
#define TRIG_IS_JOB_FINI(_X)             \
73
 
        (IS_JOB_FINISHED(_X) && ((_X->job_state & JOB_COMPLETING) == 0))
74
 
 
75
70
List trigger_list;
76
71
uint32_t next_trigger_id = 1;
77
72
static pthread_mutex_t trigger_mutex = PTHREAD_MUTEX_INITIALIZER;
140
135
                return "unknown";
141
136
}
142
137
 
143
 
#if _DEBUG
144
138
static int _trig_offset(uint16_t offset)
145
139
{
146
140
        static int rc;
153
147
{
154
148
        int i;
155
149
 
 
150
        if ((slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) == 0)
 
151
                return;
 
152
 
156
153
        info(header);
157
154
        if ((msg == NULL) || (msg->record_count == 0)) {
158
155
                info("Trigger has no entries");
171
168
                        msg->trigger_array[i].program);
172
169
        }
173
170
}
174
 
#else
175
 
static void _dump_trigger_msg(char *header, trigger_info_msg_t *msg)
176
 
{
177
 
}
178
 
#endif
179
171
 
180
172
/* Validate trigger program */
181
173
static bool _validate_trigger(trig_mgr_info_t *trig_in)
495
487
        if (trig_ptr->res_type == TRIGGER_RES_TYPE_JOB) {
496
488
                trig_ptr->job_id = (uint32_t) atol(trig_ptr->res_id);
497
489
                trig_ptr->job_ptr = find_job_record(trig_ptr->job_id);
498
 
                if ((trig_ptr->job_id == 0)
499
 
                ||  (trig_ptr->job_ptr == NULL)
500
 
                ||  (TRIG_IS_JOB_FINI(trig_ptr->job_ptr)))
 
490
                if ((trig_ptr->job_id == 0)     ||
 
491
                    (trig_ptr->job_ptr == NULL) ||
 
492
                    (IS_JOB_COMPLETED(trig_ptr->job_ptr)))
501
493
                        goto unpack_error;
502
494
        } else {
503
495
                trig_ptr->job_id = 0;
504
496
                trig_ptr->job_ptr = NULL;
505
 
                if ((trig_ptr->res_id != NULL)
506
 
                &&  (trig_ptr->res_id[0] != '*')
507
 
                &&  (node_name2bitmap(trig_ptr->res_id, false,
 
497
                if ((trig_ptr->res_id != NULL)   &&
 
498
                    (trig_ptr->res_id[0] != '*') &&
 
499
                    (node_name2bitmap(trig_ptr->res_id, false,
508
500
                                &trig_ptr->nodes_bitmap) != 0))
509
501
                        goto unpack_error;
510
502
        }
528
520
}
529
521
extern int trigger_state_save(void)
530
522
{
 
523
        /* Save high-water mark to avoid buffer growth with copies */
531
524
        static int high_buffer_size = (1024 * 1024);
532
525
        int error_code = 0, log_fd;
533
526
        char *old_file, *new_file, *reg_file;
590
583
                (void) unlink(new_file);
591
584
        else {                  /* file shuffle */
592
585
                (void) unlink(old_file);
593
 
                (void) link(reg_file, old_file);
 
586
                if(link(reg_file, old_file))
 
587
                        debug4("unable to create link for %s -> %s: %m",
 
588
                               reg_file, old_file);
594
589
                (void) unlink(reg_file);
595
 
                (void) link(new_file, reg_file);
 
590
                if(link(new_file, reg_file))
 
591
                        debug4("unable to create link for %s -> %s: %m",
 
592
                               new_file, reg_file);
596
593
                (void) unlink(new_file);
597
594
        }
598
595
        xfree(old_file);
684
681
 
685
682
        if ((trig_in->trig_type & TRIGGER_TYPE_FINI)
686
683
        &&  ((trig_in->job_ptr == NULL) ||
687
 
             (TRIG_IS_JOB_FINI(trig_in->job_ptr)))) {
 
684
             (IS_JOB_COMPLETED(trig_in->job_ptr)))) {
688
685
                trig_in->state = 1;
689
686
                trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
690
 
#if _DEBUG
691
 
                info("trigger[%u] event for job %u fini",
692
 
                        trig_in->trig_id, trig_in->job_id);
693
 
#endif
 
687
                if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
688
                        info("trigger[%u] event for job %u fini",
 
689
                                trig_in->trig_id, trig_in->job_id);
 
690
                }
694
691
                return;
695
692
        }
696
693
 
697
694
        if (trig_in->job_ptr == NULL) {
698
 
#if _DEBUG
699
 
                info("trigger[%u] for defunct job %u",
700
 
                        trig_in->trig_id, trig_in->job_id);
701
 
#endif
 
695
                if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
696
                        info("trigger[%u] for defunct job %u",
 
697
                                trig_in->trig_id, trig_in->job_id);
 
698
                }
702
699
                trig_in->state = 2;
703
700
                trig_in->trig_time = now;
704
701
                return;
709
706
                if (rem_time <= (0x8000 - trig_in->trig_time)) {
710
707
                        trig_in->state = 1;
711
708
                        trig_in->trig_time = now;
712
 
#if _DEBUG
713
 
                        info("trigger[%u] for job %u time",
714
 
                                trig_in->trig_id, trig_in->job_id);
715
 
#endif
 
709
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
710
                                info("trigger[%u] for job %u time",
 
711
                                        trig_in->trig_id, trig_in->job_id);
 
712
                        }
716
713
                        return;
717
714
                }
718
715
        }
721
718
                if (trigger_down_nodes_bitmap
722
719
                &&  bit_overlap(trig_in->job_ptr->node_bitmap, 
723
720
                                trigger_down_nodes_bitmap)) {
724
 
#if _DEBUG
725
 
                        info("trigger[%u] for job %u down",
726
 
                                trig_in->trig_id, trig_in->job_id);
727
 
#endif
 
721
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
722
                                info("trigger[%u] for job %u down",
 
723
                                        trig_in->trig_id, trig_in->job_id);
 
724
                        }
728
725
                        trig_in->state = 1;
729
726
                        trig_in->trig_time = now + 
730
727
                                        (trig_in->trig_time - 0x8000);
736
733
                if (trigger_fail_nodes_bitmap
737
734
                &&  bit_overlap(trig_in->job_ptr->node_bitmap, 
738
735
                                trigger_fail_nodes_bitmap)) {
739
 
#if _DEBUG
740
 
                        info("trigger[%u] for job %u node fail",
741
 
                                trig_in->trig_id, trig_in->job_id);
742
 
#endif
 
736
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
737
                                info("trigger[%u] for job %u node fail",
 
738
                                        trig_in->trig_id, trig_in->job_id);
 
739
                        }
743
740
                        trig_in->state = 1;
744
741
                        trig_in->trig_time = now + 
745
742
                                        (trig_in->trig_time - 0x8000);
754
751
                        trig_in->state = 1;
755
752
                        trig_in->trig_time = now + 
756
753
                                        (0x8000 - trig_in->trig_time);
757
 
#if _DEBUG
758
 
                        info("trigger[%u] for job %u up",
759
 
                                trig_in->trig_id, trig_in->job_id);
760
 
#endif
 
754
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
755
                                info("trigger[%u] for job %u up",
 
756
                                        trig_in->trig_id, trig_in->job_id);
 
757
                        }
761
758
                        return;
762
759
                }
763
760
        }
769
766
        &&   trigger_block_err) {
770
767
                trig_in->state = 1;
771
768
                trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
772
 
#if _DEBUG
773
 
                info("trigger[%u] for block_err", trig_in->trig_id);
774
 
#endif
 
769
                if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS)
 
770
                        info("trigger[%u] for block_err", trig_in->trig_id);
775
771
                return;
776
772
        }
777
773
 
795
791
                if (trig_in->state == 1) {
796
792
                        trig_in->trig_time = now + 
797
793
                                        (trig_in->trig_time - 0x8000);
798
 
#if _DEBUG
799
 
                        info("trigger[%u] for node %s down",
800
 
                                trig_in->trig_id, trig_in->res_id);
801
 
#endif
 
794
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
795
                                info("trigger[%u] for node %s down",
 
796
                                        trig_in->trig_id, trig_in->res_id);
 
797
                        }
802
798
                        return;
803
799
                }
804
800
        }
823
819
                if (trig_in->state == 1) {
824
820
                        trig_in->trig_time = now + 
825
821
                                        (trig_in->trig_time - 0x8000);
826
 
#if _DEBUG
827
 
                        info("trigger[%u] for node %s drained",
828
 
                                trig_in->trig_id, trig_in->res_id);
829
 
#endif
 
822
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
823
                                info("trigger[%u] for node %s drained",
 
824
                                        trig_in->trig_id, trig_in->res_id);
 
825
                        }
830
826
                        return;
831
827
                }
832
828
        }
851
847
                if (trig_in->state == 1) {
852
848
                        trig_in->trig_time = now + 
853
849
                                        (trig_in->trig_time - 0x8000);
854
 
#if _DEBUG
855
 
                        info("trigger[%u] for node %s fail",
856
 
                                trig_in->trig_id, trig_in->res_id);
857
 
#endif
 
850
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
851
                                info("trigger[%u] for node %s fail",
 
852
                                        trig_in->trig_id, trig_in->res_id);
 
853
                        }
858
854
                        return;
859
855
                }
860
856
        }
893
889
                bit_free(trigger_idle_node_bitmap);
894
890
                if (trig_in->state == 1) {
895
891
                        trig_in->trig_time = now;
896
 
#if _DEBUG
897
 
                        info("trigger[%u] for node %s idle",
898
 
                                trig_in->trig_id, trig_in->res_id);
899
 
#endif
 
892
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
893
                                info("trigger[%u] for node %s idle",
 
894
                                        trig_in->trig_id, trig_in->res_id);
 
895
                        }
900
896
                        return;
901
897
                }
902
898
        }
921
917
                if (trig_in->state == 1) {
922
918
                        trig_in->trig_time = now + 
923
919
                                        (trig_in->trig_time - 0x8000);
924
 
#if _DEBUG
925
 
                        info("trigger[%u] for node %s up",
926
 
                                trig_in->trig_id, trig_in->res_id);
927
 
#endif
 
920
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
921
                                info("trigger[%u] for node %s up",
 
922
                                        trig_in->trig_id, trig_in->res_id);
 
923
                        }
928
924
                        return;
929
925
                }
930
926
        }
935
931
                trig_in->trig_time = now + (trig_in->trig_time - 0x8000);
936
932
                xfree(trig_in->res_id);
937
933
                trig_in->res_id = xstrdup("reconfig");
938
 
#if _DEBUG
939
 
                info("trigger[%u] for reconfig", trig_in->trig_id);
940
 
#endif
 
934
                if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS)
 
935
                        info("trigger[%u] for reconfig", trig_in->trig_id);
941
936
                return;
942
937
        }
943
938
}
1049
1044
                }
1050
1045
                if ((trig_in->state == 1) &&
1051
1046
                    (trig_in->trig_time <= now)) {
1052
 
#if _DEBUG
1053
 
                        info("launching program for trigger[%u]",
1054
 
                                trig_in->trig_id);
1055
 
                        info("  uid=%u gid=%u program=%s arg=%s", 
1056
 
                                trig_in->user_id, trig_in->group_id,
1057
 
                                trig_in->program, trig_in->res_id);
1058
 
#endif
 
1047
                        if (slurm_get_debug_flags() & DEBUG_FLAG_TRIGGERS) {
 
1048
                                info("launching program for trigger[%u]",
 
1049
                                        trig_in->trig_id);
 
1050
                                info("  uid=%u gid=%u program=%s arg=%s", 
 
1051
                                        trig_in->user_id, trig_in->group_id,
 
1052
                                        trig_in->program, trig_in->res_id);
 
1053
                        }
1059
1054
                        trig_in->state = 2;
1060
1055
                        trig_in->trig_time = now;
1061
1056
                        state_change = true;
1082
1077
                        }
1083
1078
 
1084
1079
                        if (trig_in->group_id == 0) {
1085
 
#if _DEBUG
1086
 
                                info("purging trigger[%u]", trig_in->trig_id);
1087
 
#endif
 
1080
                                if (slurm_get_debug_flags() & 
 
1081
                                    DEBUG_FLAG_TRIGGERS) {
 
1082
                                        info("purging trigger[%u]", 
 
1083
                                             trig_in->trig_id);
 
1084
                                }
1088
1085
                                list_delete_item(trig_iter);
1089
1086
                                state_change = true;
1090
1087
                        }