2
* Local Resource Manager Daemon
4
* Author: Huang Zhen <zhenhltc@cn.ibm.com>
5
* Partly contributed by Andrew Beekhof <andrew@beekhof.net>
6
* Copyright (c) 2004 International Business Machines
8
* This program is free software; you can redistribute it and/or
9
* modify it under the terms of the GNU General Public
10
* License as published by the Free Software Foundation; either
11
* version 2 of the License, or (at your option) any later version.
13
* This software is distributed in the hope that it will be useful,
14
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
* General Public License for more details.
18
* You should have received a copy of the GNU General Public
19
* License along with this library; if not, write to the Free Software
20
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
#include <lha_internal.h>
31
#include <sys/types.h>
40
#include <pils/plugin.h>
41
#include <pils/generic.h>
42
#include <clplumbing/GSource.h>
43
#include <clplumbing/lsb_exitcodes.h>
44
#include <clplumbing/cl_signal.h>
45
#include <clplumbing/proctrack.h>
46
#include <clplumbing/coredumps.h>
47
#include <clplumbing/uids.h>
48
#include <clplumbing/Gmain_timeout.h>
49
#include <clplumbing/cl_pidfile.h>
50
#include <clplumbing/realtime.h>
55
/* #include <hb_api.h> */
57
#include <lrm/lrm_api.h>
58
#include <lrm/lrm_msg.h>
59
#include <lrm/raexec.h>
62
#include <lrmd_fdecl.h>
64
static gboolean in_alloc_dump = FALSE;
66
ProcTrack_ops ManagedChildTrackOps = {
68
on_ra_proc_registered,
72
/* msg dispatch table */
73
/*
 * Signature of a request handler: it is given the client that sent the
 * request and the received message. on_receive_cmd() keeps the returned
 * int and, for entries marked REPLY_NOW, passes it to send_ret_msg().
 * NOTE(review): presumably the type of the handler field in
 * struct msg_map -- confirm against the struct definition.
 */
typedef int (*msg_handler)(lrmd_client_t* client, struct ha_msg* msg);
82
* two ways to handle replies:
83
* REPLY_NOW: pack whatever the handler returned and send it
84
* NO_MSG: the handler will send the reply itself
88
/*
 * True when the dispatch-table entry pointed to by p asks for the
 * handler's return code to be sent back immediately (its reply_time is
 * REPLY_NOW). The argument is parenthesized so that any pointer
 * expression (e.g. "table + i") can be passed safely.
 */
#define send_msg_now(p) \
	((p)->reply_time == REPLY_NOW)
91
struct msg_map msg_maps[] = {
92
{REGISTER, REPLY_NOW, on_msg_register},
93
{GETRSCCLASSES, NO_MSG, on_msg_get_rsc_classes},
94
{GETRSCTYPES, NO_MSG, on_msg_get_rsc_types},
95
{GETPROVIDERS, NO_MSG, on_msg_get_rsc_providers},
96
{ADDRSC, REPLY_NOW, on_msg_add_rsc},
97
{GETRSC, NO_MSG, on_msg_get_rsc},
98
{GETLASTOP, NO_MSG, on_msg_get_last_op},
99
{GETALLRCSES, NO_MSG, on_msg_get_all},
100
{DELRSC, REPLY_NOW, on_msg_del_rsc},
101
{FAILRSC, REPLY_NOW, on_msg_fail_rsc},
102
{PERFORMOP, REPLY_NOW, on_msg_perform_op},
103
{FLUSHOPS, REPLY_NOW, on_msg_flush_all},
104
{CANCELOP, REPLY_NOW, on_msg_cancel_op},
105
{GETRSCSTATE, NO_MSG, on_msg_get_state},
106
{GETRSCMETA, NO_MSG, on_msg_get_metadata},
107
{SETLRMDPARAM, REPLY_NOW, on_msg_set_lrmd_param},
108
{GETLRMDPARAM, NO_MSG, on_msg_get_lrmd_param},
110
/*
 * Number of entries in the msg_maps dispatch table. The expansion is
 * fully parenthesized so it behaves as a single value inside larger
 * expressions (e.g. "x / MSG_NR").
 */
#define MSG_NR (sizeof(msg_maps)/sizeof(msg_maps[0]))
112
GHashTable* clients = NULL; /* a GHashTable indexed by pid */
113
GHashTable* resources = NULL; /* a GHashTable indexed by rsc_id */
115
static GMainLoop* mainloop = NULL;
116
static int call_id = 1;
117
static const char* lrm_system_name = "lrmd";
118
static GHashTable * RAExecFuncs = NULL;
119
static GList* ra_class_list = NULL;
120
static gboolean shutdown_in_progress = FALSE;
121
static unsigned long apphb_interval = 2000; /* Millisecond */
122
static gboolean reg_to_apphbd = FALSE;
123
static int max_child_count = 4;
124
static int retry_interval = 1000; /* Millisecond */
125
static int child_count = 0;
133
/* define indexes into logmsg_ctrl_defs */
134
#define OP_STAYED_TOO_LONG 0
135
static struct logspam logmsg_ctrl_defs[] = {
136
{ "operation stayed too long in the queue",
137
10, 60, 120, /* max 10 messages in 60s, then delay for 120s */
138
"configuration advice: reduce operation contention "
139
"either by increasing lrmd max_children or by increasing intervals "
140
"of monitor operations"
144
#define set_fd_opts(fd,opts) do { \
146
if ((flag = fcntl(fd, F_GETFL)) >= 0) { \
147
if (fcntl(fd, F_SETFL, flag|opts) < 0) { \
148
cl_perror("%s::%d: fcntl", __FUNCTION__ \
152
cl_perror("%s::%d: fcntl", __FUNCTION__, __LINE__); \
156
static ra_pipe_op_t *
157
ra_pipe_op_new(int child_stdout, int child_stderr, lrmd_op_t * lrmd_op)
159
ra_pipe_op_t * rapop;
160
lrmd_rsc_t* rsc = NULL;
162
if ( NULL == lrmd_op ) {
164
, "%s:%d: lrmd_op==NULL, no need to malloc ra_pipe_op"
165
, __FUNCTION__, __LINE__);
168
rapop = calloc(sizeof(ra_pipe_op_t), 1);
169
if ( rapop == NULL) {
170
lrmd_log(LOG_ERR, "%s:%d out of memory"
171
, __FUNCTION__, __LINE__);
174
rapop->first_line_read = FALSE;
177
* There is no obvious proof yet that lrmd hangs in pipe read.
178
* Bug 475 may be a duplicate of bug 499.
179
* Anyway, testing has shown that a NONBLOCK read will
180
* obviously reduce the RA execution time (bug 553).
182
/* Let the read operation be NONBLOCK */
183
set_fd_opts(child_stdout,O_NONBLOCK);
184
set_fd_opts(child_stderr,O_NONBLOCK);
186
/* there's so much code duplication here */
187
rapop->ra_stdout_fd = child_stdout;
188
if (rapop->ra_stdout_fd <= STDERR_FILENO) {
189
lrmd_log(LOG_ERR, "%s: invalid stdout fd [%d]"
190
, __FUNCTION__, rapop->ra_stdout_fd);
192
rapop->ra_stdout_gsource = G_main_add_fd(G_PRIORITY_HIGH
193
, child_stdout, FALSE, handle_pipe_ra_stdout
194
, rapop, destroy_pipe_ra_stdout);
196
rapop->ra_stderr_fd = child_stderr;
197
if (rapop->ra_stderr_fd <= STDERR_FILENO) {
198
lrmd_log(LOG_ERR, "%s: invalid stderr fd [%d]"
199
, __FUNCTION__, rapop->ra_stderr_fd);
201
rapop->ra_stderr_gsource = G_main_add_fd(G_PRIORITY_HIGH
202
, child_stderr, FALSE, handle_pipe_ra_stderr
203
, rapop, destroy_pipe_ra_stderr);
205
rapop->lrmd_op = lrmd_op;
207
rapop->op_type = strdup(ha_msg_value(lrmd_op->msg, F_LRM_OP));
208
rapop->rsc_id = strdup(lrmd_op->rsc_id);
209
rsc = lookup_rsc(lrmd_op->rsc_id);
211
lrmd_debug(LOG_WARNING
212
, "%s::%d: the rsc (id=%s) does not exist"
213
, __FUNCTION__, __LINE__, lrmd_op->rsc_id);
214
rapop->rsc_class = NULL;
216
rapop->rsc_class = strdup(rsc->class);
223
ra_pipe_op_destroy(ra_pipe_op_t * rapop)
225
CHECK_ALLOCATED(rapop, "ra_pipe_op", );
227
if ( NULL != rapop->ra_stdout_gsource) {
228
G_main_del_fd(rapop->ra_stdout_gsource);
229
rapop->ra_stdout_gsource = NULL;
232
if ( NULL != rapop->ra_stderr_gsource) {
233
G_main_del_fd(rapop->ra_stderr_gsource);
234
rapop->ra_stderr_gsource = NULL;
237
if (rapop->ra_stdout_fd >= STDERR_FILENO) {
238
close(rapop->ra_stdout_fd);
239
rapop->ra_stdout_fd = -1;
240
}else if (rapop->ra_stdout_fd >= 0) {
241
lrmd_log(LOG_ERR, "%s: invalid stdout fd %d"
242
, __FUNCTION__, rapop->ra_stdout_fd);
244
if (rapop->ra_stderr_fd >= STDERR_FILENO) {
245
close(rapop->ra_stderr_fd);
246
rapop->ra_stderr_fd = -1;
247
}else if (rapop->ra_stderr_fd >= 0) {
248
lrmd_log(LOG_ERR, "%s: invalid stderr fd %d"
249
, __FUNCTION__, rapop->ra_stderr_fd);
251
rapop->first_line_read = FALSE;
254
free(rapop->op_type);
255
rapop->op_type = NULL;
256
free(rapop->rsc_class);
257
rapop->rsc_class = NULL;
259
if (rapop->lrmd_op != NULL) {
260
rapop->lrmd_op->rapop = NULL;
261
rapop->lrmd_op = NULL;
268
lrmd_op_destroy(lrmd_op_t* op)
270
CHECK_ALLOCATED(op, "op", );
271
--lrm_objectstats.opcount;
273
if (op->exec_pid > 1) {
275
, "%s: lingering operation process %d, op %s"
276
, __FUNCTION__, op->exec_pid, small_op_info(op));
279
lrmd_debug2(LOG_DEBUG, "%s: free the %s with address %p"
280
,__FUNCTION__, op_info(op), op);
288
if ( op->rapop != NULL ) {
289
op->rapop->lrmd_op = NULL;
292
op->first_line_ra_stdout[0] = EOS;
294
if( op->repeat_timeout_tag ) {
295
Gmain_timeout_remove(op->repeat_timeout_tag);
303
lrmd_op_t* op = (lrmd_op_t*)calloc(sizeof(lrmd_op_t),1);
306
lrmd_log(LOG_ERR, "lrmd_op_new(): out of memory when "
307
"calloc a lrmd_op_t.");
313
op->repeat_timeout_tag = 0;
315
op->first_line_ra_stdout[0] = EOS;
316
op->t_recv = time_longclock();
317
op->t_perform = zero_longclock;
318
op->t_done = zero_longclock;
319
op->t_rcchange = zero_longclock;
320
op->t_lastlogmsg = zero_longclock;
322
memset(op->killseq, 0, sizeof(op->killseq));
323
++lrm_objectstats.opcount;
328
lrmd_op_copy(const lrmd_op_t* op)
336
/* Do a "shallow" copy */
339
* Some things, like timer ids and child pids are duplicated here
340
* but can be destroyed in one copy, but kept intact
341
* in the other, to later be destroyed.
342
* This isn't a complete disaster, since the timer ids aren't
343
* pointers, but it's still untidy at the least.
344
* Be sure to take care of this situation when using this function.
346
/* Do a "deep" copy of the message structure */
348
ret->msg = ha_msg_copy(op->msg);
349
ret->rsc_id = strdup(op->rsc_id);
351
ret->first_line_ra_stdout[0] = EOS;
352
ret->repeat_timeout_tag = 0;
354
ret->t_recv = op->t_recv;
355
ret->t_perform = op->t_perform;
356
ret->t_done = op->t_done;
357
ret->t_rcchange = op->t_rcchange;
359
ret->is_cancelled = FALSE;
360
ret->weight = op->weight;
366
op_status_to_str(int op_status)
368
static char whatwasthat[25];
371
return "LRM_OP_DONE";
372
case LRM_OP_CANCELLED:
373
return "LRM_OP_CANCELLED";
375
return "LRM_OP_TIMEOUT";
376
case LRM_OP_NOTSUPPORTED:
377
return "LRM_OP_NOTSUPPORTED";
383
snprintf(whatwasthat, sizeof(whatwasthat), "UNDEFINED STATUS: %d?", op_status);
388
op_target_rc_to_str(int target)
390
static char whatwasthat[25];
399
snprintf(whatwasthat, sizeof(whatwasthat)
400
,"UNDEFINED TARGET_RC: %d", target);
405
* We need a separate function to dump out operations for
406
* debugging. Then we wouldn't have to have the code for this
407
* inline. In particular, we could then call this from on_op_done()
408
* which would shorten and simplify that code - which could use
413
/* Debug-oriented functions */
414
static gboolean debug_level_adjust(int nsig, gpointer user_data);
417
lrmd_op_dump(const lrmd_op_t* op, const char * text)
421
const char * pidstat;
422
longclock_t now = time_longclock();
424
CHECK_ALLOCATED(op, "op", );
426
|| ((kill(op->exec_pid, 0) < 0) && ESRCH == errno)) {
427
pidstat = "not running";
431
ha_msg_value_int(op->msg, F_LRM_OPSTATUS, &op_status);
432
ha_msg_value_int(op->msg, F_LRM_TARGETRC, &target_rc);
434
, "%s: lrmd_op: %s status: %s, target_rc=%s, client pid %d call_id"
435
": %d, child pid: %d (%s) %s %s"
436
, text, op_info(op), op_status_to_str(op_status)
437
, op_target_rc_to_str(target_rc)
438
, op->client_id, op->call_id, op->exec_pid, pidstat
439
, (op->is_copy ? "copy" : "original")
440
, (op->is_cancelled ? "cancelled" : ""));
442
, "%s: lrmd_op2: rt_tag: %d, interval: %d, delay: %d"
443
, text, op->repeat_timeout_tag
444
, op->interval, op->delay);
446
, "%s: lrmd_op3: t_recv: %ldms, t_add: %ldms"
447
", t_perform: %ldms, t_done: %ldms, t_rcchange: %ldms"
448
, text, tm2age(op->t_recv), tm2age(op->t_addtolist)
449
, tm2age(op->t_perform), tm2age(op->t_done), tm2age(op->t_rcchange));
450
lrmd_rsc_dump(op->rsc_id, text);
455
lrmd_client_destroy(lrmd_client_t* client)
457
CHECK_ALLOCATED(client, "client", );
459
--lrm_objectstats.clientcount;
461
* Delete direct references to this client
462
* and repeating operations it might have scheduled
464
unregister_client(client);
465
if (client->app_name) {
466
free(client->app_name);
467
client->app_name = NULL;
472
static lrmd_client_t*
473
lrmd_client_new(void)
475
lrmd_client_t* client;
476
client = calloc(sizeof(lrmd_client_t), 1);
477
if (client == NULL) {
478
lrmd_log(LOG_ERR, "lrmd_client_new(): out of memory when "
479
"calloc lrmd_client_t.");
482
client->g_src = NULL;
483
client->g_src_cbk = NULL;
484
++lrm_objectstats.clientcount;
488
lrmd_client_dump(gpointer key, gpointer value, gpointer user_data)
490
lrmd_client_t * client = (lrmd_client_t*)value;
491
CHECK_ALLOCATED(client, "client", );
496
lrmd_debug(LOG_DEBUG, "client name: %s, client pid: %d"
497
", client uid: %d, gid: %d, last request: %s"
498
", last op in: %s, lastop out: %s"
500
, lrm_str(client->app_name)
502
, client->uid, client->gid
503
, client->lastrequest
504
, ctime(&client->lastreqstart)
505
, ctime(&client->lastreqend)
506
, ctime(&client->lastrcsent)
508
if (!client->ch_cmd) {
509
lrmd_debug(LOG_DEBUG, "NULL client ch_cmd in %s()", __FUNCTION__);
512
, "Command channel status: %d, read queue addr: %p, write queue addr: %p"
513
, client->ch_cmd->ch_status
514
, client->ch_cmd->recv_queue
515
, client->ch_cmd->send_queue );
517
if (client->ch_cmd->recv_queue && client->ch_cmd->send_queue) {
518
lrmd_debug(LOG_DEBUG, "read Qlen: %ld, write Qlen: %ld"
519
, (long)client->ch_cmd->recv_queue->current_qlen
520
, (long)client->ch_cmd->send_queue->current_qlen);
523
if (!client->ch_cbk) {
524
lrmd_debug(LOG_DEBUG, "NULL client ch_cbk in %s()", __FUNCTION__);
527
, "Callback channel status: %d, read Qlen: %ld, write Qlen: %ld"
528
, client->ch_cbk->ch_status
529
, (long)client->ch_cbk->recv_queue->current_qlen
530
, (long)client->ch_cbk->send_queue->current_qlen);
534
lrmd_dump_all_clients(void)
536
static gboolean incall = FALSE;
544
lrmd_debug(LOG_DEBUG, "%d clients connected to lrmd"
545
, g_hash_table_size(clients));
547
g_hash_table_foreach(clients, lrmd_client_dump, NULL);
552
lrmd_rsc_destroy(lrmd_rsc_t* rsc)
555
CHECK_ALLOCATED(rsc, "resource", );
556
--lrm_objectstats.rsccount;
557
if( rsc->op_list || rsc->repeat_op_list ) {
558
lrmd_log(LOG_ERR, "%s: refusing to remove resource %s"
559
" which is still holding operations"
560
, __FUNCTION__, lrm_str(rsc->id));
563
lrmd_debug(LOG_DEBUG, "%s: removing resource %s"
564
, __FUNCTION__, lrm_str(rsc->id));
566
g_hash_table_remove(resources, rsc->id);
581
rsc->provider = NULL;
583
if (NULL != rsc->params) {
584
free_str_table(rsc->params);
587
if (rsc->last_op_table) {
588
g_hash_table_foreach_remove(rsc->last_op_table
589
, free_str_hash_pair, NULL);
590
g_hash_table_destroy(rsc->last_op_table);
591
rsc->last_op_table = NULL;
593
if (rsc->last_op_done) {
594
lrmd_op_destroy(rsc->last_op_done);
595
rsc->last_op_done = NULL;
598
if (rsc->delay_timeout > 0) {
599
Gmain_timeout_remove(rsc->delay_timeout);
600
rsc->delay_timeout = (guint)0;
608
lrmd_rsc_new(const char * id, struct ha_msg* msg)
611
rsc = (lrmd_rsc_t *)calloc(sizeof(lrmd_rsc_t),1);
613
lrmd_log(LOG_ERR, "%s: out of memory when calloc "
614
"a lrmd_rsc_t", __FUNCTION__);
617
rsc->delay_timeout = (guint)0;
619
rsc->id = strdup(id);
622
rsc->type = strdup(ha_msg_value(msg, F_LRM_RTYPE));
623
rsc->class = strdup(ha_msg_value(msg, F_LRM_RCLASS));
624
if (NULL == ha_msg_value(msg, F_LRM_RPROVIDER)) {
625
lrmd_log(LOG_NOTICE, "%s(): No %s field in message"
626
, __FUNCTION__, F_LRM_RPROVIDER);
628
rsc->provider = strdup(ha_msg_value(msg, F_LRM_RPROVIDER));
629
if (rsc->provider == NULL) {
635
|| rsc->class == NULL) {
639
g_hash_table_insert(resources, strdup(id), rsc);
640
++lrm_objectstats.rsccount;
643
lrmd_rsc_destroy(rsc); /* violated property */ /* Or so BEAM thinks :-) */
649
dump_op(gpointer key, gpointer val, gpointer data)
651
lrmd_op_t* lrmd_op = (lrmd_op_t*) val;
653
lrmd_op_dump(lrmd_op, "rsc->last_op_table");
656
dump_op_table(gpointer key, gpointer val, gpointer data)
658
GHashTable* table = (GHashTable*) val;
660
g_hash_table_foreach(table, dump_op, data);
663
lrmd_rsc_dump(char* rsc_id, const char * text)
665
static gboolean incall = FALSE;
667
lrmd_rsc_t* rsc=NULL;
670
rsc = lookup_rsc(rsc_id);
673
, "%s:%d: the rsc_id is NULL"
674
, __FUNCTION__, __LINE__);
677
CHECK_ALLOCATED(rsc, "rsc", );
682
/* Avoid infinite recursion loops... */
687
/* TODO: Dump params and last_op_table FIXME */
689
lrmd_debug(LOG_DEBUG, "%s: BEGIN resource dump", text);
690
lrmd_debug(LOG_DEBUG, "%s: resource %s/%s/%s/%s"
694
, lrm_str(rsc->class)
695
, lrm_str(rsc->provider));
697
lrmd_debug(LOG_DEBUG, "%s: rsc->op_list...", text);
698
for(oplist = g_list_first(rsc->op_list); oplist;
699
oplist = g_list_next(oplist)) {
700
lrmd_op_dump(oplist->data, "rsc->op_list");
703
lrmd_debug(LOG_DEBUG, "%s: rsc->repeat_op_list...", text);
704
for(oplist = g_list_first(rsc->repeat_op_list); oplist;
705
oplist=g_list_next(oplist)) {
706
lrmd_op_dump(oplist->data, "rsc->repeat_op_list");
709
if (rsc->last_op_done != NULL) {
710
lrmd_debug(LOG_DEBUG, "%s: rsc->last_op_done...", text);
711
lrmd_op_dump(rsc->last_op_done, "rsc->last_op_done");
714
lrmd_debug(LOG_DEBUG, "%s: rsc->last_op_done==NULL", text);
716
if (rsc->last_op_table) {
717
g_hash_table_foreach(rsc->last_op_table,dump_op_table,NULL);
720
lrmd_debug(LOG_DEBUG, "%s: rsc->last_op_table==NULL", text);
722
lrmd_debug(LOG_DEBUG, "%s: END resource dump", text);
726
dump_id_rsc_pair(gpointer key, gpointer value, gpointer user_data)
728
char* rid = (char*)key;
729
char* text = (char*)user_data;
730
lrmd_rsc_dump(rid,text);
733
lrmd_dump_all_resources(void)
735
static gboolean incall = FALSE;
736
char text[]= "lrmd_dump_all_resources";
742
lrmd_debug(LOG_DEBUG, "%d resources are managed by lrmd"
743
, g_hash_table_size(resources));
744
g_hash_table_foreach(resources, dump_id_rsc_pair, text);
751
lrm_debug_running_op(lrmd_op_t* op, const char * text)
754
lrmd_op_dump(op, text);
755
CHECK_ALLOCATED(op, "op", );
756
if (op->exec_pid >= 1) {
757
/* This really ought to use our logger
758
* So... it might not get forwarded to the central machine
759
* if you're testing with CTS -- FIXME!!!
761
snprintf(cmd, sizeof(cmd)
762
, "ps -l -f -s %d | logger -p daemon.info -t 'T/O PS:'"
764
lrmd_debug(LOG_DEBUG, "Running [%s]", cmd);
765
if (system(cmd) != 0) {
766
lrmd_log(LOG_ERR, "Running [%s] failed", cmd);
768
snprintf(cmd, sizeof(cmd)
769
, "ps axww | logger -p daemon.info -t 't/o ps:'");
770
lrmd_debug(LOG_DEBUG, "Running [%s]", cmd);
771
if (system(cmd) != 0) {
772
lrmd_log(LOG_ERR, "Running [%s] failed", cmd);
778
main(int argc, char ** argv)
780
int req_restart = TRUE;
781
int req_status = FALSE;
782
int req_stop = FALSE;
787
while ((flag = getopt(argc, argv, OPTARGS)) != EOF) {
789
case 'h': /* Help message */
790
usage(lrm_system_name, LSB_EXIT_OK);
792
case 'v': /* Debug mode, more logs*/
795
case 's': /* Status */
798
case 'k': /* Stop (kill) */
801
case 'r': /* Restart */
804
/* Register to apphbd then monitored by it */
806
reg_to_apphbd = TRUE;
808
case 'i': /* Get apphb interval */
810
apphb_interval = atoi(optarg);
824
usage(lrm_system_name, LSB_EXIT_GENERIC);
827
cl_log_set_entity(lrm_system_name);
828
cl_log_enable_stderr(debug_level?TRUE:FALSE);
829
cl_log_set_facility(HA_LOG_FACILITY);
831
/* Use logd if it's enabled by heartbeat */
832
cl_inherit_logging_environment(0);
835
return init_status(PID_FILE, lrm_system_name);
839
return init_stop(PID_FILE);
850
init_status(const char *pid_file, const char *client_name)
852
long pid = cl_read_pidfile(pid_file);
855
fprintf(stderr, "%s is running [pid: %ld]\n"
857
return LSB_STATUS_OK;
859
fprintf(stderr, "%s is stopped.\n", client_name);
860
return LSB_STATUS_STOPPED;
864
init_stop(const char *pid_file)
867
int rc = LSB_EXIT_OK;
871
if (pid_file == NULL) {
872
lrmd_log(LOG_ERR, "No pid file specified to kill process");
873
return LSB_EXIT_GENERIC;
875
pid = cl_read_pidfile(pid_file);
878
if (CL_KILL((pid_t)pid, SIGTERM) < 0) {
880
? LSB_EXIT_EPERM : LSB_EXIT_GENERIC);
881
fprintf(stderr, "Cannot kill pid %ld\n", pid);
884
"Signal sent to pid=%ld,"
885
" waiting for process to exit",
888
while (CL_PID_EXISTS(pid)) {
896
static const char usagemsg[] = "[-srkhv]\n\ts: status\n\tr: restart"
897
"\n\tk: kill\n\tm: register to apphbd\n\ti: the interval of apphb\n\t"
898
"h: help\n\tv: debug\n";
901
usage(const char* cmd, int exit_status)
905
stream = exit_status ? stderr : stdout;
907
fprintf(stream, "usage: %s %s", cmd, usagemsg);
913
* In design, the lrmd should not know the meaning of operation type
914
* and the meaning of rc. This function is just for logging.
917
warning_on_active_rsc(gpointer key, gpointer value, gpointer user_data)
922
lrmd_rsc_t* rsc = (lrmd_rsc_t*)value;
923
if (rsc->last_op_done != NULL) {
924
if (HA_OK != ha_msg_value_int(rsc->last_op_done->msg
925
, F_LRM_OPSTATUS, &op_status)) {
926
lrmd_debug(LOG_WARNING
927
,"resource %s is left in UNKNOWN status." \
928
"(last op done is damaged..)"
932
op_type = ha_msg_value(rsc->last_op_done->msg, F_LRM_OP);
933
if (op_status != LRM_OP_DONE) {
934
lrmd_debug(LOG_WARNING
935
,"resource %s is left in UNKNOWN status." \
936
"(last op %s finished without LRM_OP_DONE status.)"
940
if (HA_OK != ha_msg_value_int(rsc->last_op_done->msg
942
lrmd_debug(LOG_WARNING
943
,"resource %s is left in UNKNOWN status." \
944
"(last op done is damaged..)"
949
(STRNCMP_CONST(op_type,"start") ==0
950
||STRNCMP_CONST(op_type,"monitor") ==0
951
||STRNCMP_CONST(op_type,"status") ==0)) {
952
lrmd_debug(LOG_WARNING
953
,"resource %s is left in RUNNING status." \
954
"(last op %s finished with rc 0.)"
959
(STRNCMP_CONST(op_type,"start") ==0
960
||STRNCMP_CONST(op_type,"stop") ==0)) {
961
lrmd_debug(LOG_WARNING
962
,"resource %s is left in UNKNOWN status." \
963
"(last op %s finished with rc %d.)"
964
,rsc->id, op_type, rc);
973
lrmd_log(LOG_INFO,"lrmd is shutting down");
974
if (mainloop != NULL && g_main_is_running(mainloop)) {
975
g_hash_table_foreach(resources, warning_on_active_rsc, NULL);
976
g_main_quit(mainloop);
983
has_pending_op(gpointer key, gpointer value, gpointer user_data)
985
lrmd_rsc_t* rsc = (lrmd_rsc_t*)value;
986
int* result = (int*)user_data;
987
if (rsc->op_list != NULL) {
995
g_hash_table_foreach(resources, has_pending_op, &has_ops);
1000
sigterm_action(int nsig, gpointer user_data)
1002
shutdown_in_progress = TRUE;
1004
if (can_shutdown()) {
1007
lrmd_log(LOG_INFO, "sigterm_action: shutdown postponed, some operations are still running");
1013
register_pid(gboolean do_fork,
1014
gboolean (*shutdown)(int nsig, gpointer userdata))
1020
for (j=0; j < 3; ++j) {
1022
(void)open("/dev/null", j == 0 ? O_RDONLY : O_WRONLY);
1024
CL_IGNORE_SIG(SIGINT);
1025
CL_IGNORE_SIG(SIGHUP);
1026
CL_DEFAULT_SIG(SIGPIPE);
1027
G_main_add_SignalHandler(G_PRIORITY_HIGH, SIGTERM
1028
, shutdown, NULL, NULL);
1029
cl_signal_set_interrupt(SIGTERM, 1);
1030
cl_signal_set_interrupt(SIGCHLD, 1);
1031
/* At least they are harmless, I think. ;-) */
1032
cl_signal_set_interrupt(SIGINT, 0);
1033
cl_signal_set_interrupt(SIGHUP, 0);
1037
init_using_apphb(void)
1040
char lrmd_instance[40];
1042
if (reg_to_apphbd == FALSE) {
1046
snprintf(lrmd_instance, sizeof(lrmd_instance), "%s_%ld"
1047
, lrm_system_name, (long)getpid());
1048
if (apphb_register(lrm_system_name, lrmd_instance) != 0) {
1049
lrmd_log(LOG_ERR, "Failed when trying to register to apphbd.");
1050
lrmd_log(LOG_ERR, "Maybe apphbd is not running. Quit.");
1053
lrmd_log(LOG_INFO, "Registered to apphbd.");
1055
apphb_setinterval(apphb_interval);
1056
apphb_setwarn(apphb_interval*APPHB_WARNTIME_FACTOR);
1058
Gmain_timeout_add(apphb_interval - APPHB_INTVL_DETLA, emit_apphb, NULL);
1064
emit_apphb(gpointer data)
1067
if (reg_to_apphbd == FALSE) {
1071
if (apphb_hb() != 0) {
1072
lrmd_log(LOG_ERR, "emit_apphb: Failed to emit an apphb.");
1073
reg_to_apphbd = FALSE;
1080
/* main loop of the daemon*/
1085
PILPluginUniv * PluginLoadingSystem = NULL;
1086
struct dirent* subdir;
1087
struct passwd* pw_entry;
1089
char* ra_name = NULL;
1091
IPC_Auth * auth = NULL;
1093
GHashTable* uidlist;
1094
IPC_WaitConnection* conn_cmd = NULL;
1095
IPC_WaitConnection* conn_cbk = NULL;
1097
GHashTable* conn_cmd_attrs;
1098
GHashTable* conn_cbk_attrs;
1100
char path[] = IPC_PATH_ATTR;
1101
char cmd_path[] = LRM_CMDPATH;
1102
char cbk_path[] = LRM_CALLBACKPATH;
1104
PILGenericIfMgmtRqst RegisterRqsts[]= {
1105
{"RAExec", &RAExecFuncs, NULL, NULL, NULL},
1106
{ NULL, NULL, NULL, NULL, NULL} };
1108
qsort(msg_maps, MSG_NR, sizeof(struct msg_map), msg_type_cmp);
1110
if (cl_lock_pidfile(PID_FILE) < 0) {
1111
lrmd_log(LOG_ERR, "already running: [pid %d].", cl_read_pidfile(PID_FILE));
1112
lrmd_log(LOG_ERR, "Startup aborted (already running). Shutting down.");
1116
register_pid(FALSE, sigterm_action);
1118
/* load RA plugins */
1119
PluginLoadingSystem = NewPILPluginUniv (HA_PLUGIN_DIR);
1120
PILLoadPlugin(PluginLoadingSystem, "InterfaceMgr", "generic",
1125
* Much of the code through the end of the next loop is
1126
* unnecessary - The plugin system will do this for you quite
1127
* nicely. And, it does it portably, too...
1130
dir = opendir(LRM_PLUGIN_DIR);
1132
lrmd_log(LOG_ERR, "main: can not open RA plugin dir "LRM_PLUGIN_DIR);
1133
lrmd_log(LOG_ERR, "Startup aborted (no RA plugin). Shutting down.");
1137
while ( NULL != (subdir = readdir(dir))) {
1139
if ( '.' == subdir->d_name[0]) {
1142
/* skip the other type files */
1143
if (NULL == strstr(subdir->d_name, ".so")) {
1146
/* remove the ".so" */
1147
dot = strchr(subdir->d_name,'.');
1149
len = (int)(dot - subdir->d_name);
1150
ra_name = g_strndup(subdir->d_name,len);
1153
ra_name = g_strdup(subdir->d_name);
1155
PILLoadPlugin(PluginLoadingSystem , "RAExec", ra_name, NULL);
1156
ra_class_list = g_list_append(ra_class_list,ra_name);
1158
closedir(dir); dir = NULL; /* Don't forget to close 'dir' */
1161
*create the waiting connections
1162
*one for registering the client,
1163
*the other for creating the callback channel
1166
uidlist = g_hash_table_new(g_direct_hash, g_direct_equal);
1167
/* Add root's uid */
1168
g_hash_table_insert(uidlist, GUINT_TO_POINTER(0), &one);
1170
pw_entry = getpwnam(HA_CCMUSER);
1171
if (pw_entry == NULL) {
1172
lrmd_log(LOG_ERR, "Cannot get the uid of HACCMUSER");
1174
g_hash_table_insert(uidlist, GUINT_TO_POINTER(pw_entry->pw_uid)
1178
if ( NULL == (auth = MALLOCT(struct IPC_AUTH)) ) {
1179
lrmd_log(LOG_ERR, "init_start: MALLOCT (IPC_AUTH) failed.");
1181
auth->uid = uidlist;
1185
/*Create a waiting connection to accept command connect from client*/
1186
conn_cmd_attrs = g_hash_table_new(g_str_hash, g_str_equal);
1187
g_hash_table_insert(conn_cmd_attrs, path, cmd_path);
1188
conn_cmd = ipc_wait_conn_constructor(IPC_ANYTYPE, conn_cmd_attrs);
1189
g_hash_table_destroy(conn_cmd_attrs);
1190
if (NULL == conn_cmd) {
1192
"main: can not create wait connection for command.");
1193
lrmd_log(LOG_ERR, "Startup aborted (can't create comm channel). Shutting down.");
1198
/*Create a source to handle new connect requests for command*/
1199
G_main_add_IPC_WaitConnection( G_PRIORITY_HIGH, conn_cmd, auth, FALSE,
1200
on_connect_cmd, conn_cmd, NULL);
1203
* Create a waiting connection to accept the callback connect from client
1205
conn_cbk_attrs = g_hash_table_new(g_str_hash, g_str_equal);
1206
g_hash_table_insert(conn_cbk_attrs, path, cbk_path);
1207
conn_cbk = ipc_wait_conn_constructor( IPC_ANYTYPE, conn_cbk_attrs);
1208
g_hash_table_destroy(conn_cbk_attrs);
1210
if (NULL == conn_cbk) {
1212
"main: can not create wait connection for callback.");
1213
lrmd_log(LOG_ERR, "Startup aborted (can't create comm channel). Shutting down.");
1217
/*Create a source to handle new connect requests for callback*/
1218
G_main_add_IPC_WaitConnection( G_PRIORITY_HIGH, conn_cbk, auth, FALSE,
1219
on_connect_cbk, conn_cbk, NULL);
1221
/* our child signal handling involves calls with
1222
* unpredictable timing; so we raise the limit to
1223
* reduce the number of warnings
1225
set_sigchld_proctrack(G_PRIORITY_HIGH,10*DEFAULT_MAXDISPATCHTIME);
1227
lrmd_log(LOG_INFO, "enabling coredumps");
1228
/* Although lrmd can count on the parent to enable coredump, still
1229
* set it here for testing, when started manually.
1232
cl_enable_coredumps(TRUE);
1234
/* Allow us to always take a "secure" core dump
1235
* We might have STONITH logins and passwords, etc. in our address
1236
* space - so we need to make sure it's only readable by root.
1237
* Calling this function accomplishes that.
1239
cl_set_all_coredump_signal_handlers();
1240
if( drop_privs(0, 0) ) { /* become "nobody" */
1241
lrmd_log(LOG_WARNING,"%s: failed to drop privileges: %s"
1242
, __FUNCTION__, strerror(errno));
1246
* Add the signal handler for SIGUSR1, SIGUSR2.
1247
* They are used to change the debug level.
1249
G_main_add_SignalHandler(G_PRIORITY_HIGH, SIGUSR1,
1250
debug_level_adjust, NULL, NULL);
1251
G_main_add_SignalHandler(G_PRIORITY_HIGH, SIGUSR2,
1252
debug_level_adjust, NULL, NULL);
1255
* alloc memory for client table and resource table
1257
clients = g_hash_table_new(g_int_hash, g_int_equal);
1258
if (clients == NULL) {
1259
cl_log(LOG_ERR, "can not new hash table clients");
1262
resources = g_hash_table_new_full(g_str_hash
1263
, g_str_equal, free, NULL);
1264
if (resources == NULL) {
1265
cl_log(LOG_ERR, "can not new hash table resources");
1269
/*Create the mainloop and run it*/
1270
mainloop = g_main_new(FALSE);
1271
lrmd_debug(LOG_DEBUG, "main: run the loop...");
1272
lrmd_log(LOG_INFO, "Started.");
1274
/* apphb initializing */
1276
emit_apphb(NULL); /* Avoid warning */
1278
g_main_run(mainloop);
1281
if (reg_to_apphbd == TRUE) {
1285
reg_to_apphbd = FALSE;
1288
if( return_to_orig_privs() ) {
1289
cl_perror("%s: failed to raise privileges", __FUNCTION__);
1291
conn_cmd->ops->destroy(conn_cmd);
1294
conn_cbk->ops->destroy(conn_cbk);
1297
g_hash_table_destroy(uidlist);
1298
if ( NULL != auth ) {
1301
if (cl_unlock_pidfile(PID_FILE) == 0) {
1302
lrmd_debug(LOG_DEBUG, "[%s] stopped", lrm_system_name);
1308
*GLoop Message Handlers
1311
on_connect_cmd (IPC_Channel* ch, gpointer user_data)
1313
lrmd_client_t* client = NULL;
1315
/* check parameters */
1317
lrmd_log(LOG_ERR, "on_connect_cmd: channel is null");
1320
/* create new client */
1321
/* the register will be finished in on_msg_register */
1322
client = lrmd_client_new();
1323
if (client == NULL) {
1326
client->app_name = NULL;
1327
client->ch_cmd = ch;
1328
client->g_src = G_main_add_IPC_Channel(G_PRIORITY_DEFAULT,
1329
ch, FALSE, on_receive_cmd, (gpointer)client,
1337
on_connect_cbk (IPC_Channel* ch, gpointer user_data)
1339
/*the client connects to create the second channel, used for callbacks*/
1341
const char* type = NULL;
1342
struct ha_msg* msg = NULL;
1343
lrmd_client_t* client = NULL;
1346
lrmd_log(LOG_ERR, "on_connect_cbk: channel is null");
1350
/* Isn't this kind of a tight timing assumption ??
1351
* This operation is non-blocking -- IIRC
1352
* Maybe this should be moved to the input dispatch function
1353
* for this channel when we make a GSource from it.
1357
/*get the message, ends up in socket_waitin */
1358
msg = msgfromIPC_noauth(ch);
1360
lrmd_log(LOG_ERR, "on_connect_cbk: can not receive msg");
1364
/*check if it is a register message*/
1365
type = ha_msg_value(msg, F_LRM_TYPE);
1366
if (0 != STRNCMP_CONST(type, REGISTER)) {
1367
lrmd_log(LOG_ERR, "on_connect_cbk: received a message which is "
1368
"not known by lrmd.");
1370
send_ret_msg(ch, HA_FAIL);
1374
/*get the pid of client */
1375
if (HA_OK != ha_msg_value_int(msg, F_LRM_PID, &pid)) {
1376
lrmd_log(LOG_ERR, "on_connect_cbk: can not get pid from the "
1379
send_ret_msg(ch, HA_FAIL);
1384
/*get the client in the client list*/
1385
client = lookup_client(pid);
1386
if (NULL == client) {
1387
lrmd_log(LOG_ERR, "on_connect_cbk: donnot find the client "
1388
"[pid:%d] in internal client list. ", pid);
1389
send_ret_msg(ch, HA_FAIL);
1392
if (client->ch_cbk != NULL) {
1393
client->ch_cbk->ops->destroy(client->ch_cbk);
1394
client->ch_cbk = NULL;
1396
client->g_src_cbk = G_main_add_IPC_Channel(G_PRIORITY_DEFAULT
1397
, ch, FALSE,NULL,NULL,NULL);
1399
/*fill the channel of callback field*/
1400
client->ch_cbk = ch;
1401
send_ret_msg(ch, HA_OK);
1406
msg_type_cmp(const void *p1, const void *p2)
1410
((const struct msg_map *)p1)->msg_type,
1411
((const struct msg_map *)p2)->msg_type,
1416
/*
 * on_receive_cmd - input handler for a client's command channel.
 *
 * ch is the channel and user_data is the lrmd_client_t it belongs to.
 * The visible steps are:
 *   - log when the channel is disconnected or has no pending message,
 *   - read one message with msgfromIPC_noauth(),
 *   - while shutdown_in_progress is TRUE, answer HA_FAIL and log it,
 *   - look up the F_LRM_TYPE value in msg_maps with bsearch(),
 *   - record the request name and start/end times in the client,
 *   - call the handler and, when send_msg_now() is true for the table
 *     entry, send the handler's return value back with send_ret_msg().
 *
 * NOTE(review): the return type, the declarations of 'ret' and 'msg_s',
 * several conditions and all return statements are not visible here.
 */
on_receive_cmd (IPC_Channel* ch, gpointer user_data)
1418
struct msg_map *msgmap_p, in_type;
1419
lrmd_client_t* client = NULL;
1420
struct ha_msg* msg = NULL;
1423
client = (lrmd_client_t*)user_data;
1425
if (IPC_DISCONNECT == ch->ch_status) {
1426
lrmd_debug(LOG_DEBUG,
1427
"on_receive_cmd: the IPC to client [pid:%d] disconnected."
1432
if (!ch->ops->is_message_pending(ch)) {
1433
lrmd_debug(LOG_DEBUG, "on_receive_cmd: no pending message in IPC "
1439
/* get the message */
1440
msg = msgfromIPC_noauth(ch);
1442
lrmd_log(LOG_ERR, "on_receive_cmd: can not receive messages.");
1446
/* while shutting down, the request is answered with HA_FAIL */
if (TRUE == shutdown_in_progress ) {
1447
send_ret_msg(ch,HA_FAIL);
1449
lrmd_log(LOG_INFO, "%s: new requests denied," \
1450
" we're about to shutdown", __FUNCTION__);
1454
/* dispatch the message */
1455
in_type.msg_type = ha_msg_value(msg, F_LRM_TYPE);
1456
if( !in_type.msg_type ) {
1457
LOG_FAILED_TO_GET_FIELD(F_LRM_TYPE);
1460
msg_s = msg2string(msg);
1462
lrmd_debug2(LOG_DEBUG,"dumping request: %s",msg_s);
1466
/* in_type serves only as the search key; just its msg_type is set */
if (!(msgmap_p = bsearch(&in_type, msg_maps,
1467
MSG_NR, sizeof(struct msg_map), msg_type_cmp)
1470
lrmd_log(LOG_ERR, "on_receive_cmd: received an unknown msg");
1474
/* strncpy may leave the buffer unterminated, hence the explicit '\0' below */
strncpy(client->lastrequest, in_type.msg_type, sizeof(client->lastrequest));
1475
client->lastrequest[sizeof(client->lastrequest)-1]='\0';
1476
client->lastreqstart = time(NULL);
1477
/* call the handler of the message */
1478
ret = msgmap_p->handler(client, msg);
1479
client->lastreqend = time(NULL);
1481
/* return rc to the client if this message type asks for it */
1482
if (send_msg_now(msgmap_p)) {
1483
send_ret_msg(ch, ret);
1484
client->lastrcsent = time(NULL);
1494
/*
 * remove_repeat_op_from_client - hash-table foreach callback used by
 * unregister_client().
 * value is a resource (lrmd_rsc_t *) and user_data carries a client pid
 * packed into a pointer.  flush_all() is called on the resource's
 * repeat_op_list with that pid; its result is deliberately ignored.
 * key is not used in the visible lines.
 */
remove_repeat_op_from_client(gpointer key, gpointer value, gpointer user_data)
1496
lrmd_rsc_t* rsc = (lrmd_rsc_t*)value;
1497
pid_t pid = GPOINTER_TO_UINT(user_data); /* pointer cast as int */
1499
(void)flush_all(&(rsc->repeat_op_list),pid);
1502
/*
 * unregister_client - remove all direct pointer references to 'client'
 * before it is destroyed.
 * Logs an error when the client's pid is not found by lookup_client().
 * Otherwise every resource is visited with remove_repeat_op_from_client()
 * for this pid, and the client is removed from the 'clients' table.
 * NOTE(review): the return type and the return statements are not
 * visible here; CHECK_ALLOCATED is given HA_FAIL as its third argument.
 */
1504
unregister_client(lrmd_client_t* client)
1506
CHECK_ALLOCATED(client, "client", HA_FAIL);
1508
if (NULL == lookup_client(client->pid)) {
1509
lrmd_log(LOG_ERR,"%s: can not find client %s [pid %d] when try "
1512
, client->app_name, client->pid);
1516
/* Search all resources for repeating ops this client owns */
1517
g_hash_table_foreach(resources
1518
, remove_repeat_op_from_client, GUINT_TO_POINTER(client->pid));
1520
/* Remove from clients */
1521
g_hash_table_remove(clients, (gpointer)&client->pid);
1523
lrmd_debug(LOG_DEBUG, "%s: client %s [pid:%d] is unregistered"
1531
/*
 * on_remove_client - tear down a client object.
 * user_data is the lrmd_client_t.  Its g_src and g_src_cbk main-loop
 * sources are deleted when set, then the client is handed to
 * lrmd_client_destroy().
 */
on_remove_client (gpointer user_data)
1533
lrmd_client_t* client = (lrmd_client_t*) user_data;
1535
CHECK_ALLOCATED(client, "client", );
1536
if (client->g_src != NULL) {
1537
G_main_del_IPC_Channel(client->g_src);
1539
if (client->g_src_cbk != NULL) {
1540
G_main_del_IPC_Channel(client->g_src_cbk);
1542
lrmd_client_destroy(client);
1547
/*
 * on_repeat_op_readytorun - timeout callback, called when it is time to
 * run a repeating (or delayed) operation.
 * Moves the op from its resource's repeat queue to the running queue.
 */
1548
/*
 * data is the lrmd_op_t.  An exec_pid of 0 is reported as an internal
 * error.  The op is removed from rsc->repeat_op_list, its timeout source
 * is removed and the tag cleared, and, unless a shutdown is in progress,
 * the op is handed to add_op_to_runlist().
 * NOTE(review): the return type, the return statements and the condition
 * guarding the "rsc_id ... is NULL" log are not visible here.
 */
1550
on_repeat_op_readytorun(gpointer data)
1552
lrmd_op_t* op = NULL;
1553
lrmd_rsc_t* rsc = NULL;
1556
op = (lrmd_op_t*)data;
1557
CHECK_ALLOCATED(op, "op", FALSE );
1559
if (op->exec_pid == 0) {
1560
lrmd_log(LOG_ERR, "%s: exec_pid is 0 (internal error)"
1565
lrmd_debug2(LOG_DEBUG
1566
, "%s: remove operation %s from the repeat operation list and "
1567
"add it to the operation list"
1568
, __FUNCTION__, op_info(op));
1571
rsc = lookup_rsc(op->rsc_id);
1574
, "%s: the rsc_id in op %s is NULL"
1575
, __FUNCTION__, op_info(op));
1579
rsc->repeat_op_list = g_list_remove(rsc->repeat_op_list, op);
1580
/* drop the timer so this callback is not run again for the op */
if (op->repeat_timeout_tag != 0) {
1581
Gmain_timeout_remove(op->repeat_timeout_tag);
1582
op->repeat_timeout_tag = (guint)0;
1587
if (!shutdown_in_progress) {
1588
add_op_to_runlist(rsc,op);
1596
/*LRM Message Handlers*/
1598
/*
 * on_msg_register - handler for the REGISTER message.
 * Fills in 'client' from the message: app_name is a strdup() of
 * F_LRM_APP, and pid, gid and uid come from F_LRM_PID, F_LRM_GID and
 * F_LRM_UID.  A client already registered under the same pid is removed
 * from the 'clients' table and torn down with on_remove_client().  The
 * new client is then inserted, keyed by the address of its own pid field.
 *
 * NOTE(review): no check of the strdup() result is visible.
 * NOTE(review): if 'exist' could ever be the same object as 'client'
 * (a client sending REGISTER twice), on_remove_client() would destroy
 * the object that is inserted below -- confirm this cannot happen.
 * NOTE(review): the return type and return statements are not visible.
 */
on_msg_register(lrmd_client_t* client, struct ha_msg* msg)
1600
lrmd_client_t* exist = NULL;
1601
const char* app_name = NULL;
1603
CHECK_ALLOCATED(msg, "register message", HA_FAIL);
1605
app_name = ha_msg_value(msg, F_LRM_APP);
1606
if (NULL == app_name) {
1607
lrmd_log(LOG_ERR, "on_msg_register: no app_name in "
1611
client->app_name = strdup(app_name);
1613
return_on_no_int_value(msg, F_LRM_PID, &client->pid);
1614
return_on_no_int_value(msg, F_LRM_GID, (int *)&client->gid);
1615
return_on_no_int_value(msg, F_LRM_UID, (int *)&client->uid);
1617
exist = lookup_client(client->pid);
1618
if (NULL != exist) {
1619
g_hash_table_remove(clients, (gpointer)&client->pid);
1620
on_remove_client(exist);
1621
lrmd_log(LOG_NOTICE,
1622
"on_msg_register: the client [pid:%d] already exists in "
1623
"internal client list, let remove it at first."
1627
/* the key points into the client object itself */
g_hash_table_insert(clients, (gpointer)&client->pid, client);
1628
lrmd_debug(LOG_DEBUG, "on_msg_register:client %s [%d] registered"
1636
/*
 * on_msg_get_rsc_classes - handler returning the known resource classes.
 * Builds a reply with create_lrm_ret(HA_OK, 4), adds ra_class_list as
 * the F_LRM_RCLASS list and sends it on the client's command channel.
 * A failed send is logged.
 * NOTE(review): the return type, the return statements and the release
 * of 'ret' are not visible here.
 */
on_msg_get_rsc_classes(lrmd_client_t* client, struct ha_msg* msg)
1638
struct ha_msg* ret = NULL;
1640
CHECK_ALLOCATED(client, "client", HA_FAIL);
1641
CHECK_ALLOCATED(msg, "message", HA_FAIL);
1643
lrmd_debug2(LOG_DEBUG
1644
, "on_msg_get_rsc_classes:client [%d] wants to get rsc classes"
1647
ret = create_lrm_ret(HA_OK, 4);
1648
CHECK_RETURN_OF_CREATE_LRM_RET;
1650
cl_msg_add_list(ret,F_LRM_RCLASS,ra_class_list);
1651
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
1653
"on_msg_get_rsc_classes: cannot send the ret mesage");
1661
/*
 * on_msg_get_rsc_types - handler returning the resource types of one
 * resource class.
 * The class name is read from F_LRM_RCLASS; when it is missing, an error
 * is logged and HA_FAIL is sent to the client.  The class's RAExecOps is
 * looked up in RAExecFuncs (an unknown class is logged at NOTICE level).
 * When get_resource_list() returns >= 0 with a non-empty list, the list
 * is added to the reply as F_LRM_RTYPES and then released link by link.
 * The reply is sent on the client's command channel.
 * NOTE(review): the return type, the declaration of 'type', the return
 * statements and the release of 'ret' are not visible here.
 */
on_msg_get_rsc_types(lrmd_client_t* client, struct ha_msg* msg)
1663
struct ha_msg* ret = NULL;
1664
struct RAExecOps * RAExec = NULL;
1665
GList* types = NULL;
1667
const char* rclass = NULL;
1669
CHECK_ALLOCATED(client, "client", HA_FAIL);
1670
CHECK_ALLOCATED(msg, "message", HA_FAIL);
1672
ret = create_lrm_ret(HA_OK,5);
1673
CHECK_RETURN_OF_CREATE_LRM_RET;
1675
rclass = ha_msg_value(msg, F_LRM_RCLASS);
1676
if (rclass == NULL) {
1677
lrmd_log(LOG_ERR, "on_msg_get_rsc_types: cannot get the "
1678
"resource class field from the message.");
1679
send_ret_msg(client->ch_cmd, HA_FAIL);
1683
lrmd_debug2(LOG_DEBUG, "on_msg_get_rsc_types: the client [pid:%d] "
1684
"wants to get resource types of resource class %s"
1685
, client->pid, rclass);
1687
RAExec = g_hash_table_lookup(RAExecFuncs,rclass);
1689
if (NULL == RAExec) {
1690
lrmd_log(LOG_NOTICE, "on_msg_get_rsc_types: can not find this "
1691
"RA class %s.", rclass);
1693
if (0 <= RAExec->get_resource_list(&types) && types != NULL) {
1694
cl_msg_add_list(ret, F_LRM_RTYPES, types);
1695
/* unlink and free the first link until the list is empty */
while (NULL != (type = g_list_first(types))) {
1696
types = g_list_remove_link(types, type);
1698
g_list_free_1(type);
1704
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
1706
"on_msg_get_rsc_types: can not send the ret message.");
1714
/*
 * on_msg_get_rsc_providers - handler returning the providers of one
 * resource type within a resource class.
 * The class and type come from F_LRM_RCLASS and F_LRM_RTYPE; when either
 * is missing, HA_FAIL is sent to the client.  The class's RAExecOps is
 * looked up in RAExecFuncs.  When get_provider_list() returns >= 0 and
 * the list is non-empty, it is added to the reply as F_LRM_RPROVIDERS;
 * each link's data is then released with g_free() and the link itself
 * with g_list_free_1().  The reply is sent on the command channel.
 * NOTE(review): the return type, the return statements and the release
 * of 'ret' are not visible here.
 */
on_msg_get_rsc_providers(lrmd_client_t* client, struct ha_msg* msg)
1716
struct ha_msg* ret = NULL;
1717
struct RAExecOps * RAExec = NULL;
1718
GList* providers = NULL;
1719
GList* provider = NULL;
1720
const char* rclass = NULL;
1721
const char* rtype = NULL;
1723
CHECK_ALLOCATED(client, "client", HA_FAIL);
1724
CHECK_ALLOCATED(msg, "message", HA_FAIL);
1726
ret = create_lrm_ret(HA_OK,5);
1727
CHECK_RETURN_OF_CREATE_LRM_RET;
1729
rclass = ha_msg_value(msg, F_LRM_RCLASS);
1730
rtype = ha_msg_value(msg, F_LRM_RTYPE);
1731
if( !rclass || !rtype ) {
1733
, "%s: could not retrieve resource class or type"
1735
send_ret_msg(client->ch_cmd, HA_FAIL);
1739
lrmd_debug2(LOG_DEBUG
1740
, "%s: the client [%d] wants to get rsc privider of %s::%s"
1746
RAExec = g_hash_table_lookup(RAExecFuncs, rclass);
1748
if (NULL == RAExec) {
1750
, "%s: can not find the class %s."
1755
if (0 <= RAExec->get_provider_list(rtype, &providers)) {
1756
if (providers != NULL) {
1757
cl_msg_add_list(ret, F_LRM_RPROVIDERS, providers);
1759
/* release the provider list: link data first, then the link */
while (NULL != (provider = g_list_first(providers))) {
1760
providers = g_list_remove_link(providers, provider);
1761
g_free(provider->data);
1762
g_list_free_1(provider);
1764
g_list_free(providers);
1768
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
1770
"on_msg_get_rsc_providers: can not send the ret msg");
1778
/*
 * on_msg_get_metadata - handler returning the metadata of a resource
 * agent.
 * The agent is identified by F_LRM_RTYPE, F_LRM_RCLASS and
 * F_LRM_RPROVIDER.  The class's RAExecOps is looked up in RAExecFuncs and
 * its get_resource_meta(rtype, provider) is called.  Non-empty metadata
 * is added to the reply as F_LRM_METADATA.  Empty or missing metadata is
 * logged as a warning, and the reply's F_LRM_RET is changed to HA_FAIL.
 * The reply is sent on the client's command channel.
 *
 * NOTE(review): no NULL check on rclass is visible before it is used as
 * the lookup key -- confirm that a missing F_LRM_RCLASS is handled.
 * NOTE(review): the return type, the return statements and the release
 * of 'meta' and 'ret' are not visible here.
 */
on_msg_get_metadata(lrmd_client_t* client, struct ha_msg* msg)
1780
struct ha_msg* ret = NULL;
1781
struct RAExecOps * RAExec = NULL;
1782
const char* rtype = NULL;
1783
const char* rclass = NULL;
1784
const char* provider = NULL;
1786
CHECK_ALLOCATED(client, "client", HA_FAIL);
1787
CHECK_ALLOCATED(msg, "message", HA_FAIL);
1789
rtype = ha_msg_value(msg, F_LRM_RTYPE);
1790
rclass = ha_msg_value(msg, F_LRM_RCLASS);
1791
provider = ha_msg_value(msg, F_LRM_RPROVIDER);
1793
lrmd_debug2(LOG_DEBUG
1794
, "%s: the client [pid:%d] wants to get rsc metadata of %s::%s::%s."
1801
ret = create_lrm_ret(HA_OK, 5);
1802
CHECK_RETURN_OF_CREATE_LRM_RET;
1804
RAExec = g_hash_table_lookup(RAExecFuncs,rclass);
1805
if (NULL == RAExec) {
1807
, "%s: can not find the class %s."
1812
char* meta = RAExec->get_resource_meta(rtype,provider);
1813
if (NULL != meta && strlen(meta) > 0) {
1814
if (HA_OK != ha_msg_add(ret,F_LRM_METADATA, meta)) {
1815
LOG_FAILED_TO_ADD_FIELD("metadata");
1820
lrmd_log(LOG_WARNING
1821
, "%s: empty metadata for %s::%s::%s."
1826
/* turn the reply, created with HA_OK, into a failure reply */
ha_msg_mod_int(ret, F_LRM_RET, HA_FAIL);
1830
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
1832
"on_msg_get_metadata: can not send the ret msg");
1839
/*
 * add_rid_to_msg - hash-table foreach callback used by on_msg_get_all().
 * key is a resource id string and user_data is the reply message; the id
 * is appended to the message's F_LRM_RID string list.  A failure is
 * logged.  value is not used in the visible lines.
 */
add_rid_to_msg(gpointer key, gpointer value, gpointer user_data)
1841
char* rid = (char*)key;
1842
struct ha_msg* msg = (struct ha_msg*)user_data;
1843
if (HA_OK != cl_msg_list_add_string(msg,F_LRM_RID,rid)) {
1844
LOG_FAILED_TO_ADD_FIELD("resource id");
1848
/*
 * on_msg_get_all - handler returning the ids of all resources.
 * The reply is created with room for one field per entry in 'resources'
 * plus one.  add_rid_to_msg() is run over the table to collect the ids,
 * and the reply is sent on the client's command channel.
 * NOTE(review): the return type, the return statements and the release
 * of 'ret' are not visible here.
 */
on_msg_get_all(lrmd_client_t* client, struct ha_msg* msg)
1850
struct ha_msg* ret = NULL;
1852
CHECK_ALLOCATED(client, "client", HA_FAIL);
1853
CHECK_ALLOCATED(msg, "message", HA_FAIL);
1855
lrmd_debug2(LOG_DEBUG
1856
, "on_msg_get_all:client [%d] want to get all rsc information."
1859
ret = create_lrm_ret(HA_OK, g_hash_table_size(resources) + 1);
1860
CHECK_RETURN_OF_CREATE_LRM_RET;
1862
g_hash_table_foreach(resources, add_rid_to_msg, ret);
1864
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
1865
lrmd_log(LOG_ERR, "on_msg_get_all: can not send the ret msg");
1872
/*
 * on_msg_get_rsc - handler returning the description of one resource.
 * The resource is found with lookup_rsc_by_msg().  Two replies are built
 * in the visible lines: an HA_FAIL reply (next to the "no rsc with id"
 * debug message) and an HA_OK reply carrying the resource's id, type and
 * class, its provider when set, and its parameter table (F_LRM_PARAM).
 * The reply is sent on the client's command channel.
 * NOTE(review): the return type, the conditions selecting between the
 * two replies, the return statements and the release of 'ret' are not
 * visible here.
 */
on_msg_get_rsc(lrmd_client_t* client, struct ha_msg* msg)
1874
struct ha_msg* ret = NULL;
1875
lrmd_rsc_t* rsc = NULL;
1876
const char* id = NULL;
1878
CHECK_ALLOCATED(client, "client", HA_FAIL);
1879
CHECK_ALLOCATED(msg, "message", HA_FAIL);
1881
/* id is used only for logging; the lookup works from the message */
id = ha_msg_value(msg, F_LRM_RID);
1883
lrmd_debug2(LOG_DEBUG
1884
, "on_msg_get_rsc: the client [pid:%d] wants to get "
1885
"the information of the resource [rsc_id: %s]"
1886
, client->pid, lrmd_nullcheck(id));
1888
rsc = lookup_rsc_by_msg(msg);
1890
lrmd_debug2(LOG_DEBUG
1891
, "on_msg_get_rsc: no rsc with id %s."
1892
, lrmd_nullcheck(id));
1893
ret = create_lrm_ret(HA_FAIL, 1);
1894
CHECK_RETURN_OF_CREATE_LRM_RET;
1897
ret = create_lrm_ret(HA_OK, 5);
1898
CHECK_RETURN_OF_CREATE_LRM_RET;
1900
if (HA_OK != ha_msg_add(ret, F_LRM_RID, rsc->id)
1901
|| HA_OK != ha_msg_add(ret, F_LRM_RTYPE, rsc->type)
1902
|| HA_OK != ha_msg_add(ret, F_LRM_RCLASS, rsc->class)) {
1905
"on_msg_get_rsc: failed to add fields to msg.");
1908
/* the provider field is added only when one is set */
if( rsc->provider ) {
1909
if (HA_OK != ha_msg_add(ret, F_LRM_RPROVIDER,
1912
LOG_FAILED_TO_ADD_FIELD("provider");
1918
HA_OK!=ha_msg_add_str_table(ret,F_LRM_PARAM,rsc->params)) {
1920
LOG_FAILED_TO_ADD_FIELD("parameter");
1925
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
1926
lrmd_log(LOG_ERR, "on_msg_get_rsc: can not send the ret msg");
1934
/*
 * on_msg_get_last_op - handler returning the last operation of a given
 * type that this client ran on a resource.
 * The resource comes from lookup_rsc_by_msg() and the operation type
 * from F_LRM_OP.  When both exist, the client's table is looked up in
 * rsc->last_op_table by app_name, and the op is looked up in that table
 * by op_type.  A found op is turned into the reply with op_to_msg() and
 * gets F_LRM_OPCNT = 1.  The visible lines also build a plain HA_OK
 * reply with F_LRM_OPCNT = 0.  The reply is sent on the command channel.
 * NOTE(review): the return type, several conditions, the return
 * statements and the release of 'ret' are not visible here.
 */
on_msg_get_last_op(lrmd_client_t* client, struct ha_msg* msg)
1936
struct ha_msg* ret = NULL;
1937
const char* op_type = NULL;
1938
lrmd_rsc_t* rsc = NULL;
1939
const char* rid = NULL;
1941
CHECK_ALLOCATED(client, "client", HA_FAIL);
1942
CHECK_ALLOCATED(msg, "message", HA_FAIL);
1944
rid = ha_msg_value(msg, F_LRM_RID);
1945
op_type = ha_msg_value(msg, F_LRM_OP);
1947
lrmd_debug2(LOG_DEBUG
1948
, "on_msg_get_last_op:client %s[%d] want to get the information "
1949
"regarding last %s op on %s"
1950
, client->app_name, client->pid
1951
, lrmd_nullcheck(op_type), lrmd_nullcheck(rid));
1953
rsc = lookup_rsc_by_msg(msg);
1954
if (NULL != rsc && NULL != op_type) {
1955
/* first level: per-client table, keyed by the client's app_name */
GHashTable* table = g_hash_table_lookup(rsc->last_op_table
1956
, client->app_name);
1957
if (NULL != table ) {
1958
/* second level: the op, keyed here by the operation type */
lrmd_op_t* op = g_hash_table_lookup(table, op_type);
1960
lrmd_debug(LOG_DEBUG
1961
, "%s: will return op %s"
1965
ret = op_to_msg(op);
1968
, "%s: can't create a message with op_to_msg."
1972
if (HA_OK != ha_msg_add_int(ret
1973
, F_LRM_OPCNT, 1)) {
1974
LOG_FAILED_TO_ADD_FIELD("operation count");
1982
, "%s: return ha_msg ret is null, will re-create it again."
1984
ret = create_lrm_ret(HA_OK, 1);
1985
CHECK_RETURN_OF_CREATE_LRM_RET;
1987
if (HA_OK != ha_msg_add_int(ret, F_LRM_OPCNT, 0)) {
1988
LOG_FAILED_TO_ADD_FIELD("operation count");
1993
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
1994
lrmd_log(LOG_ERR, "on_msg_get_last_op: can not send the ret msg");
2002
/*
 * on_msg_del_rsc - handler deleting a resource.
 * The resource is found with lookup_rsc_by_msg().  All repeating ops are
 * flushed (result ignored), then all ops in op_list.  When flush_all()
 * reports that something is still busy, the resource is marked
 * removal-pending and HA_RSCBUSY is returned; otherwise the resource is
 * destroyed right away.
 *
 * NOTE(review): in the "no rsc with id" error message 'id' is passed to
 * %s without lrmd_nullcheck(), unlike the debug message above; id is
 * NULL when F_LRM_RID is absent -- consider guarding it.
 * NOTE(review): the return type, the condition guarding that error and
 * the other return statements are not visible here.
 */
on_msg_del_rsc(lrmd_client_t* client, struct ha_msg* msg)
2004
lrmd_rsc_t* rsc = NULL;
2005
const char* id = NULL;
2007
CHECK_ALLOCATED(client, "client", HA_FAIL);
2008
CHECK_ALLOCATED(msg, "message", HA_FAIL);
2010
id = ha_msg_value(msg, F_LRM_RID);
2011
lrmd_debug2(LOG_DEBUG
2012
, "%s: client [%d] wants to delete rsc %s"
2013
, __FUNCTION__, client->pid, lrmd_nullcheck(id));
2015
rsc = lookup_rsc_by_msg(msg);
2017
lrmd_log(LOG_ERR, "%s: no rsc with id %s.",__FUNCTION__,id);
2021
/* a client pid of 0 flushes the ops of every client */
(void)flush_all(&(rsc->repeat_op_list),0);
2022
if( flush_all(&(rsc->op_list),0) ) {
2023
set_rsc_removal_pending(rsc);
2024
lrmd_log(LOG_INFO, "resource %s busy, removal pending", rsc->id);
2026
return HA_RSCBUSY; /* resource is busy, removal delayed */
2028
lrmd_rsc_destroy(rsc);
2034
/*
 * prepare_failmsg - turn 'msg' into the report of an asynchronous
 * failure.
 * Takes the next call_id, then sets the operation name to ASYNC_OP_NAME,
 * adds fail_reason, sets both F_LRM_ASYNCMON_RC and F_LRM_RC to fail_rc,
 * marks the op LRM_OP_DONE, and sets timeout, interval and delay to 0
 * and the target rc to EVERYTIME.  If any field cannot be set, an error
 * is logged.
 * NOTE(review): the return type and return values are not visible here;
 * on_msg_fail_rsc() treats a non-zero result as failure.
 */
prepare_failmsg(struct ha_msg* msg, int fail_rc, const char *fail_reason)
2036
call_id++; /* use the next id */
2037
if (HA_OK != ha_msg_mod(msg,F_LRM_OP,ASYNC_OP_NAME)
2038
|| HA_OK != ha_msg_add(msg,F_LRM_FAIL_REASON,fail_reason)
2039
|| HA_OK != ha_msg_mod_int(msg,F_LRM_ASYNCMON_RC,fail_rc)
2040
|| HA_OK != ha_msg_mod_int(msg,F_LRM_RC,fail_rc)
2041
|| HA_OK != ha_msg_mod_int(msg,F_LRM_OPSTATUS,(int)LRM_OP_DONE)
2042
|| HA_OK != ha_msg_mod_int(msg,F_LRM_CALLID,call_id)
2043
|| HA_OK != ha_msg_mod_int(msg,F_LRM_TIMEOUT,0)
2044
|| HA_OK != ha_msg_mod_int(msg,F_LRM_INTERVAL,0)
2045
|| HA_OK != ha_msg_mod_int(msg,F_LRM_TARGETRC,EVERYTIME)
2046
|| HA_OK != ha_msg_mod_int(msg,F_LRM_DELAY,0)
2048
lrmd_log(LOG_ERR,"%s:%d: cannot add field to a message"
2049
, __FUNCTION__, __LINE__);
2056
/*
 * async_notify - hash-table foreach callback used by on_msg_fail_rsc().
 * key is a client name and data is the failure message.  The client is
 * looked up by name and the message is sent to it with send_msg().  A
 * name that no longer resolves is logged as "probably signed out".
 * val is not used in the visible lines.
 * NOTE(review): the condition guarding that log message is not visible.
 */
async_notify(gpointer key, gpointer val, gpointer data)
2058
struct ha_msg* msg = (struct ha_msg*)data;
2059
lrmd_client_t* client;
2061
client = lookup_client_by_name((char *)key);
2064
"%s: client %s not found, probably signed out", __FUNCTION__, (char *)key);
2066
send_msg(msg, client);
2071
/*
 * on_msg_fail_rsc - handler for a request to report a resource as failed
 * asynchronously.
 * The resource is found with lookup_rsc_by_msg().  The failure reason
 * defaults to DEFAULT_FAIL_REASON when absent or empty, and the rc to
 * DEFAULT_FAIL_RC when absent or <= 0.  prepare_failmsg() rewrites the
 * request into the failure report.  If the resource has a last_op_table,
 * every client named in it is notified through async_notify().  The
 * visible lines also send the message to the requesting client.
 * NOTE(review): the return type, the declarations of 'id', 'rsc' and
 * 'fail_rc', the return statements and the condition that selects
 * between the two notification paths are not visible here.
 */
on_msg_fail_rsc(lrmd_client_t* client, struct ha_msg* msg)
2076
const char *fail_reason;
2078
CHECK_ALLOCATED(client, "client", HA_FAIL);
2079
CHECK_ALLOCATED(msg, "message", HA_FAIL);
2081
id = ha_msg_value(msg, F_LRM_RID);
2082
lrmd_debug2(LOG_DEBUG
2083
, "%s: client [%d] wants to fail rsc %s"
2084
, __FUNCTION__, client->pid, lrmd_nullcheck(id));
2086
rsc = lookup_rsc_by_msg(msg);
2088
lrmd_log(LOG_ERR, "%s: no resource with id %s."
2089
, __FUNCTION__, lrmd_nullcheck(id));
2092
/* fall back to the defaults for a missing or unusable reason / rc */
fail_reason = ha_msg_value(msg,F_LRM_FAIL_REASON);
2093
if (!fail_reason || *fail_reason == '\0') {
2094
fail_reason = DEFAULT_FAIL_REASON;
2096
if (HA_OK != ha_msg_value_int(msg,F_LRM_ASYNCMON_RC,&fail_rc) || fail_rc <= 0) {
2097
fail_rc = DEFAULT_FAIL_RC;
2099
if (prepare_failmsg(msg,fail_rc,fail_reason))
2101
lrmd_log(LOG_WARNING
2102
, "received asynchronous failure for rsc %s (rc: %d, reason: %s)"
2103
, lrmd_nullcheck(id), fail_rc, fail_reason);
2104
/* notify all clients from last_op table about the failure */
2105
if (rsc->last_op_table) {
2106
g_hash_table_foreach(rsc->last_op_table,async_notify,msg);
2109
, "rsc to be failed %s had no operations so far", lrmd_nullcheck(id));
2110
send_msg(msg, client);
2116
/*
 * free_str_hash_pair - removal callback for an entry whose value is
 * itself a hash table.
 * The inner table is emptied with free_str_op_pair() and then destroyed.
 * NOTE(review): the return value and any handling of 'key' are not
 * visible here.
 */
free_str_hash_pair(gpointer key, gpointer value, gpointer user_data)
2118
GHashTable* table = (GHashTable*) value;
2120
g_hash_table_foreach_remove(table, free_str_op_pair, NULL);
2121
g_hash_table_destroy(table);
2126
/*
 * free_str_op_pair - removal callback for an entry whose value is an
 * operation (lrmd_op_t *).
 * The op is destroyed with lrmd_op_destroy().  The visible lines also
 * contain an error message for a NULL op.
 * NOTE(review): the condition guarding that message, the return value
 * and any handling of 'key' are not visible here.
 */
free_str_op_pair(gpointer key, gpointer value, gpointer user_data)
2128
lrmd_op_t* op = (lrmd_op_t*)value;
2131
lrmd_log(LOG_ERR, "%s(): NULL op in op_pair(%s)" , __FUNCTION__
2132
, (const char *)key);
2134
lrmd_op_destroy(op);
2140
/*
 * on_msg_add_rsc - handler adding a new resource.
 * The id is required (F_LRM_RID), must be shorter than RID_LEN and must
 * not belong to an existing resource.  The resource object is created by
 * lrmd_rsc_new(id, msg).  Its class is then searched for in
 * ra_class_list; when it is not found, the new resource is destroyed
 * again.  Otherwise the parameter table is taken from F_LRM_PARAM, an
 * empty last_op_table is created and the resource is inserted into
 * 'resources' under a strdup() of its id.
 * NOTE(review): the return type, the declarations of 'node' and 'class'
 * and the return statements are not visible here; the strdup() result
 * is not checked in the visible lines.
 */
on_msg_add_rsc(lrmd_client_t* client, struct ha_msg* msg)
2143
gboolean ra_type_exist = FALSE;
2145
lrmd_rsc_t* rsc = NULL;
2146
const char* id = NULL;
2148
CHECK_ALLOCATED(client, "client", HA_FAIL);
2149
CHECK_ALLOCATED(msg, "message", HA_FAIL);
2151
return_on_no_value(msg, F_LRM_RID,id);
2153
lrmd_debug(LOG_DEBUG
2154
, "on_msg_add_rsc:client [%d] adds resource %s"
2155
, client->pid, lrmd_nullcheck(id));
2157
if (RID_LEN <= strlen(id)) {
2158
lrmd_log(LOG_ERR, "on_msg_add_rsc: rsc_id is too long.");
2162
if (NULL != lookup_rsc(id)) {
2163
lrmd_log(LOG_ERR, "on_msg_add_rsc: same id resource exists.");
2168
rsc = lrmd_rsc_new(id, msg);
2173
/* check that the resource's class is one of the known RA classes */
ra_type_exist = FALSE;
2174
for(node=g_list_first(ra_class_list); NULL!=node; node=g_list_next(node)){
2175
class = (char*)node->data;
2176
if (0 == strncmp(class, rsc->class, MAX_CLASSNAMELEN)) {
2177
ra_type_exist = TRUE;
2181
if (!ra_type_exist) {
2183
, "on_msg_add_rsc: RA class [%s] does not exist."
2185
lrmd_rsc_destroy(rsc);
2191
rsc->last_op_done = NULL;
2192
rsc->params = ha_msg_value_str_table(msg,F_LRM_PARAM);
2193
rsc->last_op_table = g_hash_table_new(g_str_hash, g_str_equal);
2194
g_hash_table_insert(resources, strdup(rsc->id), rsc);
2201
/*
 * cancel_op - cancel the operation with call id cancel_op_id in *listp.
 * The list is walked until an op with a matching call_id is found.  When
 * the resulting rc is neither HA_RSCBUSY nor HA_FAIL, the client is
 * notified at once, the op is removed from the list, its history is
 * removed and the op is destroyed.
 * NOTE(review): the return type, the declarations of 'node' and 'rc',
 * the call that produces 'rc' and the return statement are not visible
 * here.  on_msg_cancel_op() compares the result against HA_OK, HA_FAIL
 * and HA_RSCBUSY.
 */
cancel_op(GList** listp,int cancel_op_id)
2204
lrmd_op_t* op = NULL;
2207
for( node = g_list_first(*listp)
2208
; node; node = g_list_next(node) ) {
2209
op = (lrmd_op_t*)node->data;
2210
if( op->call_id == cancel_op_id ) {
2213
, __FUNCTION__, op_info(op));
2215
if( rc != HA_RSCBUSY && rc != HA_FAIL ) {
2216
notify_client(op); /* send notification now */
2217
*listp = g_list_remove(*listp, op);
2218
remove_op_history(op);
2219
lrmd_op_destroy(op);
2228
/*
 * on_msg_cancel_op - handler cancelling one operation of a resource.
 * The resource comes from lookup_rsc_by_msg() and the operation's call
 * id from F_LRM_CALLID.  The repeat list is tried first; if cancel_op()
 * does not return HA_OK there, the run list is tried.  The outcome is
 * logged and returned: HA_FAIL when there is no such operation,
 * HA_RSCBUSY when it is running and the cancel is pending, HA_OK
 * otherwise.
 * NOTE(review): the return type and the handling of a missing resource
 * are only partly visible here.
 */
on_msg_cancel_op(lrmd_client_t* client, struct ha_msg* msg)
2230
lrmd_rsc_t* rsc = NULL;
2231
int cancel_op_id = 0;
2232
int op_cancelled = HA_OK;
2235
CHECK_ALLOCATED(client, "client", HA_FAIL);
2236
CHECK_ALLOCATED(msg, "message", HA_FAIL);
2238
rsc = lookup_rsc_by_msg(msg);
2241
"%s: no resource with such id.", __FUNCTION__);
2245
return_on_no_int_value(msg, F_LRM_CALLID, &cancel_op_id);
2247
lrmd_debug2(LOG_DEBUG
2248
, "%s:client [pid:%d] cancel the operation [callid:%d]"
2253
/* repeat list first; fall back to the run list */
if( cancel_op(&(rsc->repeat_op_list), cancel_op_id) != HA_OK ) {
2254
op_cancelled = cancel_op(&(rsc->op_list), cancel_op_id);
2256
if( op_cancelled == HA_FAIL ) {
2257
lrmd_log(LOG_INFO, "%s: no operation with id %d",
2258
__FUNCTION__, cancel_op_id);
2259
} else if( op_cancelled == HA_RSCBUSY ) {
2260
lrmd_log(LOG_INFO, "%s: operation %d running, cancel pending",
2261
__FUNCTION__, cancel_op_id);
2263
lrmd_debug(LOG_DEBUG, "%s: operation %d cancelled",
2264
__FUNCTION__, cancel_op_id);
2267
return op_cancelled;
2271
/*
 * flush_all - flush the operations in *listp.
 * client_pid selects whose ops are flushed: a non-zero pid restricts the
 * flush to ops with that client_id, 0 flushes every op.  Each selected
 * op goes through flush_op().  An op reported as HA_RSCBUSY stays in the
 * list; any other op is removed from the list, has its history removed
 * and is destroyed.  After a removal the walk restarts from the new head
 * of the list.
 *
 * NOTE(review): the "else if" condition is the negation of the skip test
 * at the top of the loop, so it appears to be always true when reached,
 * which would make the final "node = g_list_next(node)" unreachable.
 * NOTE(review): the return type, the loop header, the update of
 * 'rsc_busy' and the return statement are not visible here.  Callers use
 * a non-zero result as "resource busy".
 */
flush_all(GList** listp, int client_pid)
2274
lrmd_op_t* op = NULL;
2275
gboolean rsc_busy = FALSE;
2277
node = g_list_first(*listp);
2279
op = (lrmd_op_t*)node->data;
2280
if (client_pid && op->client_id != client_pid) {
2281
node = g_list_next(node);
2282
continue; /* not the client's operation */
2284
if( flush_op(op) == HA_RSCBUSY ) {
2286
node = g_list_next(node);
2287
} else if (!client_pid || op->client_id == client_pid) {
2288
node = *listp = g_list_remove(*listp, op);
2289
remove_op_history(op);
2290
lrmd_op_destroy(op);
2292
node = g_list_next(node);
2299
/*
 * on_msg_flush_all - handler flushing all operations of a resource.
 * The id is required (F_LRM_RID) and the resource is found with
 * lookup_rsc_by_msg().  All repeating ops are flushed (result ignored),
 * then all ops in op_list.  When flush_all() reports that something is
 * still busy, the resource is marked as flushing its ops and that is
 * logged.
 * NOTE(review): the return type and the return statements are not
 * visible here.
 */
on_msg_flush_all(lrmd_client_t* client, struct ha_msg* msg)
2301
lrmd_rsc_t* rsc = NULL;
2302
const char* id = NULL;
2305
CHECK_ALLOCATED(client, "client", HA_FAIL);
2306
CHECK_ALLOCATED(msg, "message", HA_FAIL);
2308
return_on_no_value(msg, F_LRM_RID,id);
2309
rsc = lookup_rsc_by_msg(msg);
2312
"%s: no resource with id %s.", __FUNCTION__,id);
2317
/* when a flush request arrived, flush all pending ops */
2318
lrmd_debug2(LOG_DEBUG
2319
, "%s:client [%d] flush operations"
2320
, __FUNCTION__, client->pid);
2321
(void)flush_all(&(rsc->repeat_op_list),0);
2322
if( flush_all(&(rsc->op_list),0) ) {
2323
set_rsc_flushing_ops(rsc); /* resource busy */
2324
lrmd_log(LOG_INFO, "resource %s busy, all flush pending", rsc->id);
2333
/*
 * on_msg_perform_op - handler queueing a new operation on a resource.
 * Requires F_LRM_RID, F_LRM_INTERVAL, F_LRM_TIMEOUT and F_LRM_DELAY in
 * the message.  A frozen resource is reported with a NOTICE message.
 * The message is stamped with call_id (F_LRM_CALLID) and with the
 * client's app_name (F_LRM_APP).  The op records the call id, the
 * client's pid, a strdup() of the resource id and the interval, and it
 * keeps its own copy of the message.  When F_LRM_COPYPARAMS is set and
 * non-zero, the resource's parameter table is replaced by the one in the
 * message.  An op with a positive delay gets a timer that fires
 * on_repeat_op_readytorun() and is appended to repeat_op_list; the
 * visible lines also hand the op to add_op_to_runlist().
 * NOTE(review): the return type, the declarations of 'op', 'interval',
 * 'timeout' and 'delay', the allocation of 'op', the update of call_id,
 * the branch structure around the last call and the return statements
 * are not visible here.
 */
on_msg_perform_op(lrmd_client_t* client, struct ha_msg* msg)
2335
lrmd_rsc_t* rsc = NULL;
2337
const char* id = NULL;
2343
CHECK_ALLOCATED(client, "client", HA_FAIL);
2344
CHECK_ALLOCATED(msg, "message", HA_FAIL);
2346
return_on_no_value(msg, F_LRM_RID,id);
2347
return_on_no_int_value(msg, F_LRM_INTERVAL, &interval);
2348
return_on_no_int_value(msg, F_LRM_TIMEOUT, &timeout);
2349
return_on_no_int_value(msg, F_LRM_DELAY, &delay);
2351
rsc = lookup_rsc_by_msg(msg);
2354
"%s: no resource with such id.", __FUNCTION__);
2357
if( rsc_frozen(rsc) ) {
2358
lrmd_log(LOG_NOTICE, "%s: resource %s is frozen, "
2359
"no ops can run.", __FUNCTION__, rsc->id);
2366
, "%s:%d: the resource id is NULL"
2367
, __FUNCTION__, __LINE__);
2370
if (HA_OK != ha_msg_add_int(msg, F_LRM_CALLID, call_id)) {
2371
LOG_FAILED_TO_ADD_FIELD("callid");
2374
if (HA_OK !=ha_msg_mod(msg, F_LRM_APP, client->app_name)) {
2375
LOG_FAILED_TO_ADD_FIELD("app_name");
2383
op->call_id = call_id;
2384
op->client_id = client->pid;
2385
op->rsc_id = strdup(rsc->id);
2386
op->interval = interval;
2388
op->weight = no_child_count(rsc) ? 0 : 1;
2390
/* the op keeps its own copy of the request message */
op->msg = ha_msg_copy(msg);
2392
if( ha_msg_value_int(msg,F_LRM_COPYPARAMS,&op->copyparams) == HA_OK
2393
&& op->copyparams ) {
2394
lrmd_debug(LOG_DEBUG
2395
, "%s:%d: copying parameters for rsc %s"
2396
, __FUNCTION__, __LINE__,rsc->id);
2398
free_str_table(rsc->params);
2400
rsc->params = ha_msg_value_str_table(msg, F_LRM_PARAM);
2403
lrmd_debug2(LOG_DEBUG
2404
, "%s: client [%d] want to add an operation %s on resource %s."
2408
, NULL!=op->rsc_id ? op->rsc_id : "#EMPTY#");
2410
/* a delayed op waits in the repeat list until its timer fires */
if ( 0 < op->delay ) {
2411
op->repeat_timeout_tag = Gmain_timeout_add(op->delay
2412
,on_repeat_op_readytorun, op);
2413
rsc->repeat_op_list =
2414
g_list_append (rsc->repeat_op_list, op);
2415
lrmd_debug(LOG_DEBUG
2416
, "%s: an operation %s is added to the repeat "
2417
"operation list for delay execution"
2421
lrmd_debug(LOG_DEBUG
2422
, "%s: add an operation %s to the operation list."
2425
add_op_to_runlist(rsc,op);
2435
/*
 * send_last_op - hash-table foreach callback used by on_msg_get_state().
 * value is an operation and user_data is the IPC channel to write to.
 * The op is converted with op_to_msg() and sent on the channel.  Both a
 * failed conversion and a failed send are logged.
 * key is not used in the visible lines.
 * NOTE(review): the condition guarding the conversion error and the
 * release of 'msg' are not visible here.
 */
send_last_op(gpointer key, gpointer value, gpointer user_data)
2437
IPC_Channel* ch = NULL;
2438
lrmd_op_t* op = NULL;
2439
struct ha_msg* msg = NULL;
2441
ch = (IPC_Channel*)user_data;
2442
op = (lrmd_op_t*)value;
2443
msg = op_to_msg(op);
2445
lrmd_log(LOG_ERR, "send_last_op: failed to convert an operation "
2446
"information to a ha_msg.");
2449
if (HA_OK != msg2ipcchan(msg, ch)) {
2450
lrmd_log(LOG_ERR, "send_last_op: can not send a message.");
2456
/*
 * on_msg_get_state - handler returning the state of a resource together
 * with its operations.
 * The reply is a sequence of messages on the client's command channel:
 *   1. a header with F_LRM_STATE (LRM_RSC_BUSY when op_list is
 *      non-empty, LRM_RSC_IDLE otherwise) and F_LRM_OPCNT, the number of
 *      op messages that follow,
 *   2. one message per entry in this client's last-op table, if any,
 *   3. one message per op in rsc->op_list,
 *   4. one message per op in rsc->repeat_op_list.
 * When the resource is not found, an error is logged and HA_FAIL is sent.
 * NOTE(review): the return type, the declarations of 'node' and
 * 'op_count', several conditions, the return statements and the release
 * of the messages are not visible here.
 */
on_msg_get_state(lrmd_client_t* client, struct ha_msg* msg)
2459
lrmd_rsc_t* rsc = NULL;
2461
struct ha_msg* ret = NULL;
2462
lrmd_op_t* op = NULL;
2463
struct ha_msg* op_msg = NULL;
2464
const char* id = NULL;
2465
GHashTable* last_ops = NULL;
2467
CHECK_ALLOCATED(client, "client", HA_FAIL);
2468
CHECK_ALLOCATED(msg, "message", HA_FAIL);
2470
id = ha_msg_value(msg,F_LRM_RID);
2471
lrmd_debug2(LOG_DEBUG
2472
, "%s: client [%d] want to get the state of resource %s"
2473
, __FUNCTION__, client->pid, lrmd_nullcheck(id));
2475
rsc = lookup_rsc_by_msg(msg);
2477
lrmd_log(LOG_ERR, "on_msg_get_state: no resource with id %s."
2478
, lrmd_nullcheck(id));
2479
send_ret_msg(client->ch_cmd, HA_FAIL);
2483
ret = ha_msg_new(5);
2485
lrmd_log(LOG_ERR, "on_msg_get_state: can't create a ha_msg.");
2488
/* add the F_LRM_STATE field */
2489
if (HA_OK != ha_msg_add_int(ret, F_LRM_STATE
2490
, rsc->op_list ? LRM_RSC_BUSY : LRM_RSC_IDLE)) {
2491
LOG_FAILED_TO_ADD_FIELD("state");
2495
lrmd_debug(LOG_DEBUG
2496
, "on_msg_get_state:state of rsc %s is %s"
2497
, lrmd_nullcheck(id)
2498
, rsc->op_list ? "LRM_RSC_BUSY" : "LRM_RSC_IDLE" );
2499
/* calculate the count of ops being returned */
2500
last_ops = g_hash_table_lookup(rsc->last_op_table, client->app_name);
2501
if (last_ops == NULL) {
2502
op_count = g_list_length(rsc->op_list)
2503
+ g_list_length(rsc->repeat_op_list);
2506
op_count = g_hash_table_size(last_ops)
2507
+ g_list_length(rsc->op_list)
2508
+ g_list_length(rsc->repeat_op_list);
2510
/* add the count of ops being returned */
2511
if (HA_OK != ha_msg_add_int(ret, F_LRM_OPCNT, op_count)) {
2512
LOG_FAILED_TO_ADD_FIELD("operation count");
2516
/* send the first message to client */
2517
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
2519
"on_msg_get_state: can not send the ret message.");
2525
/* send the ops in last ops table */
2526
if(last_ops != NULL) {
2527
g_hash_table_foreach(last_ops, send_last_op, client->ch_cmd);
2530
/* send the ops in op list */
2531
for(node = g_list_first(rsc->op_list)
2532
; NULL != node; node = g_list_next(node)){
2533
op = (lrmd_op_t*)node->data;
2534
op_msg = op_to_msg(op);
2535
if (NULL == op_msg) {
2537
"on_msg_get_state: failed to make a message "
2538
"from a operation: %s", op_info(op));
2541
if (HA_OK != msg2ipcchan(op_msg, client->ch_cmd)) {
2543
"on_msg_get_state: failed to send a message.");
2548
/* send the ops in repeat op list */
2549
for(node = g_list_first(rsc->repeat_op_list)
2550
; NULL != node; node = g_list_next(node)){
2551
op = (lrmd_op_t*)node->data;
2552
op_msg = op_to_msg(op);
2553
if (NULL == op_msg) {
2555
"on_msg_get_state: failed to make a message "
2556
"from a operation: %s", op_info(op));
2559
if (HA_OK != msg2ipcchan(op_msg, client->ch_cmd)) {
2561
"on_msg_get_state: failed to send a message.");
2568
/*
 * safe_len - length of a C string, or 0 when the pointer is NULL.
 * The argument is parenthesized so that an expression argument such as
 * safe_len(cond ? a : b) expands correctly.  The argument is evaluated
 * twice, so it must not have side effects.
 */
#define safe_len(s) ((s) ? strlen(s) : 0)
2571
/*
 * lrm_concat - build a newly allocated string "<prefix><join><suffix>".
 * A NULL prefix or suffix is treated as an empty string, both for the
 * length computation (safe_len) and for the formatting.  The buffer
 * comes from malloc() and is zero-filled before sprintf() writes into
 * it.  A failed allocation is logged.
 * NOTE(review): the return type, the declaration and initial value of
 * 'len' (which must cover the join character and the terminating NUL)
 * and the return statements are not visible here.  mk_op_id() assigns
 * the result to a string that callers later release.
 */
lrm_concat(const char *prefix, const char *suffix, char join)
2574
char *new_str = NULL;
2575
len += safe_len(prefix);
2576
len += safe_len(suffix);
2578
new_str = malloc(sizeof(char)*len);
2579
if (NULL == new_str) {
2580
lrmd_log(LOG_ERR,"%s:%d: malloc failed"
2581
, __FUNCTION__, __LINE__);
2585
memset(new_str, 0, len);
2586
sprintf(new_str, "%s%c%s", prefix?prefix:"", join, suffix?suffix:"");
2591
/* /////////////////////op functions////////////////////// */
2593
/*
 * mk_op_id - build the key that identifies an operation in the last-op
 * tables.
 * The key is "<F_LRM_OP value>_<F_LRM_INTERVAL value>", built from the
 * op's message with lrm_concat() and assigned to 'id'.
 * NOTE(review): 'op' is not parenthesized in the expansion, and the end
 * of the macro is not visible in this view.
 */
#define mk_op_id(op,id) do { \
2594
const char *op_type = ha_msg_value(op->msg, F_LRM_OP); \
2595
const char *op_interval = ha_msg_value(op->msg, F_LRM_INTERVAL); \
2596
id = lrm_concat(op_type, op_interval, '_'); \
2599
/* find the last operation for the client
2600
* replace it with the new one (if requested)
*/
2603
/*
 * replace_last_op - record 'op' as the client's last operation of its
 * kind on 'rsc'.
 * rsc->last_op_table maps a client's app_name to a per-client table.  A
 * missing per-client table is created, with free() as its key destructor
 * and no value destructor, and stored under a strdup() of the app_name.
 * The key for the op is built by mk_op_id().  A copy of the op is stored
 * under that key, either replacing an existing entry (whose old op is
 * then destroyed) or as a new entry.
 * NOTE(review): the return type, the declaration of 'op_hash_key', the
 * condition selecting between replace and insert and the return
 * statements are not visible here.  No release of op_hash_key is visible
 * on the out-of-memory path.
 */
replace_last_op(lrmd_client_t* client, lrmd_rsc_t* rsc, lrmd_op_t* op)
2606
GHashTable *client_last_op;
2607
lrmd_op_t *old_op, *new_op;
2609
if (!client || !rsc || !op)
2611
client_last_op = g_hash_table_lookup(rsc->last_op_table, client->app_name);
2612
if (!client_last_op) {
2613
lrmd_debug2(LOG_DEBUG
2614
, "%s: new last op table for client %s"
2615
, __FUNCTION__, client->app_name);
2616
client_last_op = g_hash_table_new_full( g_str_hash
2617
, g_str_equal, free, NULL);
2618
g_hash_table_insert(rsc->last_op_table
2619
, (gpointer)strdup(client->app_name)
2620
, (gpointer)client_last_op);
2622
mk_op_id(op,op_hash_key);
2623
old_op = (lrmd_op_t*)g_hash_table_lookup(client_last_op, op_hash_key);
2625
/* make a copy of op and insert it into client_last_op */
2626
if (!(new_op = lrmd_op_copy(op))) {
2627
lrmd_log(LOG_ERR, "%s:%d out of memory"
2628
, __FUNCTION__, __LINE__);
2631
lrmd_debug2(LOG_DEBUG
2632
, "%s: replace last op %s for client %s"
2633
, __FUNCTION__, op_hash_key, client->app_name);
2634
g_hash_table_replace(client_last_op,op_hash_key,(gpointer)new_op);
2635
/* the table has no value destructor, so the old op is destroyed here */
lrmd_op_destroy(old_op);
2637
lrmd_debug2(LOG_DEBUG
2638
, "%s: add last op %s for client %s"
2639
, __FUNCTION__, op_hash_key, client->app_name);
2640
g_hash_table_insert(client_last_op,op_hash_key,(gpointer)new_op);
2645
/*
 * record_op_completion - remember a finished operation.
 * A copy of 'op' replaces rsc->last_op_done (the previous one is
 * destroyed first) and the copy's repeat_timeout_tag is cleared.  The
 * owning client is then looked up by op->client_id and the op is stored
 * in that client's last-op table through replace_last_op().  A client
 * that is gone is logged.
 * NOTE(review): the return type, the condition guarding the "client is
 * gone" message and the return statements are not visible here.
 * on_op_done() treats a zero result as success.
 */
record_op_completion(lrmd_rsc_t* rsc, lrmd_op_t* op)
2647
lrmd_client_t* client;
2650
/* save the op as the resource's last finished op */
2651
if (rsc->last_op_done != NULL) {
2652
lrmd_op_destroy(rsc->last_op_done);
2654
if (!(rsc->last_op_done = lrmd_op_copy(op))) {
2655
lrmd_log(LOG_ERR, "%s:%d out of memory"
2656
, __FUNCTION__, __LINE__);
2659
rsc->last_op_done->repeat_timeout_tag = (guint)0;
2661
client = lookup_client(op->client_id);
2663
lrmd_log(LOG_INFO, "%s: cannot record %s: the client is gone"
2664
, __FUNCTION__, small_op_info(op));
2668
/* insert (or replace) the new op in last_op_table for the client */
2669
replace_last_op(client,rsc,op);
2675
/*
 * to_repeatlist - schedule the next run of a repeating operation.
 * A copy of 'op' is made, its timestamps are reset and its is_copy flag
 * is cleared.  A timer with op->interval is armed to call
 * on_repeat_op_readytorun() with the copy, and the copy is appended to
 * rsc->repeat_op_list.  A failed copy is logged as out of memory.
 * NOTE(review): the return type and return statements are not visible.
 */
to_repeatlist(lrmd_rsc_t* rsc, lrmd_op_t* op)
2677
lrmd_op_t *repeat_op;
2679
if (!(repeat_op = lrmd_op_copy(op))) {
2680
lrmd_log(LOG_ERR, "%s:%d out of memory"
2681
, __FUNCTION__, __LINE__);
2683
reset_timestamps(repeat_op);
2684
repeat_op->is_copy = FALSE;
2685
repeat_op->repeat_timeout_tag =
2686
Gmain_timeout_add(op->interval,
2687
on_repeat_op_readytorun, repeat_op);
2688
rsc->repeat_op_list =
2689
g_list_append (rsc->repeat_op_list, repeat_op);
2690
lrmd_debug2(LOG_DEBUG
2691
, "%s: repeat %s is added to repeat op list to wait"
2692
, __FUNCTION__, op_info(op));
2696
/*
 * remove_op_history - forget the recorded outcome of an operation.
 * Nothing is visible for an op whose resource cannot be found.
 * Otherwise:
 *   - if rsc->last_op_done has the same op id (see mk_op_id) as 'op', it
 *     is destroyed and the field is cleared,
 *   - the entry under that id in the owning client's last-op table is
 *     removed and its op destroyed.
 * NOTE(review): the return type, the statement that builds 'op_id', the
 * release of 'op_id' and 'last_op_id', and parts of the conditions are
 * not visible here.
 */
remove_op_history(lrmd_op_t* op)
2698
lrmd_client_t* client = lookup_client(op->client_id);
2699
lrmd_rsc_t* rsc = NULL;
2700
char *op_id, *last_op_id;
2701
lrmd_op_t* old_op = NULL;
2702
GHashTable* client_last_op = NULL;
2705
if( !(rsc = lookup_rsc(op->rsc_id)) ) {
2708
lrmd_debug2(LOG_DEBUG, "%s: remove history of the op %s"
2709
,__FUNCTION__, op_info(op));
2711
if (rsc->last_op_done != NULL ) {
2712
mk_op_id(rsc->last_op_done,last_op_id);
2713
if( !strcmp(op_id,last_op_id) ) {
2714
lrmd_debug2(LOG_DEBUG, "%s: remove history of the last op done %s"
2715
,__FUNCTION__, op_info(rsc->last_op_done));
2716
lrmd_op_destroy(rsc->last_op_done);
2717
rsc->last_op_done = NULL;
2722
(client_last_op = g_hash_table_lookup(rsc->last_op_table
2723
, client->app_name)) ) {
2724
lrmd_debug2(LOG_DEBUG, "%s: found client %s in the last op table"
2725
,__FUNCTION__, client->app_name);
2726
old_op = g_hash_table_lookup(client_last_op, op_id);
2728
g_hash_table_remove(client_last_op, op_id);
2729
lrmd_debug2(LOG_DEBUG, "%s: remove history of the client's last %s"
2730
,__FUNCTION__, op_info(old_op));
2731
lrmd_op_destroy(old_op);
2739
/*
 * add_op_to_runlist - append 'op' to the resource's run queue.
 * The op's t_addtolist timestamp is set to the current longclock time.
 * When the queue holds 4 or more ops afterwards, a warning is logged and
 * the resource is dumped with lrmd_rsc_dump().
 * NOTE(review): the return type and any statements after the dump are
 * not visible here.
 */
add_op_to_runlist(lrmd_rsc_t* rsc, lrmd_op_t* op)
2741
op->t_addtolist = time_longclock();
2742
rsc->op_list = g_list_append(rsc->op_list, op);
2743
if (g_list_length(rsc->op_list) >= 4) {
2744
lrmd_log(LOG_WARNING
2745
, "operations list for %s is suspiciously"
2748
, g_list_length(rsc->op_list));
2749
lrmd_rsc_dump(rsc->id, "rsc->op_list: too many ops");
2753
/* 1. this function sends a message to the client:
2754
* a) on operation instance exit using the callback channel
2755
* b) in case a client requested that operation to be cancelled,
2756
* using the command channel
2757
* c) in case a client requested a resource removal or flushing
2758
* all ops and this is the last operation that finished, again
2759
* using the command channel
2760
* 2. if the op was not cancelled:
2761
* a) it is copied to the last_op_done field of rsc
2762
* b) if it's a repeating op, it is put in the repeat_op_list
2763
* c) the outcome is recorded for future reference
2764
* 3. op is destroyed and removed from the op_list
*/
2767
/*
 * on_op_done - finish processing of an operation that has completed.
 * Visible steps:
 *   - reject an op whose exec_pid is 0, and stamp op->t_done,
 *   - read the target rc and op status (both required) and the current
 *     and previous rc (both default to -1) from the op's message,
 *   - when the op is LRM_OP_DONE and the rc is new or changed, store it
 *     as F_LRM_LASTRC and set t_rcchange to t_perform,
 *   - remove the op from rsc->op_list,
 *   - for an op that was not cancelled, record its completion and, when
 *     it has an interval, queue a copy with to_repeatlist(),
 *   - when the resource's removal is pending, add F_LRM_RSCDELETED to
 *     the message,
 *   - destroy the op; when op_list is then empty, a removal-pending
 *     resource is destroyed (rc becomes -1 to tell the caller), and
 *     rsc_reset_state() is also called in the visible lines.
 * NOTE(review): the return type, the declarations of 'rc' and
 * 'rc_changed', the notification call guarded by the rc tests, several
 * conditions and the return statements are not visible here.
 */
on_op_done(lrmd_rsc_t* rsc, lrmd_op_t* op)
2770
int target_rc, last_rc, op_rc;
2772
op_status_t op_status;
2775
CHECK_ALLOCATED(op, "op", HA_FAIL );
2776
if (op->exec_pid == 0) {
2777
lrmd_log(LOG_ERR, "%s: op->exec_pid == 0",__FUNCTION__);
2780
op->t_done = time_longclock();
2782
if (debug_level >= 2) {
2783
lrmd_debug(LOG_DEBUG, "%s: %s",__FUNCTION__, op_info(op));
2784
lrmd_op_dump(op, __FUNCTION__);
2787
return_on_no_int_value(op->msg,F_LRM_TARGETRC,&target_rc);
2788
return_on_no_int_value(op->msg,F_LRM_OPSTATUS,(int *)&op_status);
2790
last_rc = op_rc = -1; /* set all rc to -1 */
2791
ha_msg_value_int(op->msg,F_LRM_RC,&op_rc);
2792
ha_msg_value_int(op->msg,F_LRM_LASTRC,&last_rc);
2794
op_status == LRM_OP_DONE
2796
&& ((last_rc == -1) || (last_rc != op_rc))
2799
if (HA_OK != ha_msg_mod_int(op->msg, F_LRM_LASTRC, op_rc)) {
2800
lrmd_log(LOG_ERR,"%s: cannot save status to msg",__FUNCTION__);
2803
op->t_rcchange = op->t_perform;
2805
if (store_timestamps(op))
2808
/* remove the op from op_list */
2809
rsc->op_list = g_list_remove(rsc->op_list,op);
2810
lrmd_debug2(LOG_DEBUG
2811
, "%s:%s is removed from op list"
2812
, __FUNCTION__, op_info(op));
2814
if (!op->is_cancelled) {
2815
if( !record_op_completion(rsc,op) ) { /*record the outcome of the op */
2816
if (op->interval) /* copy op to the repeat list */
2817
to_repeatlist(rsc,op);
2820
remove_op_history(op);
2823
if (rsc_removal_pending(rsc)) {
2824
if (HA_OK != ha_msg_add_int(op->msg,F_LRM_RSCDELETED,1)) {
2825
LOG_FAILED_TO_ADD_FIELD(F_LRM_RSCDELETED);
2828
if (op_status != LRM_OP_DONE
2830
|| (op_rc == target_rc)
2831
|| (target_rc == EVERYTIME)
2832
|| ((target_rc == CHANGED) && rc_changed)
2833
|| rsc_removal_pending(rsc)
2837
lrmd_op_destroy(op);
2838
/* the last op of the resource has finished */
if( !rsc->op_list ) {
2839
if( rsc_removal_pending(rsc) ) {
2840
lrmd_log(LOG_INFO, "late removal of resource %s", rsc->id);
2841
lrmd_rsc_destroy(rsc);
2842
rc = -1; /* let the caller know that the rsc is gone */
2844
rsc_reset_state(rsc);
2848
if (shutdown_in_progress && can_shutdown()) {
2855
* an operation is flushed only in case there is
2856
* no process running initiated by this operation
2857
* NB: the caller has to destroy the operation itself
2860
/*
 * Flush an operation on behalf of a client.
 *
 * F_LRM_RC in the op message is set to HA_FAIL.  When no process is
 * attached to the op (exec_pid == -1) the status is set to
 * LRM_OP_CANCELLED.  The remaining statements mark the op as cancelled
 * and log that the flush is delayed because its process is still running
 * (NOTE(review): presumably the alternative branch -- confirm).
 * An op with exec_pid == 0 is reported through lrmd_debug().
 */
flush_op(lrmd_op_t* op)
2862
CHECK_ALLOCATED(op, "op", HA_FAIL );
2863
if (op->exec_pid == 0) {
2864
lrmd_debug(LOG_ERR, "%s: op->exec_pid == 0",__FUNCTION__);
2868
if (HA_OK != ha_msg_mod_int(op->msg, F_LRM_RC, HA_FAIL)) {
2869
LOG_FAILED_TO_ADD_FIELD("F_LRM_RC");
2873
if( op->exec_pid == -1 ) {
2874
if (HA_OK != ha_msg_mod_int(op->msg,F_LRM_OPSTATUS,(int)LRM_OP_CANCELLED)){
2875
LOG_FAILED_TO_ADD_FIELD("opstatus");
2880
op->is_cancelled = TRUE; /* mark the op as cancelled */
2881
lrmd_log(LOG_INFO, "%s: process for %s still "
2882
"running, flush delayed"
2883
,__FUNCTION__,small_op_info(op));
2888
/*
 * Timer callback that resumes the execution of the ops of a resource.
 *
 * data is the lrmd_rsc_t that perform_op() passed to Gmain_timeout_add().
 * A pending delay_timeout is removed from the main loop and the field is
 * cleared so that a new delay can be scheduled later.
 */
2890
rsc_execution_freeze_timeout(gpointer data)
2892
lrmd_rsc_t* rsc = (lrmd_rsc_t*)data;
2898
if (rsc->delay_timeout > 0) {
2899
Gmain_timeout_remove(rsc->delay_timeout);
2900
rsc->delay_timeout = (guint)0;
2908
/*
 * Walk the op list of a resource and start the ops that can run now.
 *
 * Checks made before anything is started: a shutdown in progress, a
 * frozen resource (logged, no ops allowed to run) and an empty op list.
 * While walking the list:
 *  - an op whose exec_pid is not -1 is already running; if other ops
 *    follow it, all ops on the resource are postponed by retry_interval
 *    ms through rsc_execution_freeze_timeout(), unless a delay is already
 *    pending;
 *  - an op with a non-zero weight is postponed the same way when
 *    child_count has reached max_child_count;
 *  - otherwise the op is handed to perform_ra_op(); a failure there is
 *    logged and an op status is added to the op message.
 */
2910
perform_op(lrmd_rsc_t* rsc)
2913
lrmd_op_t* op = NULL;
2916
CHECK_ALLOCATED(rsc, "resource", HA_FAIL);
2917
if (shutdown_in_progress && can_shutdown()) {
2921
if (rsc_frozen(rsc)) {
2922
lrmd_log(LOG_INFO,"%s: resource %s is frozen, "
2923
"no ops allowed to run"
2924
, __FUNCTION__, rsc->id);
2928
if (NULL == rsc->op_list) {
2929
lrmd_debug2(LOG_DEBUG,"%s: no op to perform?", __FUNCTION__);
2933
node = g_list_first(rsc->op_list);
2934
while (NULL != node) {
2936
/* exec_pid other than -1: a process already exists for this op */
if (-1 != op->exec_pid) {
2937
if (!g_list_next(node)) {
2938
/* this is the only operation, no need to do
2939
* anything further */
2942
lrmd_log(LOG_INFO, "%s:%d: %s for rsc is already running."
2943
, __FUNCTION__, __LINE__, op_info(op));
2944
if( rsc->delay_timeout > 0 ) {
2946
, "%s:%d: operations on resource %s already delayed"
2947
, __FUNCTION__, __LINE__, lrm_str(rsc->id));
2950
, "%s:%d: postponing "
2951
"all ops on resource %s by %d ms"
2952
, __FUNCTION__, __LINE__
2953
, lrm_str(rsc->id), retry_interval);
2954
rsc->delay_timeout = Gmain_timeout_add(retry_interval
2955
, rsc_execution_freeze_timeout, rsc);
2959
/* throttle: only ops that carry a weight count against the child limit */
if (op->weight && child_count >= max_child_count) {
2960
if ((int)rsc->delay_timeout > 0) {
2962
, "%s:%d: max_child_count (%d) reached and operations on resource %s already delayed"
2963
, __FUNCTION__, __LINE__, max_child_count, lrm_str(rsc->id));
2965
lrmd_debug(LOG_NOTICE
2966
, "max_child_count (%d) reached, postponing "
2967
"execution of %s by %d ms"
2968
, max_child_count, op_info(op), retry_interval);
2969
rsc->delay_timeout = Gmain_timeout_add(retry_interval
2970
, rsc_execution_freeze_timeout, rsc);
2975
if (HA_OK != perform_ra_op(op)) {
2977
, "unable to perform_ra_op on %s"
2979
if (HA_OK != ha_msg_add_int(op->msg, F_LRM_OPSTATUS,
2981
LOG_FAILED_TO_ADD_FIELD("opstatus");
2984
/* restart from the head of the list after the failure was handled */
node = g_list_first(rsc->op_list);
2996
/*
 * Write the timing information of an op into its message.
 *
 * Fields written with ha_msg_mod_ul():
 *   F_LRM_T_RUN       tm2unix(op->t_perform)
 *   F_LRM_T_RCCHANGE  tm2unix(op->t_rcchange)
 *   F_LRM_EXEC_TIME   exec_time
 *   F_LRM_QUEUE_TIME  queue_time
 * exec_time and queue_time start at zero; when op->t_perform is set, two
 * durations are computed in ms: t_perform - t_addtolist and
 * t_done - t_perform (NOTE(review): presumably queue_time and exec_time
 * respectively -- confirm).  A failure to modify any field is logged.
 */
store_timestamps(lrmd_op_t* op)
2998
struct ha_msg* msg = op->msg;
2999
longclock_t now = time_longclock(), /* tm2unix() needs this */
3000
exec_time = zero_longclock,
3001
queue_time = zero_longclock;
3003
if (op->t_perform) {
3005
longclockto_ms(sub_longclock(op->t_perform,op->t_addtolist));
3008
longclockto_ms(sub_longclock(op->t_done,op->t_perform));
3011
if ((HA_OK!=ha_msg_mod_ul(msg,F_LRM_T_RUN,tm2unix(op->t_perform)))
3012
|| (HA_OK!=ha_msg_mod_ul(msg,F_LRM_T_RCCHANGE,tm2unix(op->t_rcchange)))
3013
|| (HA_OK!=ha_msg_mod_ul(msg,F_LRM_EXEC_TIME,exec_time))
3014
|| (HA_OK!=ha_msg_mod_ul(msg,F_LRM_QUEUE_TIME,queue_time))
3016
lrmd_log(LOG_ERR,"%s: can not save timestamps to msg",__FUNCTION__);
3023
/*
 * Forget the timing information of an op.
 *
 * Clears t_perform and t_done and removes the four timing fields
 * (F_LRM_T_RUN, F_LRM_T_RCCHANGE, F_LRM_EXEC_TIME, F_LRM_QUEUE_TIME)
 * from the op message.  t_addtolist and t_rcchange are not touched here.
 */
reset_timestamps(lrmd_op_t* op)
3025
op->t_perform = zero_longclock;
3026
op->t_done = zero_longclock;
3027
cl_msg_remove(op->msg, F_LRM_T_RUN);
3028
cl_msg_remove(op->msg, F_LRM_T_RCCHANGE);
3029
cl_msg_remove(op->msg, F_LRM_EXEC_TIME);
3030
cl_msg_remove(op->msg, F_LRM_QUEUE_TIME);
3034
/*
 * Build a message describing an op.
 *
 * The op message is duplicated with ha_msg_copy() and F_LRM_CALLID in the
 * copy is set to op->call_id.  Errors (an op with exec_pid == 0, a failed
 * copy, a failed field update) are logged.
 * NOTE(review): the copy is presumably returned to and owned by the
 * caller -- confirm.
 */
op_to_msg(lrmd_op_t* op)
3036
struct ha_msg* msg = NULL;
3038
CHECK_ALLOCATED(op, "op", NULL);
3039
if (op->exec_pid == 0) {
3040
lrmd_log(LOG_ERR, "%s: op->exec_pid is 0",__FUNCTION__);
3043
msg = ha_msg_copy(op->msg);
3045
lrmd_log(LOG_ERR,"%s: can not copy the msg",__FUNCTION__);
3048
if ((HA_OK!=ha_msg_mod_int(msg,F_LRM_CALLID,op->call_id))) {
3049
lrmd_log(LOG_ERR,"%s: can not save F_LRM_CALLID to msg",__FUNCTION__);
3056
/* //////////////////////////////RA wrap funcs/////////////////////////////////// */
3058
/*
 * Fork a child process that runs the resource agent for an op.
 *
 * Before the fork: two pipes are created for the child's stdout and
 * stderr, the resource parameters are merged with the op parameters and
 * written back to F_LRM_PARAM, op->t_perform is stamped, the queue
 * duration is checked, the timeout is read from F_LRM_TIMEOUT and the
 * original privileges are restored with return_to_orig_privs().
 *
 * Parent side: child_count grows by op->weight, the child is registered
 * with NewTrackedProc() using ManagedChildTrackOps, the write ends of the
 * pipes are closed, the read ends are wrapped by ra_pipe_op_new(), and a
 * kill sequence is armed: SIGTERM after the timeout (plus 15000 ms for
 * class "stonith"), SIGKILL 5000 ms later, and a final 5000 ms stage with
 * signal 0.  Privileges are dropped again afterwards.
 *
 * Child side: under DEFAULT_REALTIME_POLICY the scheduler is reset to
 * SCHED_OTHER; stdout/stderr are redirected to the pipes with dup2(); the
 * RAExecOps for rsc->class is looked up; HALOGD is exported; secret
 * parameters are substituted (a failure is tolerated only for "stop") and
 * RAExec->execra() is called, which is not expected to return.
 *
 * NOTE(review): the pipes are created before the exec_pid == 0 check;
 * confirm that the descriptors are closed on that early-exit path.
 */
perform_ra_op(lrmd_op_t* op)
3064
struct RAExecOps * RAExec = NULL;
3065
const char* op_type = NULL;
3066
GHashTable* params = NULL;
3067
GHashTable* op_params = NULL;
3068
lrmd_rsc_t* rsc = NULL;
3069
ra_pipe_op_t * rapop;
3072
CHECK_ALLOCATED(op, "op", HA_FAIL);
3073
rsc = (lrmd_rsc_t*)lookup_rsc(op->rsc_id);
3074
CHECK_ALLOCATED(rsc, "rsc", HA_FAIL);
3076
if ( pipe(stdout_fd) < 0 ) {
3077
cl_perror("%s::%d: pipe", __FUNCTION__, __LINE__);
3080
if ( pipe(stderr_fd) < 0 ) {
3081
cl_perror("%s::%d: pipe", __FUNCTION__, __LINE__);
3084
if (op->exec_pid == 0) {
3085
lrmd_log(LOG_ERR, "%s::%d: op->exec_pid == 0.", __FUNCTION__, __LINE__);
3089
op_type = ha_msg_value(op->msg, F_LRM_OP);
3090
op_params = ha_msg_value_str_table(op->msg, F_LRM_PARAM);
3091
params = merge_str_tables(rsc->params,op_params);
3092
ha_msg_mod_str_table(op->msg, F_LRM_PARAM, params);
3094
free_str_table(op_params);
3098
free_str_table(params);
3101
op->t_perform = time_longclock();
3102
check_queue_duration(op);
3104
if(HA_OK != ha_msg_value_int(op->msg, F_LRM_TIMEOUT, &timeout)){
3106
lrmd_log(LOG_ERR,"%s::%d: failed to get timeout for %s"
3107
, __FUNCTION__, __LINE__, small_op_info(op));
3110
if( return_to_orig_privs() ) {
3111
cl_perror("%s::%d: failed to raise privileges"
3112
, __FUNCTION__, __LINE__);
3114
switch(pid=fork()) {
3116
cl_perror("%s::%d: fork", __FUNCTION__, __LINE__);
3117
close(stdout_fd[0]);
3118
close(stdout_fd[1]);
3119
close(stderr_fd[0]);
3120
close(stderr_fd[1]);
3121
if( return_to_dropped_privs() ) {
3122
cl_perror("%s::%d: failed to drop privileges"
3123
, __FUNCTION__, __LINE__);
3127
default: /* Parent */
3128
child_count += op->weight;
3129
NewTrackedProc(pid, 1
3131
((op->interval && !is_logmsg_due(op)) ? PT_LOGNORMAL : PT_LOGVERBOSE) : PT_LOGNONE
3132
, op, &ManagedChildTrackOps);
3134
if (!op->interval || is_logmsg_due(op)) { /* log non-repeating ops */
3135
lrmd_log(LOG_INFO,"rsc:%s %s[%d] (pid %d)",
3136
rsc->id,probe_str(op,op_type),op->call_id,pid);
3138
lrmd_debug(LOG_DEBUG,"rsc:%s %s[%d] (pid %d)",
3139
rsc->id,op_type,op->call_id,pid);
3141
close(stdout_fd[1]);
3142
close(stderr_fd[1]);
3143
rapop = ra_pipe_op_new(stdout_fd[0], stderr_fd[0], op);
3148
/* Wait 'timeout' ms then send SIGTERM */
3149
/* allow for extra 15 seconds for stonith,
3150
* because stonithd handles its children with the
3151
* same timeout; in this case the lrmd child
3152
* should never timeout, but return the timeout
3153
* reported by stonithd
3155
op->killseq[0].mstimeout = timeout
3156
+ (!strcmp(rsc->class,"stonith") ? 15000 : 0);
3157
op->killseq[0].signalno = SIGTERM;
3159
/* Wait 5 seconds then send SIGKILL */
3160
op->killseq[1].mstimeout = 5000;
3161
op->killseq[1].signalno = SIGKILL;
3163
/* Wait 5 more seconds then moan and complain */
3164
op->killseq[2].mstimeout = 5000;
3165
op->killseq[2].signalno = 0;
3167
SetTrackedProcTimeouts(pid, op->killseq);
3169
if( return_to_dropped_privs() ) {
3170
lrmd_log(LOG_WARNING,"%s::%d: failed to drop privileges: %s"
3171
, __FUNCTION__, __LINE__, strerror(errno));
3174
if ( rapop == NULL) {
3181
#ifdef DEFAULT_REALTIME_POLICY
3182
if (sched_getscheduler(0) != SCHED_OTHER) {
3183
struct sched_param sp;
3184
lrmd_debug(LOG_DEBUG,
3185
"perform_ra_op: resetting scheduler class to SCHED_OTHER");
3186
sp.sched_priority = 0;
3187
if (sched_setscheduler(0, SCHED_OTHER, &sp) == -1)
3188
cl_perror("%s::%d: sched_setscheduler",
3189
__FUNCTION__, __LINE__);
3192
/* Man: The call setpgrp() is equivalent to setpgid(0,0)
3193
* _and_ compiles on BSD variants too
3194
* need to investigate if it works the same too.
3197
close(stdout_fd[0]);
3198
close(stderr_fd[0]);
3199
if (STDOUT_FILENO != stdout_fd[1]) {
3200
if (dup2(stdout_fd[1], STDOUT_FILENO)!=STDOUT_FILENO) {
3201
cl_perror("%s::%d: dup2"
3202
, __FUNCTION__, __LINE__);
3204
close(stdout_fd[1]);
3206
if (STDERR_FILENO != stderr_fd[1]) {
3207
if (dup2(stderr_fd[1], STDERR_FILENO)!=STDERR_FILENO) {
3208
cl_perror("%s::%d: dup2", __FUNCTION__, __LINE__);
3210
close(stderr_fd[1]);
3212
RAExec = g_hash_table_lookup(RAExecFuncs,rsc->class);
3213
if (NULL == RAExec) {
3214
close(stdout_fd[1]);
3215
close(stderr_fd[1]);
3216
lrmd_log(LOG_ERR,"%s::%d: can't find RAExec for class %s"
3217
, __FUNCTION__, __LINE__, rsc->class);
3218
exit(EXECRA_EXEC_UNKNOWN_ERROR);
3221
/*should we use logging daemon or not in script*/
3222
setenv(HALOGD, cl_log_get_uselogd()?"yes":"no",1);
3224
/* Name of the resource and some others also
3225
* need to be passed in. Maybe pass through the
3226
* entire lrm_op_t too? */
3227
lrmd_debug2(LOG_DEBUG
3228
, "perform_ra_op:calling RA plugin to perform %s, pid: [%d]"
3229
, op_info(op), getpid());
3230
params = ha_msg_value_str_table(op->msg, F_LRM_PARAM);
3231
if (replace_secret_params(rsc->id, params) < 0) {
3232
/* replacing secrets failed! */
3233
if (!strcmp(op_type,"stop")) {
3234
/* don't fail on stop! */
3236
, "%s:%d: proceeding with the stop operation for %s"
3237
, __FUNCTION__, __LINE__, rsc->id);
3240
, "%s:%d: failed to get secrets for %s, "
3241
"considering resource not configured"
3242
, __FUNCTION__, __LINE__, rsc->id);
3243
exit(EXECRA_NOT_CONFIGURED);
3246
RAExec->execra (rsc->id,
3253
/* execra should never return. */
3254
exit(EXECRA_EXEC_UNKNOWN_ERROR);
3257
lrmd_log(LOG_ERR, "perform_ra_op: end(impossible).");
3262
/*
 * Callback listed in ManagedChildTrackOps together with the other
 * on_ra_proc_* handlers.
 * NOTE(review): presumably invoked by the proctrack layer when a child is
 * registered and intentionally does nothing -- confirm.
 */
on_ra_proc_registered(ProcTrack* p)
3266
/*
 * Handle the termination of one of our RA child processes.
 *
 * p is the tracked process; the op it ran is obtained with
 * proctrack_data(p).  The handler:
 *  - subtracts op->weight from child_count and reports a negative count;
 *  - removes the kill-sequence timeouts of the process;
 *  - looks up the resource; if it no longer exists the op is dumped and
 *    destroyed and the proctrack data is reset;
 *  - for class "heartbeat" monitor/status ops whose first stdout line is
 *    still incomplete, reads the stdout pipe once more;
 *  - sets the op status: LRM_OP_TIMEOUT when the process timed out,
 *    otherwise derived from the mapped return code
 *    (RAExec->map_ra_retvalue()), with EXECRA_EXEC_UNKNOWN_ERROR and
 *    EXECRA_NO_RA giving LRM_OP_ERROR;
 *  - stores F_LRM_OPSTATUS, F_LRM_RC and, when there is output, the first
 *    stdout line as F_LRM_DATA in the op message;
 *  - passes the op to on_op_done() and resets the proctrack data.
 *
 * NOTE(review): op->weight is read before CHECK_ALLOCATED(op, ...) runs,
 * so a NULL op would be dereferenced ahead of the check.
 */
3268
on_ra_proc_finished(ProcTrack* p, int status, int signo, int exitcode
3271
lrmd_op_t* op = NULL;
3272
lrmd_rsc_t* rsc = NULL;
3273
struct RAExecOps * RAExec = NULL;
3274
const char* op_type;
3275
int rc = EXECRA_EXEC_UNKNOWN_ERROR;
3281
CHECK_ALLOCATED(p, "ProcTrack p", );
3282
op = proctrack_data(p);
3284
child_count -= op->weight;
3285
if (child_count < 0) {
3286
lrmd_log(LOG_ERR, "%s:%d: child count is less than zero: %d"
3287
, __FUNCTION__, __LINE__, child_count);
3291
lrmd_debug2(LOG_DEBUG, "on_ra_proc_finished: accessing the op whose "
3292
"address is %p", op);
3293
CHECK_ALLOCATED(op, "op", );
3294
if (op->exec_pid == 0) {
3295
lrmd_log(LOG_ERR, "on_ra_proc_finished: the op was freed.");
3296
dump_data_for_debug();
3299
RemoveTrackedProcTimeouts(op->exec_pid);
3302
rsc = lookup_rsc(op->rsc_id);
3304
lrmd_log(LOG_ERR, "%s: the rsc (id=%s) does not exist"
3305
, __FUNCTION__, lrm_str(op->rsc_id));
3306
lrmd_op_dump(op, __FUNCTION__);
3307
lrmd_dump_all_resources();
3309
lrmd_op_destroy(op);
3310
reset_proctrack_data(p);
3315
RAExec = g_hash_table_lookup(RAExecFuncs,rsc->class);
3316
if (NULL == RAExec) {
3317
lrmd_log(LOG_ERR,"on_ra_proc_finished: can not find RAExec for"
3318
" resource class <%s>", rsc->class);
3319
dump_data_for_debug();
3323
op_type = ha_msg_value(op->msg, F_LRM_OP);
3325
/* the first output line is not complete yet: try one more read */
if ( (NULL == strchr(op->first_line_ra_stdout, '\n'))
3326
&& (0==STRNCMP_CONST(rsc->class, "heartbeat"))
3327
&& ( (0==STRNCMP_CONST(op_type, "monitor"))
3328
||(0==STRNCMP_CONST(op_type, "status"))) ) {
3329
if ( ( op->rapop != NULL )
3330
&& (op->rapop->ra_stdout_fd >= 0) ) {
3331
handle_pipe_ra_stdout(op->rapop->ra_stdout_fd
3334
lrmd_log(LOG_WARNING, "There is something wrong: the "
3335
"first line isn't read in. Maybe the heartbeat "
3336
"does not ouput string correctly for status "
3337
"operation. Or the code (myself) is wrong.");
3342
if( proctrack_timedout(p) ) {
3343
lrmd_log(LOG_WARNING, "%s: pid %d timed out"
3344
, small_op_info(op), proctrack_pid(p));
3345
op_status = LRM_OP_TIMEOUT;
3347
op_status = LRM_OP_ERROR;
3350
rc = RAExec->map_ra_retvalue(exitcode, op_type
3351
, op->first_line_ra_stdout);
3352
if (!op->interval || is_logmsg_due(op) || debug_level > 0) { /* log non-repeating ops */
3353
if (rc == exitcode) {
3355
, "%s: pid %d exited with"
3356
" return code %d", small_op_info(op), proctrack_pid(p), rc);
3359
, "%s: pid %d exited with"
3360
" return code %d (mapped from %d)"
3361
, small_op_info(op), proctrack_pid(p), rc, exitcode);
3364
if (EXECRA_EXEC_UNKNOWN_ERROR == rc || EXECRA_NO_RA == rc) {
3365
op_status = LRM_OP_ERROR;
3367
, "on_ra_proc_finished: the exit code indicates a problem.");
3369
op_status = LRM_OP_DONE;
3372
/* remember when a repeating op was last logged */
if (op->interval && is_logmsg_due(op)) {
3373
op->t_lastlogmsg = time_longclock();
3376
ha_msg_mod_int(op->msg, F_LRM_OPSTATUS, op_status)) {
3377
LOG_FAILED_TO_ADD_FIELD("opstatus");
3380
if (HA_OK != ha_msg_mod_int(op->msg, F_LRM_RC, rc)) {
3381
LOG_FAILED_TO_ADD_FIELD("F_LRM_RC");
3385
if ( 0 < strlen(op->first_line_ra_stdout) ) {
3386
/* replace, rather than duplicate, a data field left in the message */
if (NULL != cl_get_string(op->msg, F_LRM_DATA)) {
3387
cl_msg_remove(op->msg, F_LRM_DATA);
3389
ret = ha_msg_add(op->msg, F_LRM_DATA, op->first_line_ra_stdout);
3391
LOG_FAILED_TO_ADD_FIELD("data");
3395
if (on_op_done(rsc,op) >= 0) {
3398
reset_proctrack_data(p);
3402
/*
 * Produce a printable name for a tracked RA child process.
 *
 * The name is formatted into a function-static buffer of MAX_PROC_NAME
 * bytes as "<rsc id>:<op type>"; the buffer is shared by all calls, so
 * each call overwrites the previous result.  An op that is NULL or has
 * exec_pid == 0 is handled separately, and so is an op whose resource can
 * no longer be found ("unknown rsc(...)").
 * NOTE(review): the comment that used to sit here described the death of
 * a managed child, which does not match this function -- confirm it was
 * not meant as a section heading.
 */
3404
on_ra_proc_query_name(ProcTrack* p)
3406
static char proc_name[MAX_PROC_NAME];
3407
lrmd_op_t* op = NULL;
3408
lrmd_rsc_t* rsc = NULL;
3409
const char* op_type = NULL;
3412
op = (lrmd_op_t*)(proctrack_data(p));
3413
if (NULL == op || op->exec_pid == 0) {
3417
op_type = ha_msg_value(op->msg, F_LRM_OP);
3418
rsc = lookup_rsc(op->rsc_id);
3422
, "unknown rsc(%s):%s maybe deleted"
3423
, op->rsc_id, op_type);
3425
snprintf(proc_name, MAX_PROC_NAME, "%s:%s", rsc->id, op_type);
3432
/*
 * Read an lrmd runtime parameter as a string.
 *
 * name      parameter name; an empty name is logged as an error
 * value     output buffer
 * maxstring size of value, passed to snprintf()
 *
 * The only parameter handled here is "max-children", which yields
 * max_child_count in decimal.  Any other name is logged as unknown.
 */
get_lrmd_param(const char *name, char *value, int maxstring)
3435
lrmd_log(LOG_ERR, "%s: empty name", __FUNCTION__);
3438
if (!strcmp(name,"max-children")) {
3439
snprintf(value, maxstring, "%d", max_child_count);
3442
lrmd_log(LOG_ERR, "%s: unknown lrmd parameter %s", __FUNCTION__, name);
3448
/*
 * Set an lrmd runtime parameter from its string form.
 *
 * An empty name or an empty value is logged as an error.  For
 * "max-children" the parsed value (ival) is validated -- an invalid
 * value is logged -- and then stored in max_child_count, with an
 * informational log entry.  Any other name is logged as unknown.
 */
set_lrmd_param(const char *name, const char *value)
3453
lrmd_log(LOG_ERR, "%s: empty name", __FUNCTION__);
3457
lrmd_log(LOG_ERR, "%s: empty value", __FUNCTION__);
3460
if (!strcmp(name,"max-children")) {
3463
lrmd_log(LOG_ERR, "%s: invalid value for lrmd parameter %s"
3464
, __FUNCTION__, name);
3467
lrmd_log(LOG_INFO, "setting max-children to %d", ival);
3468
max_child_count = ival;
3471
lrmd_log(LOG_ERR, "%s: unknown lrmd parameter %s"
3472
, __FUNCTION__, name);
3478
/*
 * Message handler: set an lrmd parameter on request of a client.
 *
 * The parameter name and value are taken from the
 * F_LRM_LRMD_PARAM_NAME and F_LRM_LRMD_PARAM_VAL fields of msg.  If
 * either is missing an error is logged; otherwise the result of
 * set_lrmd_param() is returned.
 */
on_msg_set_lrmd_param(lrmd_client_t* client, struct ha_msg* msg)
3480
const char *name, *value;
3482
CHECK_ALLOCATED(client, "client", HA_FAIL);
3483
CHECK_ALLOCATED(msg, "message", HA_FAIL);
3485
name = ha_msg_value(msg,F_LRM_LRMD_PARAM_NAME);
3486
value = ha_msg_value(msg,F_LRM_LRMD_PARAM_VAL);
3487
if (!name || !value) {
3488
lrmd_log(LOG_ERR, "%s: no parameter defined"
3492
return set_lrmd_param(name,value);
3496
/*
 * Message handler: report an lrmd parameter to a client.
 *
 * A reply is created with create_lrm_ret(HA_OK, 1).  The parameter named
 * by F_LRM_LRMD_PARAM_NAME is read with get_lrmd_param() into a local
 * buffer of MAX_NAME_LEN bytes, added to the reply as
 * F_LRM_LRMD_PARAM_VAL, and the reply is sent on the client's command
 * channel (client->ch_cmd).  Failures to add the field or to send are
 * logged.
 */
on_msg_get_lrmd_param(lrmd_client_t* client, struct ha_msg* msg)
3498
struct ha_msg* ret = NULL;
3500
char value[MAX_NAME_LEN];
3502
CHECK_ALLOCATED(client, "client", HA_FAIL);
3503
CHECK_ALLOCATED(msg, "message", HA_FAIL);
3505
ret = create_lrm_ret(HA_OK, 1);
3506
CHECK_RETURN_OF_CREATE_LRM_RET;
3508
name = ha_msg_value(msg,F_LRM_LRMD_PARAM_NAME);
3509
if (get_lrmd_param(name, value, MAX_NAME_LEN) != HA_OK) {
3512
if (HA_OK != ha_msg_add(ret, F_LRM_LRMD_PARAM_VAL, value)) {
3514
LOG_FAILED_TO_ADD_FIELD(F_LRM_LRMD_PARAM_VAL);
3517
if (HA_OK != msg2ipcchan(ret, client->ch_cmd)) {
3518
lrmd_log(LOG_ERR, "%s: can not send the ret msg",__FUNCTION__);
3525
/* /////////////////Util Functions////////////////////////////////////////////// */
3527
/*
 * Send a bare return-code reply on an IPC channel.
 *
 * ch   channel to send on
 * ret  return code placed in the reply by create_lrm_ret(ret, 1)
 *
 * A failure of msg2ipcchan() is logged.
 */
send_ret_msg (IPC_Channel* ch, int ret)
3529
struct ha_msg* msg = NULL;
3531
msg = create_lrm_ret(ret, 1);
3532
CHECK_RETURN_OF_CREATE_LRM_RET;
3534
if (HA_OK != msg2ipcchan(msg, ch)) {
3535
lrmd_log(LOG_ERR, "send_ret_msg: can not send the ret msg");
3542
/*
 * Send a message to a client over its callback channel.
 *
 * Three problems are reported as warnings: a missing client
 * ("zero client"), a client without a callback channel, and a failure of
 * msg2ipcchan() on client->ch_cbk.
 */
send_cbk_msg(struct ha_msg* msg, lrmd_client_t* client)
3545
lrmd_log(LOG_WARNING,
3546
"%s: zero client", __FUNCTION__);
3549
if (!client->ch_cbk) {
3550
lrmd_log(LOG_WARNING,
3551
"%s: callback channel is null", __FUNCTION__);
3552
} else if (HA_OK != msg2ipcchan(msg, client->ch_cbk)) {
3553
lrmd_log(LOG_WARNING,
3554
"%s: can not send the ret msg", __FUNCTION__);
3559
/*
 * Tag a message with the client's application name and send it.
 *
 * F_LRM_APP in msg is set to client->app_name (a failure is logged as an
 * error) and the message goes out through send_cbk_msg().  A missing
 * client is reported as a warning.
 */
send_msg(struct ha_msg* msg, lrmd_client_t* client)
3562
lrmd_log(LOG_WARNING,
3563
"%s: zero client", __FUNCTION__);
3566
if (HA_OK != ha_msg_mod(msg,F_LRM_APP,client->app_name)) {
3567
lrmd_log(LOG_ERR,"%s:%d: cannot add field to a message"
3568
, __FUNCTION__, __LINE__);
3571
send_cbk_msg(msg, client);
3575
/*
 * Deliver the result of an op to the client that requested it.
 *
 * The client is looked up by op->client_id.  The op message is sent over
 * the client's callback channel; a warning is logged when the client for
 * the operation does not exist.
 */
notify_client(lrmd_op_t* op)
3577
lrmd_client_t* client = lookup_client(op->client_id);
3580
/* send the result to client */
3581
send_cbk_msg(op->msg, client);
3583
lrmd_log(LOG_WARNING
3584
, "%s: client for the operation %s does not exist"
3585
" and client requested notification."
3586
, __FUNCTION__, op_info(op));
3591
/*
 * Find a registered client by pid.
 * The `clients` hash table is queried with a pointer to the pid as key;
 * the result of g_hash_table_lookup() is returned unchanged, so an
 * unknown pid yields NULL.
 */
lookup_client (pid_t pid)
3593
return (lrmd_client_t*) g_hash_table_lookup(clients, &pid);
3597
/*
 * Predicate for g_hash_table_find() (see lookup_client_by_name()):
 * compares the app_name of the client stored in val with the requested
 * app_name.  key is not used in the comparison.
 * NOTE(review): the strcmp() result selects the return value --
 * presumably FALSE on mismatch and TRUE on a match; confirm.
 */
client_cmp_name(gpointer key, gpointer val, gpointer app_name)
3599
return strcmp(((lrmd_client_t*)val)->app_name,(char *)app_name) ?
3603
/*
 * Find a registered client by application name.
 * Scans the `clients` hash table with client_cmp_name() as predicate and
 * returns whatever g_hash_table_find() yields (NULL when no entry
 * satisfies the predicate).
 */
static lrmd_client_t*
3604
lookup_client_by_name(char *app_name)
3606
return (lrmd_client_t*)g_hash_table_find(clients,client_cmp_name,app_name);
3610
/*
 * Find a resource by id in the `resources` hash table.
 * The lookup is one arm of a conditional expression.
 * NOTE(review): presumably the condition guards against a NULL rid (or
 * table) and the other arm is NULL -- confirm.
 */
lookup_rsc (const char* rid)
3613
(lrmd_rsc_t*)g_hash_table_lookup(resources, rid) :
3618
/*
 * Find the resource a message refers to.
 *
 * The resource id is read from F_LRM_RID.  A missing id and an id of
 * RID_LEN or more characters are both logged as errors; otherwise the id
 * is resolved with lookup_rsc().
 */
lookup_rsc_by_msg (struct ha_msg* msg)
3620
const char* id = NULL;
3621
lrmd_rsc_t* rsc = NULL;
3623
CHECK_ALLOCATED(msg, "msg", NULL);
3624
id = ha_msg_value(msg, F_LRM_RID);
3626
lrmd_log(LOG_ERR, "lookup_rsc_by_msg: got a NULL resource id.");
3629
/* strnlen() is bounded, so an unterminated id cannot run away */
if (RID_LEN <= strnlen(id, RID_LEN+2)) {
3630
lrmd_log(LOG_ERR, "lookup_rsc_by_msg: resource id is too long.");
3633
rsc = lookup_rsc(id);
3638
/*
 * Destroy notification for the stdout pipe source of an RA child.
 * user_data is the ra_pipe_op_t shared by the stdout and stderr sources;
 * it is destroyed here only when the stderr descriptor has already been
 * invalidated (ra_stderr_fd < 0), i.e. when this is the last of the two
 * sources to go away.
 */
destroy_pipe_ra_stdout(gpointer user_data)
3640
ra_pipe_op_t * rapop = (ra_pipe_op_t *)user_data;
3642
CHECK_ALLOCATED(rapop, "ra_pipe_op",);
3643
if (rapop->ra_stderr_fd < 0) {
3644
ra_pipe_op_destroy(rapop);
3649
/*
 * Destroy notification for the stderr pipe source of an RA child.
 * Mirror image of destroy_pipe_ra_stdout(): the shared ra_pipe_op_t is
 * destroyed here only when the stdout descriptor has already been
 * invalidated (ra_stdout_fd < 0).
 */
destroy_pipe_ra_stderr(gpointer user_data)
3651
ra_pipe_op_t * rapop = (ra_pipe_op_t *)user_data;
3653
CHECK_ALLOCATED(rapop, "ra_pipe_op",);
3654
if (rapop->ra_stdout_fd < 0) {
3655
ra_pipe_op_destroy(rapop);
3660
/*
 * Input callback for the stdout pipe of an RA child.
 *
 * fd         descriptor to read; values up to STDERR_FILENO are rejected
 *            with a critical log entry
 * user_data  the ra_pipe_op_t of the child
 *
 * Data is fetched with read_pipe().  A non-zero result from read_pipe()
 * means error or end of file: ra_stdout_fd is set to -1 and the stored
 * GSource pointer is cleared (the source itself is released by the
 * caller's machinery, not here).
 * Output of "meta-data", "monitor" and "status" ops is logged at debug
 * level, everything else at LOG_INFO.
 * For class "heartbeat" monitor/status ops the data is additionally
 * appended to lrmd_op->first_line_ra_stdout (bounded by the remaining
 * space in that buffer) until a newline has been collected, after which
 * first_line_read is set.
 */
handle_pipe_ra_stdout(int fd, gpointer user_data)
3663
ra_pipe_op_t * rapop = (ra_pipe_op_t *)user_data;
3665
lrmd_op_t* lrmd_op = NULL;
3667
CHECK_ALLOCATED(rapop, "ra_pipe_op", FALSE);
3669
if (rapop->lrmd_op == NULL) {
3670
lrmd_debug2(LOG_DEBUG, "%s:%d: Unallocated lrmd_op 0x%lx!!"
3671
, __FUNCTION__, __LINE__
3672
, (unsigned long)rapop->lrmd_op);
3674
lrmd_op = rapop->lrmd_op;
3677
if (fd <= STDERR_FILENO) {
3678
lrmd_log(LOG_CRIT, "%s:%d: Attempt to read from "
3679
"closed/invalid file descriptor %d."
3680
, __FUNCTION__, __LINE__, fd);
3684
if (0 != read_pipe(fd, &data, rapop)) {
3685
/* error or reach the EOF */
3686
if (fd > STDERR_FILENO) {
3688
if (fd == rapop->ra_stdout_fd) {
3689
rapop->ra_stdout_fd = -1;
3692
if ( NULL != rapop->ra_stdout_gsource) {
3694
* Returning FALSE will trigger ipc code to release
3695
* the GFDSource, so donn't release it here.
3697
rapop->ra_stdout_gsource = NULL;
3703
if ( (0==STRNCMP_CONST(rapop->op_type, "meta-data"))
3704
||(0==STRNCMP_CONST(rapop->op_type, "monitor"))
3705
||(0==STRNCMP_CONST(rapop->op_type, "status")) ) {
3706
lrmd_debug(LOG_DEBUG, "RA output: (%s:%s:stdout) %s"
3707
, lrm_str(rapop->rsc_id), rapop->op_type, data);
3709
lrmd_log(LOG_INFO, "RA output: (%s:%s:stdout) %s"
3710
, lrm_str(rapop->rsc_id), rapop->op_type, data);
3714
* This code isn't good enough, it produces erratic and hard-to
3715
* read messages in the logs. But this does not affect the
3716
* function correctness, since the first line output is ensured
3717
* to be collected into the buffer completely.
3718
* Anyway, the meta-data (which is _many_ lines long) can be
3719
* handled by another function, see raexec.h
3721
if ( (rapop->first_line_read == FALSE)
3722
&& (0==STRNCMP_CONST(rapop->rsc_class, "heartbeat"))
3723
&& ( lrmd_op != NULL )
3724
&& ( (0==STRNCMP_CONST(rapop->op_type, "monitor"))
3725
||(0==STRNCMP_CONST(rapop->op_type, "status")) )) {
3726
if (lrmd_op != NULL) {
3727
strncat(lrmd_op->first_line_ra_stdout, data
3728
, sizeof(lrmd_op->first_line_ra_stdout) -
3729
strlen(lrmd_op->first_line_ra_stdout)-1);
3730
if (strchr(lrmd_op->first_line_ra_stdout, '\n')
3732
rapop->first_line_read = TRUE;
3736
, "Before read the first line, the RA "
3737
"execution child quitted and waited.");
3748
/*
 * Input callback for the stderr pipe of an RA child.
 *
 * Same structure as handle_pipe_ra_stdout(): descriptors up to
 * STDERR_FILENO are rejected with a critical log entry, data is fetched
 * with read_pipe(), and on error or end of file ra_stderr_fd is set to -1
 * and the stored GSource pointer is cleared.  Everything read is logged
 * at LOG_INFO with the resource id and the op type (via probe_str()).
 */
handle_pipe_ra_stderr(int fd, gpointer user_data)
3752
ra_pipe_op_t * rapop = (ra_pipe_op_t *)user_data;
3754
CHECK_ALLOCATED(rapop, "ra_pipe_op", FALSE);
3756
if (fd <= STDERR_FILENO) {
3757
lrmd_log(LOG_CRIT, "%s:%d: Attempt to read from "
3758
" closed/invalid file descriptor %d."
3759
, __FUNCTION__, __LINE__, fd);
3763
if (0 != read_pipe(fd, &data, rapop)) {
3764
/* error or reach the EOF */
3765
if (fd > STDERR_FILENO) {
3767
if (fd == rapop->ra_stderr_fd) {
3768
rapop->ra_stderr_fd = -1;
3771
if ( NULL != rapop->ra_stderr_gsource) {
3773
* G_main_del_fd will trigger
3774
* destroy_pipe_ra_stderr
3775
* ra_pipe_op_destroy
3777
* Returning FALSE will trigger ipc code to release
3778
* the GFDSource, so donn't release it here.
3780
rapop->ra_stderr_gsource = NULL;
3786
lrmd_log(LOG_INFO, "RA output: (%s:%s:stderr) %s"
3787
, lrm_str(rapop->rsc_id), probe_str(rapop->lrmd_op,rapop->op_type), data);
3795
/*
 * Drain what is currently available on a pipe into a new string.
 *
 * fd         descriptor to read from
 * data       out: on success with data, receives the character buffer of
 *            the accumulated GString (the GString wrapper is freed with
 *            free_segment == FALSE, so the buffer stays allocated and is
 *            handed over through *data)
 * user_data  the ra_pipe_op_t of the child; its lrmd_op is used for
 *            diagnostics only
 *
 * Reads in chunks of BUFFLEN - 1 (80) bytes, terminating each chunk with
 * EOS and appending it to a GString.  The loop continues while a chunk
 * was filled completely or errno is EINTR.  Read errors and reads from a
 * closed descriptor are logged, with an op dump when the op is still
 * around.  When nothing was read, the GString is freed together with its
 * buffer.
 */
read_pipe(int fd, char ** data, void * user_data)
3797
const int BUFFLEN = 81;
3798
char buffer[BUFFLEN];
3802
lrmd_op_t * op = NULL;
3803
ra_pipe_op_t * rapop = (ra_pipe_op_t *)user_data;
3805
lrmd_debug3(LOG_DEBUG, "%s begin.", __FUNCTION__);
3807
CHECK_ALLOCATED(rapop, "ra_pipe_op", FALSE);
3809
op = (lrmd_op_t *)rapop->lrmd_op;
3811
lrmd_debug2(LOG_DEBUG, "%s:%d: Unallocated lrmd_op 0x%lx!!"
3812
, __FUNCTION__, __LINE__
3813
, (unsigned long)op);
3817
gstr_tmp = g_string_new("");
3821
/* leave room for the terminator written below */
readlen = read(fd, buffer, BUFFLEN - 1);
3823
lrmd_debug2(LOG_NOTICE
3824
, "read's ret: %d when lrmd_op finished"
3827
if ( readlen > 0 ) {
3828
buffer[readlen] = EOS;
3829
g_string_append(gstr_tmp, buffer);
3831
} while (readlen == BUFFLEN - 1 || errno == EINTR);
3833
if (errno == EINTR || errno == EAGAIN) {
3842
if ((readlen < 0) && (errno !=0)) {
3846
cl_perror("%s:%d read error: fd %d errno=%d"
3847
, __FUNCTION__, __LINE__
3850
lrmd_op_dump(op, "op w/bad errno");
3853
, "%s::%d: lrmd_op has been freed"
3854
, __FUNCTION__, __LINE__);
3861
" Attempt to read from closed file descriptor %d."
3862
, __FUNCTION__, __LINE__, fd);
3864
lrmd_op_dump(op, "op w/bad errno");
3867
, "%s::%d: lrmd_op has been freed"
3868
, __FUNCTION__, __LINE__);
3874
if ( gstr_tmp->len == 0 ) {
3875
g_string_free(gstr_tmp, TRUE);
3877
/* keep the character data, drop only the GString wrapper */
*data = gstr_tmp->str;
3878
g_string_free(gstr_tmp, FALSE);
3881
lrmd_debug3(LOG_DEBUG, "%s end.", __FUNCTION__);
3887
/*
 * Signal callback that adjusts debug_level.
 *
 * nsig is the signal that arrived.  The handled cases call
 * dump_data_for_debug(); debug_level is tested against going below zero;
 * an unexpected signal is logged as a warning.  Finally the current
 * debug_level is formatted and exported through the HADEBUGVAL
 * environment variable.
 * NOTE(review): which signal raises and which lowers the level is not
 * evident from these statements -- confirm.
 */
debug_level_adjust(int nsig, gpointer user_data)
3894
dump_data_for_debug();
3898
dump_data_for_debug();
3900
if (debug_level < 0) {
3906
lrmd_log(LOG_WARNING, "debug_level_adjust: Received an "
3907
"unexpected signal(%d). Something wrong?.",nsig);
3910
snprintf(s, sizeof(s), "%d", debug_level);
3911
setenv(HADEBUGVAL, s, 1);
3916
/*
 * Dump the daemon's internal state to the log for debugging: all clients
 * first, then all resources, bracketed by begin/end debug messages.
 */
dump_data_for_debug(void)
3918
lrmd_debug(LOG_DEBUG, "begin to dump internal data for debugging.");
3919
lrmd_dump_all_clients();
3920
lrmd_dump_all_resources();
3921
lrmd_debug(LOG_DEBUG, "end to dump internal data for debugging.");
3925
/*
 * Format a one-line human-readable description of an op.
 *
 * op          the op to describe; NULL is logged as an error
 * add_params  NOTE(review): presumably selects whether the op parameters
 *             are appended -- confirm
 *
 * The text is written into a function-static buffer of 512 bytes, so the
 * result is overwritten by the next call and must not be freed.
 * Three layouts are used: one for an op whose resource cannot be found
 * ("unknown rsc(maybe deleted)"), one including the pid when
 * exec_pid > 1, and one without the pid.  The parameter list, taken from
 * F_LRM_PARAM and rendered by hash_to_str(), is appended after
 * ", its parameters: " within the space left in the buffer.
 */
gen_op_info(const lrmd_op_t* op, gboolean add_params)
3927
static char info[512];
3928
lrmd_rsc_t* rsc = NULL;
3929
const char * op_type;
3930
GString * param_gstr;
3931
GHashTable* op_params = NULL;
3934
lrmd_log(LOG_ERR, "%s:%d: op==NULL"
3935
, __FUNCTION__, __LINE__);
3938
rsc = lookup_rsc(op->rsc_id);
3939
op_type = ha_msg_value(op->msg, F_LRM_OP);
3942
snprintf(info,sizeof(info)
3943
,"operation %s[%d] on unknown rsc(maybe deleted) for client %d"
3945
,op->call_id ,op->client_id);
3948
if (op->exec_pid > 1) {
3949
snprintf(info, sizeof(info)
3950
,"operation %s[%d] with pid %d on %s for client %d"
3951
,lrm_str(op_type), op->call_id, op->exec_pid, lrm_str(rsc->id)
3954
snprintf(info, sizeof(info)
3955
,"operation %s[%d] on %s for client %d"
3956
,lrm_str(op_type), op->call_id, lrm_str(rsc->id)
3961
param_gstr = g_string_new("");
3962
op_params = ha_msg_value_str_table(op->msg, F_LRM_PARAM);
3963
hash_to_str(op_params, param_gstr);
3965
free_str_table(op_params);
3969
/* append in place; snprintf() truncates at the end of the buffer */
snprintf(info+strlen(info), sizeof(info)-strlen(info)
3970
,", its parameters: %s",param_gstr->str);
3972
g_string_free(param_gstr, TRUE);
3979
/*
 * Render a string-to-string hash table into str by visiting every entry
 * with hash_to_str_foreach(), which appends to the GString passed as
 * user data.
 */
hash_to_str(GHashTable * params , GString * str)
3982
g_hash_table_foreach(params, hash_to_str_foreach, str);
3987
/*
 * g_hash_table_foreach() callback used by hash_to_str().
 * Formats one entry as "key=[value] " and appends it to the GString in
 * user_data.  The entry is formatted through an 80-byte scratch buffer,
 * so longer key/value pairs are truncated by g_snprintf().
 */
hash_to_str_foreach(gpointer key, gpointer value, gpointer user_data)
3989
char buffer_tmp[80];
3990
GString * str = (GString *)user_data;
3992
g_snprintf(buffer_tmp, sizeof(buffer_tmp), "%s=[%s] "
3993
, (char *)key, (char *)value);
3994
str = g_string_append(str, buffer_tmp);
3998
check_queue_duration(lrmd_op_t* op)
4000
unsigned long t_stay_in_list = 0;
4001
static struct msg_ctrl *ml;
4003
CHECK_ALLOCATED(op, "op", );
4004
t_stay_in_list = longclockto_ms(op->t_perform - op->t_addtolist);
4005
if ( t_stay_in_list > WARNINGTIME_IN_LIST)
4008
ml = cl_limit_log_new(logmsg_ctrl_defs + OP_STAYED_TOO_LONG);
4009
cl_limit_log(ml, LOG_WARNING
4010
, "perform_ra_op: the %s stayed in operation "
4011
"list for %lu ms (longer than %d ms)"
4012
, small_op_info(op), t_stay_in_list
4013
, WARNINGTIME_IN_LIST
4015
if (debug_level >= 2) {
4016
dump_data_for_debug();