20
20
#include <pacemaker.h>
23
24
#include <sys/stat.h>
24
25
#include <sys/types.h>
25
26
#include <sys/time.h>
26
27
#include <sys/resource.h>
28
#include <sys/reboot.h>
28
30
#include <crm/msg_xml.h>
29
31
#include <crm/common/ipcs.h>
30
32
#include <crm/common/mainloop.h>
33
#include <crm/cluster/internal.h>
31
34
#include <crm/cluster.h>
33
36
#include <dirent.h>
35
38
gboolean fatal_error = FALSE;
36
39
GMainLoop *mainloop = NULL;
37
GHashTable *peers = NULL;
39
41
#define PCMK_PROCESS_CHECK_INTERVAL 5
41
char *local_name = NULL;
43
const char *local_name = NULL;
42
44
uint32_t local_nodeid = 0;
43
45
crm_trigger_t *shutdown_trigger = NULL;
44
46
const char *pid_file = "/var/run/pacemaker.pid";
48
crm_proc_none = 0x00000001,
49
crm_proc_plugin = 0x00000002,
50
crm_proc_lrmd = 0x00000010,
51
crm_proc_cib = 0x00000100,
52
crm_proc_crmd = 0x00000200,
53
crm_proc_attrd = 0x00001000,
54
crm_proc_stonithd = 0x00002000,
55
crm_proc_pe = 0x00010000,
56
crm_proc_te = 0x00020000,
57
crm_proc_mgmtd = 0x00040000,
58
crm_proc_stonith_ng = 0x00100000,
62
48
typedef struct pcmk_child_s {
154
146
update_node_processes(local_nodeid, NULL, get_process_list());
156
148
} else if (child->respawn) {
149
gboolean fail_fast = crm_is_true(getenv("PCMK_fail_fast"));
157
151
crm_notice("Respawning failed child process: %s", child->name);
153
#ifdef RB_HALT_SYSTEM
155
crm_err("Rebooting system", child->name);
157
reboot(RB_HALT_SYSTEM);
158
crm_exit(DAEMON_RESPAWN_STOP);
158
161
start_child(child);
163
pcmk_child_exit(GPid pid, gint status, gpointer user_data)
166
pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)
166
pcmk_child_t *child = user_data;
168
if (WIFSIGNALED(status)) {
169
int signo = WTERMSIG(status);
170
int core = WCOREDUMP(status);
168
pcmk_child_t *child = mainloop_child_userdata(p);
169
const char *name = mainloop_child_name(p);
172
172
crm_notice("Child process %s terminated with signal %d (pid=%d, core=%d)",
173
child->name, signo, child->pid, core);
173
name, signo, pid, core);
175
} else if (WIFEXITED(status)) {
176
exitcode = WEXITSTATUS(status);
177
176
do_crm_log(exitcode == 0 ? LOG_INFO : LOG_ERR,
178
"Child process %s exited (pid=%d, rc=%d)", child->name, child->pid, exitcode);
177
"Child process %s (%d) exited: %s (%d)", name, pid, pcmk_strerror(exitcode), exitcode);
181
180
if (exitcode == 100) {
182
181
crm_warn("Pacemaker child process %s no longer wishes to be respawned. "
183
"Shutting ourselves down.", child->name);
182
"Shutting ourselves down.", name);
184
183
child->respawn = FALSE;
185
184
fatal_error = TRUE;
186
185
pcmk_shutdown(15);
261
262
use_valgrind = FALSE;
266
if (crm_user_lookup(child->uid, &uid, &gid) < 0) {
267
crm_err("Invalid user (%s) for %s: not found", child->uid, child->name);
270
crm_info("Using uid=%u and group=%u for process %s", uid, gid, child->name);
264
273
child->pid = fork();
265
274
CRM_ASSERT(child->pid != -1);
267
276
if (child->pid > 0) {
269
g_child_watch_add(child->pid, pcmk_child_exit, child);
278
mainloop_child_add(child->pid, 0, child->name, child, pcmk_child_exit);
271
280
crm_info("Forked child %d for process %s%s", child->pid, child->name,
272
281
use_valgrind ? " (valgrind enabled: " VALGRIND_BIN ")" : "");
293
302
opts_default[0] = strdup(child->command);;
296
/* Don't set the group for now - it prevents connection to the cluster */
297
if (gid && setgid(gid) < 0) {
298
crm_perror("Could not set group to %d", gid);
305
if(stack == pcmk_cluster_corosync) {
306
/* Drop root privileges completely
308
* We can do this because we set uidgid.gid.${gid}=1
309
* via CMAP which allows these processes to connect to
312
if (setgid(gid) < 0) {
313
crm_perror(LOG_ERR, "Could not set group to %d", gid);
303
if (crm_user_lookup(child->uid, &uid, NULL) < 0) {
304
crm_err("Invalid uid (%s) specified for %s", child->uid, child->name);
316
/* Keep the root group (so we can access corosync), but add the haclient group (so we can access ipc) */
317
} else if (initgroups(child->uid, gid) < 0) {
318
crm_err("Cannot initalize groups for %s: %s (%d)", child->uid, pcmk_strerror(errno), errno);
418
431
mainloop_set_trigger(shutdown_trigger);
422
build_path(const char *path_c, mode_t mode)
424
int offset = 1, len = 0;
425
char *path = strdup(path_c);
427
CRM_CHECK(path != NULL, return);
428
for (len = strlen(path); offset < len; offset++) {
429
if (path[offset] == '/') {
431
if (mkdir(path, mode) < 0 && errno != EEXIST) {
432
crm_perror(LOG_ERR, "Could not create directory '%s'", path);
438
if (mkdir(path, mode) < 0 && errno != EEXIST) {
439
crm_perror(LOG_ERR, "Could not create directory '%s'", path);
445
435
pcmk_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
482
472
crm_element_value(msg, F_CRM_REFERENCE), crm_element_value(msg, F_CRM_ORIGIN));
483
473
pcmk_shutdown(15);
475
} else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
476
/* Send to everyone */
479
const char *name = NULL;
481
crm_element_value_int(msg, XML_ATTR_ID, &id);
482
name = crm_element_value(msg, XML_ATTR_UNAME);
483
crm_notice("Instructing peers to remove references to node %s/%u", name, id);
485
iov = calloc(1, sizeof(struct iovec));
486
iov->iov_base = dump_xml_unformatted(msg);
487
iov->iov_len = 1 + strlen(iov->iov_base);
486
/* Just send to everyone */
487
update_process_clients();
491
update_process_clients(c);
516
520
.connection_destroyed = pcmk_ipc_destroy
520
ghash_send_proc_details(gpointer key, gpointer value, gpointer data)
522
crm_ipcs_send(value, 0, data, TRUE);
526
peer_loop_fn(gpointer key, gpointer value, gpointer user_data)
528
pcmk_peer_t *node = value;
529
xmlNode *update = user_data;
531
xmlNode *xml = create_xml_node(update, "node");
533
crm_xml_add_int(xml, "id", node->id);
534
crm_xml_add(xml, "uname", node->uname);
535
crm_xml_add_int(xml, "processes", node->processes);
539
update_process_clients(void)
524
update_process_clients(crm_client_t *client)
527
crm_node_t *node = NULL;
541
528
xmlNode *update = create_xml_node(NULL, "nodes");
543
530
crm_trace("Sending process list to %d children", crm_hash_table_size(client_connections));
545
g_hash_table_foreach(peers, peer_loop_fn, update);
546
g_hash_table_foreach(client_connections, ghash_send_proc_details, update);
532
g_hash_table_iter_init(&iter, crm_peer_cache);
533
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) {
534
xmlNode *xml = create_xml_node(update, "node");
536
crm_xml_add_int(xml, "id", node->id);
537
crm_xml_add(xml, "uname", node->uname);
538
crm_xml_add(xml, "state", node->state);
539
crm_xml_add_int(xml, "processes", node->processes);
543
crm_ipcs_send(client, 0, update, TRUE);
546
g_hash_table_iter_init(&iter, client_connections);
547
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & client)) {
548
crm_ipcs_send(client, 0, update, TRUE);
548
552
free_xml(update);
564
570
rc = snprintf(buffer, SIZEOF(buffer) - 1, "<node proclist=\"%u\"/>", get_process_list());
567
iov.iov_base = buffer;
568
iov.iov_len = rc + 1;
570
573
crm_trace("Sending %s", buffer);
571
send_cpg_message(&iov);
574
iov = calloc(1, sizeof(struct iovec));
575
iov->iov_base = strdup(buffer);
576
iov->iov_len = rc + 1;
575
581
update_node_processes(uint32_t id, const char *uname, uint32_t procs)
577
583
gboolean changed = FALSE;
578
pcmk_peer_t *node = g_hash_table_lookup(peers, GUINT_TO_POINTER(id));
583
node = calloc(1, sizeof(pcmk_peer_t));
586
g_hash_table_insert(peers, GUINT_TO_POINTER(id), node);
587
node = g_hash_table_lookup(peers, GUINT_TO_POINTER(id));
588
CRM_ASSERT(node != NULL);
592
if (node->uname == NULL || safe_str_eq(node->uname, uname) == FALSE) {
593
int lpc, len = strlen(uname);
595
crm_notice("%p Node %u now known as %s%s%s", node, id, uname,
596
node->uname ? node->uname : ", was: ", node->uname ? node->uname : "");
598
node->uname = strdup(uname);
601
for (lpc = 0; lpc < len; lpc++) {
602
if (uname[lpc] >= 'A' && uname[lpc] <= 'Z') {
604
("Node names with capitals are discouraged, consider changing '%s' to something else",
612
crm_trace("Empty uname for node %u", id);
584
crm_node_t *node = crm_get_peer(id, uname);
615
586
if (procs != 0) {
616
587
if (procs != node->processes) {
641
613
{"features", 0, 0, 'F', "\tDisplay the full version and list of features Pacemaker was built with"},
643
615
{"-spacer-", 1, 0, '-', "\nAdditional Options:"},
644
{"foreground", 0, 0, 'f', "\tRun in the foreground instead of as a daemon"},
645
{"pid-file", 1, 0, 'p', "\t(Advanced) Daemon pid file location"},
616
{"foreground", 0, 0, 'f', "\t(Ignored) Pacemaker always runs in the foreground"},
617
{"pid-file", 1, 0, 'p', "\t(Ignored) Daemon pid file location"},
767
mcp_cpg_destroy(gpointer user_data)
769
crm_err("Connection destroyed");
774
mcp_cpg_deliver(cpg_handle_t handle,
775
const struct cpg_name *groupName,
776
uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
778
xmlNode *xml = string2xml(msg);
779
const char *task = crm_element_value(xml, F_CRM_TASK);
781
crm_trace("Recieved %s %.200s", task, msg);
782
if (task == NULL && nodeid != local_nodeid) {
784
const char *uname = crm_element_value(xml, "uname");
786
crm_element_value_int(xml, "proclist", (int *)&procs);
787
/* crm_debug("Got proclist %.32x from %s", procs, uname); */
788
if (update_node_processes(nodeid, uname, procs)) {
789
update_process_clients(NULL);
792
} else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
794
const char *name = NULL;
796
crm_element_value_int(xml, XML_ATTR_ID, &id);
797
name = crm_element_value(xml, XML_ATTR_UNAME);
798
reap_crm_member(id, name);
803
mcp_cpg_membership(cpg_handle_t handle,
804
const struct cpg_name *groupName,
805
const struct cpg_address *member_list, size_t member_list_entries,
806
const struct cpg_address *left_list, size_t left_list_entries,
807
const struct cpg_address *joined_list, size_t joined_list_entries)
809
/* Don't care about CPG membership, but we do want to broadcast our own presence */
810
update_process_peers();
814
mcp_quorum_callback(unsigned long long seq, gboolean quorate)
821
mcp_quorum_destroy(gpointer user_data)
823
crm_info("connection closed");
795
827
main(int argc, char **argv)
817
850
crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE);
818
851
crm_set_options(NULL, "mode [options]", long_options, "Start/Stop Pacemaker\n");
820
/* Restore the original facility so that read_config() does the right thing */
853
/* Restore the original facility so that mcp_read_config() does the right thing */
821
854
set_daemon_option("logfacility", facility);
846
printf("Pacemaker %s (Build: %s)\n Supporting: %s\n", VERSION, BUILD_VERSION,
879
printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", VERSION, BUILD_VERSION,
880
CRM_FEATURE_SET, CRM_FEATURES);
850
883
printf("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag);
882
915
crm_ipc_close(old_instance);
883
916
crm_ipc_destroy(old_instance);
886
919
} else if (crm_ipc_connected(old_instance)) {
887
920
crm_ipc_close(old_instance);
888
921
crm_ipc_destroy(old_instance);
889
922
crm_err("Pacemaker is already active, aborting startup");
923
crm_exit(DAEMON_RESPAWN_STOP);
893
926
crm_ipc_close(old_instance);
894
927
crm_ipc_destroy(old_instance);
896
if (read_config() == FALSE) {
929
if (mcp_read_config() == FALSE) {
897
930
crm_notice("Could not obtain corosync config data, exiting");
901
934
crm_notice("Starting Pacemaker %s (Build: %s): %s", VERSION, BUILD_VERSION, CRM_FEATURES);
932
965
if (crm_user_lookup(CRM_DAEMON_USER, &pcmk_uid, &pcmk_gid) < 0) {
933
966
crm_err("Cluster user %s does not exist, aborting Pacemaker startup", CRM_DAEMON_USER);
937
970
mkdir(CRM_STATE_DIR, 0750);
938
971
mcp_chown(CRM_STATE_DIR, pcmk_uid, pcmk_gid);
940
/* Used by stonithd */
941
build_path(HA_STATE_DIR "/heartbeat", 0755);
942
mcp_chown(HA_STATE_DIR "/heartbeat", pcmk_uid, pcmk_gid);
944
/* Used by RAs - Leave owned by root */
945
build_path(CRM_RSCTMP_DIR, 0755);
947
973
/* Used to store core files in */
948
build_path(CRM_CORE_DIR, 0755);
974
crm_build_path(CRM_CORE_DIR, 0775);
949
975
mcp_chown(CRM_CORE_DIR, pcmk_uid, pcmk_gid);
951
977
/* Used to store blackbox dumps in */
952
build_path(CRM_BLACKBOX_DIR, 0755);
978
crm_build_path(CRM_BLACKBOX_DIR, 0755);
953
979
mcp_chown(CRM_BLACKBOX_DIR, pcmk_uid, pcmk_gid);
955
981
/* Used to store policy engine inputs in */
956
build_path(PE_STATE_DIR, 0755);
982
crm_build_path(PE_STATE_DIR, 0755);
957
983
mcp_chown(PE_STATE_DIR, pcmk_uid, pcmk_gid);
959
985
/* Used to store the cluster configuration */
960
build_path(CRM_CONFIG_DIR, 0755);
986
crm_build_path(CRM_CONFIG_DIR, 0755);
961
987
mcp_chown(CRM_CONFIG_DIR, pcmk_uid, pcmk_gid);
963
peers = g_hash_table_new(g_direct_hash, g_direct_equal);
989
/* Resource agent paths are constructed by the lrmd */
965
ipcs = mainloop_add_ipc_server(CRM_SYSTEM_MCP, QB_IPC_NATIVE, &ipc_callbacks);
991
ipcs = mainloop_add_ipc_server(CRM_SYSTEM_MCP, QB_IPC_NATIVE, &mcp_ipc_callbacks);
966
992
if (ipcs == NULL) {
967
993
crm_err("Couldn't start IPC server");
997
/* Allows us to block shutdown */
971
998
if (cluster_connect_cfg(&local_nodeid) == FALSE) {
972
999
crm_err("Couldn't connect to Corosync's CFG service");
1000
crm_exit(ENOPROTOOPT);
976
if (cluster_connect_cpg() == FALSE) {
1003
cluster.destroy = mcp_cpg_destroy;
1004
cluster.cpg.cpg_deliver_fn = mcp_cpg_deliver;
1005
cluster.cpg.cpg_confchg_fn = mcp_cpg_membership;
1007
if(cluster_connect_cpg(&cluster) == FALSE) {
977
1008
crm_err("Couldn't connect to Corosync's CPG service");
1009
crm_exit(ENOPROTOOPT);
1012
if (is_corosync_cluster()) {
1013
/* Keep the membership list up-to-date for crm_node to query */
1014
rc = cluster_connect_quorum(mcp_quorum_callback, mcp_quorum_destroy);
981
1017
local_name = get_local_node_name();