2
* Copyright (C) 2008 Lars Marowsky-Bree <lmb@suse.de>
4
* This program is free software; you can redistribute it and/or
5
* modify it under the terms of the GNU General Public
6
* License as published by the Free Software Foundation; either
7
* version 2.1 of the License, or (at your option) any later version.
9
* This software is distributed in the hope that it will be useful,
10
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
* General Public License for more details.
14
* You should have received a copy of the GNU General Public
15
* License along with this library; if not, write to the Free Software
16
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
#include <sys/types.h>
26
#include <asm/unistd.h>
30
#include <sys/types.h>
32
#include <sys/ptrace.h>
35
#include <clplumbing/cl_log.h>
36
#include <clplumbing/coredumps.h>
37
#include <clplumbing/realtime.h>
38
#include <clplumbing/cl_reboot.h>
39
#include <clplumbing/setproctitle.h>
42
#include <sys/utsname.h>
43
#include <sys/ioctl.h>
44
#include <linux/types.h>
45
#include <linux/watchdog.h>
50
/* Head of the singly linked servant list; every loop in this file walks it
 * from here via ->next. */
struct servants_list_item *servants_leader = NULL;
52
/* Number of servants; the quorum checks compare against it and the
 * inquisitor sizes its reports[] array with it. */
static int servant_count = 0;
53
/* Seconds between periodic servant restarts as tested in inquisitor_child()
 * (a value <= 0 fails that test); overridable from the command line. */
static int servant_restart_interval = 3600;
55
/* Real-time signals reserved for multi-disk sbd. They are sent with
 * sigqueue() and collected synchronously with sigwaitinfo()/sigtimedwait(). */
56
#define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */
57
#define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to the inquisitor */
58
#define SIG_TEST (SIGRTMIN + 3) /* trigger a self test */
59
#define SIG_RESTART (SIGRTMIN + 4) /* trigger a restart of all failed disks */
60
/* FIXME: should add a dynamic check that no SIG_XX is >= SIGRTMAX */
64
/* Debug tracing helper: one definition prints to stderr, the other expands
 * to an empty statement.
 * NOTE(review): both definitions are visible here without the preprocessor
 * conditional that presumably selects between them - confirm. */
#define DBGPRINT(...) fprintf(stderr, __VA_ARGS__)
66
#define DBGPRINT(...) do {} while (0)
69
/*
 * Write quorum test.
 * Returns non-zero when good_servants is strictly greater than
 * servant_count/2 (integer division), i.e. a strict majority of servants.
 */
int quorum_write(int good_servants)
71
return (good_servants > servant_count/2);
74
/*
 * Read quorum test.
 * With three or more servants a strict majority (good_servants >
 * servant_count/2) is required; the remaining return accepts a single good
 * servant.
 */
int quorum_read(int good_servants)
76
if (servant_count >= 3)
77
return (good_servants > servant_count/2);
79
return (good_servants >= 1);
82
/*
 * Start a servant process for one device.
 * In the child (pid == 0) the supplied callback is invoked as
 * (*functionp)(devname, argp); a failure to fork is logged at LOG_ERR.
 * NOTE(review): the fork() call, the handling of rc and the return
 * statements are not visible in this view. Callers store the result in
 * s->pid, so the parent presumably gets the child's pid back - confirm.
 */
int assign_servant(const char* devname, functionp_t functionp, const void* argp)
87
DBGPRINT("fork servant for %s\n", devname);
89
if (pid == 0) { /* child */
91
rc = (*functionp)(devname, argp);
96
} else if (pid != -1) { /* parent */
99
cl_log(LOG_ERR,"Failed to fork servant");
108
struct servants_list_item *s;
110
for (s = servants_leader; s; s = s->next) {
111
fprintf(stdout, "Initializing device %s\n",
113
devfd = open_device(s->devname);
117
rc = init_device(devfd);
120
fprintf(stderr, "Failed to init device %s\n", s->devname);
123
fprintf(stdout, "Device %s is initialized.\n", s->devname);
128
/*
 * Servant callback (passed to assign_servant() by messenger()) that writes
 * one message through one device.
 * argp is interpreted as a struct slot_msg_arg_t; its name and msg members
 * are handed to slot_msg() together with the descriptor from open_device().
 */
int slot_msg_wrapper(const char* devname, const void* argp)
132
const struct slot_msg_arg_t* arg = (const struct slot_msg_arg_t*)argp;
134
devfd = open_device(devname);
137
rc = slot_msg(devfd, arg->name, arg->msg);
142
/*
 * Servant callback (passed to assign_servant() by ping_via_slots()) that
 * pings one node through one device.
 * argp is interpreted as the NUL-terminated node name and handed to
 * slot_ping() together with the descriptor from open_device().
 */
int slot_ping_wrapper(const char* devname, const void* argp)
145
const char* name = (const char*)argp;
148
devfd = open_device(devname);
151
rc = slot_ping(devfd, name);
156
/*
 * Allocate a slot for node `name` on every device in the servant list.
 * Each device is opened with open_device() and passed to slot_allocate();
 * progress is reported on stdout.
 * NOTE(review): error handling and the return value are not visible here.
 */
int allocate_slots(const char *name)
160
struct servants_list_item *s;
162
for (s = servants_leader; s; s = s->next) {
163
fprintf(stdout, "Trying to allocate slot for %s on device %s.\n",
166
devfd = open_device(s->devname);
170
rc = slot_allocate(devfd, name);
174
fprintf(stdout, "Slot for %s has been allocated on %s.\n",
184
struct servants_list_item *s;
187
for (s = servants_leader; s; s = s->next) {
188
DBGPRINT("list slots on device %s\n", s->devname);
189
devfd = open_device(s->devname);
192
rc = slot_list(devfd);
200
/*
 * Ping node `name` through every configured device.
 * SIGCHLD is blocked so it can be collected synchronously with
 * sigwaitinfo(); one servant running slot_ping_wrapper() is started per
 * device, and the loop runs until servants_finished reaches servant_count.
 * NOTE(review): the return value is not visible in this view.
 */
int ping_via_slots(const char *name)
205
int servants_finished = 0;
208
struct servants_list_item *s;
210
DBGPRINT("you shall know no fear\n");
211
sigemptyset(&procmask);
212
sigaddset(&procmask, SIGCHLD);
213
sigprocmask(SIG_BLOCK, &procmask, NULL);
215
/* Start one pinging servant per device and remember its pid. */
for (s = servants_leader; s; s = s->next) {
216
s->pid = assign_servant(s->devname, &slot_ping_wrapper, (const void*)name);
219
while (servants_finished < servant_count) {
220
sig = sigwaitinfo(&procmask, &sinfo);
221
DBGPRINT("get signal %d\n", sig);
222
if (sig == SIGCHLD) {
223
/* NOTE(review): wait() blocks while a child is still running, unlike the
 * waitpid(-1, &status, WNOHANG) used by messenger() - confirm this is
 * intended. */
while ((pid = wait(&status))) {
224
if (pid == -1 && errno == ECHILD) {
227
s = lookup_servant_by_pid(pid);
230
("A ping is delivered to %s via %s. ",
234
("They responed to the emporer\n");
237
("There's no response\n");
243
DBGPRINT("signal %d handled\n", sig);
248
/*
 * Body of a per-disk watcher process (process title "sbd: watcher: ...").
 *
 * Visible steps: block every signal except SIGKILL, SIGFPE, SIGILL,
 * SIGSEGV, SIGBUS and SIGALRM; open the disk; allocate a slot for
 * local_uname via slot_allocate(); write a freshly allocated sector to that
 * mailbox; then read the mailbox, act on s_mbox->cmd, and notify the parent
 * (ppid) with sigqueue() using SIG_TEST, SIG_EXITREQ or SIG_LIVENESS.
 * Latency is logged, with a distinct message when it exceeds
 * timeout_watchdog_warn.
 *
 * NOTE(review): the loop structure, the slot lookup that precedes
 * slot_allocate(), the switch case labels and the return value are not
 * visible in this view.
 */
int servant(const char *diskname, const void* argp)
250
struct sector_mbox_s *s_mbox = NULL;
253
time_t t0, t1, latency;
254
union sigval signal_value;
255
sigset_t servant_masks;
260
cl_log(LOG_ERR, "Empty disk name %s.", diskname);
264
/* Block most of the signals */
265
sigfillset(&servant_masks);
266
sigdelset(&servant_masks, SIGKILL);
267
sigdelset(&servant_masks, SIGFPE);
268
sigdelset(&servant_masks, SIGILL);
269
sigdelset(&servant_masks, SIGSEGV);
270
sigdelset(&servant_masks, SIGBUS);
271
sigdelset(&servant_masks, SIGALRM);
272
/* FIXME: check error */
273
sigprocmask(SIG_SETMASK, &servant_masks, NULL);
275
devfd = open_device(diskname);
280
mbox = slot_allocate(devfd, local_uname);
283
"No slot allocated, and automatic allocation failed for disk %s.",
288
cl_log(LOG_INFO, "Monitoring slot %d on disk %s", mbox, diskname);
289
set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox);
291
s_mbox = sector_alloc();
292
if (mbox_write(devfd, mbox, s_mbox) < 0) {
297
/* Zero the sigqueue() payload; only the signal number is significant in
 * the sigqueue() calls visible below. */
memset(&signal_value, 0, sizeof(signal_value));
306
/* Our parent died unexpectedly. Triggering
311
if (mbox_read(devfd, mbox, s_mbox) < 0) {
312
cl_log(LOG_ERR, "mbox read failed in servant.");
316
if (s_mbox->cmd > 0) {
318
"Received command %s from %s on disk %s",
319
char2cmd(s_mbox->cmd), s_mbox->from, diskname);
321
switch (s_mbox->cmd) {
323
memset(s_mbox, 0, sizeof(*s_mbox));
324
mbox_write(devfd, mbox, s_mbox);
325
sigqueue(ppid, SIG_TEST, signal_value);
334
sigqueue(ppid, SIG_EXITREQ, signal_value);
336
case SBD_MSG_CRASHDUMP:
341
An "unknown" message might result
342
from a partial write.
343
log it and clear the slot.
345
cl_log(LOG_ERR, "Unknown message on disk %s",
347
memset(s_mbox, 0, sizeof(*s_mbox));
348
mbox_write(devfd, mbox, s_mbox);
352
sigqueue(ppid, SIG_LIVENESS, signal_value);
356
if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
358
"Latency: %d exceeded threshold %d on disk %s",
359
(int)latency, (int)timeout_watchdog_warn,
362
cl_log(LOG_INFO, "Latency: %d on disk %s", (int)latency,
373
/*
 * Add a device to the servant list.
 * A new zero-filled list item is allocated, devname is duplicated with
 * strdup() (so the item owns its copy), and the item is linked into the
 * list headed by servants_leader.
 * NOTE(review): the use of `pid`, the handling of a non-empty list, any
 * check of the strdup() result and the servant_count update are not
 * visible in this view - confirm.
 */
void recruit_servant(const char *devname, pid_t pid)
375
struct servants_list_item *s = servants_leader;
376
struct servants_list_item *newbie;
378
newbie = malloc(sizeof(*newbie));
380
fprintf(stderr, "malloc failed in recruit_servant.");
383
memset(newbie, 0, sizeof(*newbie));
384
newbie->devname = strdup(devname);
388
servants_leader = newbie;
398
/*
 * Search the servant list for an entry by device name.
 * NOTE(review): strncasecmp() returns 0 on a match, so the condition below
 * is true when the names DIFFER; confirm the (not visible) branch body
 * treats that as intended. The comparison is also case-insensitive and
 * limited to strlen(s->devname) characters, so a stored name that is a
 * prefix of `devname` compares equal.
 */
struct servants_list_item *lookup_servant_by_dev(const char *devname)
400
struct servants_list_item *s;
402
for (s = servants_leader; s; s = s->next) {
403
if (strncasecmp(s->devname, devname, strlen(s->devname)))
409
/*
 * Search the servant list, starting at servants_leader, for an entry by pid.
 * NOTE(review): the comparison and the return statement are not visible
 * here; callers log an "unknown pid" error, so the result is presumably
 * NULL when nothing matches - confirm.
 */
struct servants_list_item *lookup_servant_by_pid(pid_t pid)
411
struct servants_list_item *s;
413
for (s = servants_leader; s; s = s->next) {
420
/*
 * Probe every servant process for existence.
 * sigqueue() with signal number 0 sends nothing and only performs the error
 * checks; -1 with errno == ESRCH means no process with that pid exists.
 * NOTE(review): the accumulation and return value are not visible here;
 * the caller uses the result as a boolean.
 */
int check_all_dead(void)
422
struct servants_list_item *s;
426
for (s = servants_leader; s; s = s->next) {
428
r = sigqueue(s->pid, 0, svalue);
429
if (r == -1 && errno == ESRCH)
438
/*
 * Start a `servant` watcher process for the devices on the servant list.
 * Each entry is first probed with sigqueue(pid, 0, ...). The condition
 * below is true unless the probe failed with ESRCH, i.e. unless the
 * process is known not to exist.
 * NOTE(review): the body of that branch is not visible; presumably it
 * skips entries whose process is still alive - confirm.
 */
void servants_start(void)
440
struct servants_list_item *s;
444
for (s = servants_leader; s; s = s->next) {
446
r = sigqueue(s->pid, 0, svalue);
447
if ((r != -1 || errno != ESRCH))
451
s->pid = assign_servant(s->devname, servant, NULL);
455
/*
 * Send SIGKILL, via sigqueue(), to the pid recorded for the servants on the
 * list. The sigqueue() result is not checked in the visible line.
 * NOTE(review): any per-entry condition guarding the kill is not visible.
 */
void servants_kill(void)
457
struct servants_list_item *s;
460
for (s = servants_leader; s; s = s->next) {
462
sigqueue(s->pid, SIGKILL, svalue);
466
/*
 * Compare the timeout settings stored in the headers of all devices.
 * Each device is opened and its header read with header_get(); the four
 * timeouts (watchdog, allocate, loop, msgwait) of the current header are
 * compared with those of the previously read one. Afterwards the global
 * timeout_* variables are loaded from hdr_last, and an error is logged when
 * no device was available.
 * NOTE(review): the NULL checks around hdr_last/hdr_cur, the update of
 * `inconsistent` and the return statement are not visible; the caller
 * treats a return value of 1 as "settings differ".
 */
int check_timeout_inconsistent(void)
469
struct sector_header_s *hdr_cur = 0, *hdr_last = 0;
470
struct servants_list_item* s;
471
int inconsistent = 0;
473
for (s = servants_leader; s; s = s->next) {
474
devfd = open_device(s->devname);
477
hdr_cur = header_get(devfd);
482
if (hdr_last->timeout_watchdog != hdr_cur->timeout_watchdog
483
|| hdr_last->timeout_allocate != hdr_cur->timeout_allocate
484
|| hdr_last->timeout_loop != hdr_cur->timeout_loop
485
|| hdr_last->timeout_msgwait != hdr_cur->timeout_msgwait)
493
timeout_watchdog = hdr_last->timeout_watchdog;
494
timeout_allocate = hdr_last->timeout_allocate;
495
timeout_loop = hdr_last->timeout_loop;
496
timeout_msgwait = hdr_last->timeout_msgwait;
498
cl_log(LOG_ERR, "No devices were available at start-up.");
506
/*
 * Clean up the bookkeeping for an exited servant, looked up by pid; an
 * unknown pid is logged at LOG_ERR.
 * NOTE(review): the cleanup performed on a found entry is not visible.
 * NOTE(review): the function is declared plain `inline` (neither static nor
 * extern); under C99 inline semantics that alone does not provide an
 * external definition - confirm the build mode or add a declaration.
 */
inline void cleanup_servant_by_pid(pid_t pid)
508
struct servants_list_item* s;
510
s = lookup_servant_by_pid(pid);
514
/* TODO: This points to an inconsistency in our internal
515
* data - how to recover? */
516
cl_log(LOG_ERR, "Cannot cleanup after unknown pid %i",
521
/*
 * Restart the servant that ran as `pid`.
 * While the entry's restart counter is below 10 a new `servant` process is
 * started for the same device and its pid recorded; otherwise a warning is
 * logged and the servant is left stopped. An unknown pid is logged at
 * LOG_ERR.
 * NOTE(review): the NULL check on the lookup result and the update of
 * s->restarts are not visible in this view.
 */
void restart_servant_by_pid(pid_t pid)
523
struct servants_list_item* s;
525
s = lookup_servant_by_pid(pid);
527
if (s->restarts < 10) {
528
s->pid = assign_servant(s->devname, servant, NULL);
531
cl_log(LOG_WARNING, "Max retry count reached: not restarting servant for %s",
536
/* TODO: This points to an inconsistency in our internal
537
* data - how to recover? */
538
cl_log(LOG_ERR, "Cannot restart unknown pid %i",
543
/*
 * Initialise the watchdog and send SIG_LIVENESS to the parent process (the
 * pid from getppid()).
 * The caller treats a negative return value as failure.
 * NOTE(review): the initialisation of signal_value and the return
 * statements are not visible in this view.
 */
int inquisitor_decouple(void)
545
pid_t ppid = getppid();
546
union sigval signal_value;
548
/* During start-up, we only arm the watchdog once we've got
549
* quorum at least once. */
551
if (watchdog_init() < 0) {
557
sigqueue(ppid, SIG_LIVENESS, signal_value);
562
/*
 * Main loop of the inquisitor process (process title "sbd: inquisitor").
 *
 * Blocks SIGCHLD, SIG_LIVENESS, SIG_EXITREQ, SIG_TEST, SIGUSR1 and SIGUSR2
 * and collects them with sigtimedwait(), using timeout_loop seconds as the
 * timeout. Visible handling per signal:
 *   - SIGCHLD: reap children without blocking; clean up when `exiting`,
 *     otherwise restart the servant.
 *   - SIG_LIVENESS: record the sender's pid in reports[].
 *   - SIGUSR1: reset the "last restarted" timestamp.
 * When quorum_read() is satisfied the tickle timestamp and reports[] are
 * reset; otherwise the seconds since the last tickle are checked against
 * timeout_watchdog and timeout_watchdog_warn. Servants are also restarted
 * once servant_restart_interval seconds have elapsed.
 *
 * NOTE(review): loop boundaries, the computation of good_servants, the
 * watchdog calls and all exit paths are not visible in this view.
 */
void inquisitor_child(void)
569
struct timespec timeout;
570
int good_servants = 0;
574
struct timespec t_last_tickle, t_now, t_last_restarted;
576
set_proc_title("sbd: inquisitor");
578
/* One int per servant; used below to store the pid of each servant that
 * has reported liveness since the last reset (0 = free entry). */
reports = malloc(sizeof(int) * servant_count);
580
cl_log(LOG_ERR, "malloc failed");
583
memset(reports, 0, sizeof(int) * servant_count);
585
sigemptyset(&procmask);
586
sigaddset(&procmask, SIGCHLD);
587
sigaddset(&procmask, SIG_LIVENESS);
588
sigaddset(&procmask, SIG_EXITREQ);
589
sigaddset(&procmask, SIG_TEST);
590
sigaddset(&procmask, SIGUSR1);
591
sigaddset(&procmask, SIGUSR2);
592
sigprocmask(SIG_BLOCK, &procmask, NULL);
596
timeout.tv_sec = timeout_loop;
599
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
600
clock_gettime(CLOCK_MONOTONIC, &t_last_restarted);
603
sig = sigtimedwait(&procmask, &sinfo, &timeout);
604
DBGPRINT("got signal %d\n", sig);
606
if (sig == SIG_EXITREQ) {
610
} else if (sig == SIGCHLD) {
611
/* WNOHANG: waitpid() returns 0 instead of blocking when no further child
 * has exited, which ends this loop. */
while ((pid = waitpid(-1, &status, WNOHANG))) {
612
if (pid == -1 && errno == ECHILD) {
614
} else if (exiting) {
615
cleanup_servant_by_pid(pid);
617
restart_servant_by_pid(pid);
620
} else if (sig == SIG_LIVENESS) {
621
/* Record the sender once: stop at an entry that already holds its pid,
 * otherwise claim the first free (zero) entry. */
for (i = 0; i < servant_count; i++) {
622
if (reports[i] == sinfo.si_pid) {
624
} else if (reports[i] == 0) {
625
reports[i] = sinfo.si_pid;
630
} else if (sig == SIG_TEST) {
631
} else if (sig == SIGUSR1) {
634
clock_gettime(CLOCK_MONOTONIC, &t_last_restarted);
639
if (check_all_dead())
645
if (quorum_read(good_servants)) {
646
DBGPRINT("Enough liveness messages\n");
648
if (inquisitor_decouple() < 0) {
658
clock_gettime(CLOCK_MONOTONIC, &t_last_tickle);
659
memset(reports, 0, sizeof(int) * servant_count);
663
clock_gettime(CLOCK_MONOTONIC, &t_now);
664
/* Whole seconds since the last successful quorum round. */
latency = t_now.tv_sec - t_last_tickle.tv_sec;
665
if (timeout_watchdog && (latency > timeout_watchdog)) {
667
/* We're still being watched by our
668
* parent. We don't fence, but exit. */
669
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
676
if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) {
678
"Latency: No liveness for %d s exceeds threshold of %d s (healthy servants: %d)",
679
(int)latency, (int)timeout_watchdog_warn, good_servants);
682
/* `latency` is reused here: whole seconds since the last servant restart. */
latency = t_now.tv_sec - t_last_restarted.tv_sec;
683
if (servant_restart_interval > 0
684
&& latency > servant_restart_interval) {
685
/* Restart all children every hour */
686
clock_gettime(CLOCK_MONOTONIC, &t_last_restarted);
696
int sig, pid, inquisitor_pid;
701
DBGPRINT("inquisitor starting\n");
703
/* Where's the best place for sysrq init ?*/
706
sigemptyset(&procmask);
707
sigaddset(&procmask, SIGCHLD);
708
sigaddset(&procmask, SIG_LIVENESS);
709
sigprocmask(SIG_BLOCK, &procmask, NULL);
711
if (check_timeout_inconsistent() == 1) {
712
fprintf(stderr, "Timeout settings are different across SBD devices!\n");
713
fprintf(stderr, "You have to correct them and re-start SBD again.\n");
717
inquisitor_pid = make_daemon();
718
if (inquisitor_pid == 0) {
722
/* We're the parent. Wait for a happy signal from our child
723
* before we proceed - we either get "SIG_LIVENESS" when the
724
* inquisitor has completed the first successful round, or
725
* SIGCHLD when it exits with an error. */
728
sig = sigwaitinfo(&procmask, &sinfo);
729
DBGPRINT("get signal %d\n", sig);
730
if (sig == SIGCHLD) {
731
while ((pid = waitpid(-1, &status, WNOHANG))) {
732
if (pid == -1 && errno == ECHILD) {
735
/* We got here because the inquisitor
736
* did not succeed. */
739
} else if (sig == SIG_LIVENESS) {
740
/* Inquisitor started up properly. */
743
fprintf(stderr, "Nobody expected the spanish inquisition!\n");
751
/*
 * Deliver message `msg` to node `name` through all configured devices.
 * SIGCHLD is blocked so it can be collected with sigwaitinfo(); one servant
 * running slot_msg_wrapper() is started per device. The wait loop ends as
 * soon as quorum_write() is satisfied by the number of servants that exited
 * with status 0, or when all servants have finished.
 * NOTE(review): the update of servants_finished and the return value are
 * not visible in this view.
 */
int messenger(const char *name, const char *msg)
756
int servants_finished = 0;
757
int successful_delivery = 0;
760
struct servants_list_item *s;
761
/* Shared by all servants; each forked child receives a pointer to it. */
struct slot_msg_arg_t slot_msg_arg = {name, msg};
763
sigemptyset(&procmask);
764
sigaddset(&procmask, SIGCHLD);
765
sigprocmask(SIG_BLOCK, &procmask, NULL);
767
for (s = servants_leader; s; s = s->next) {
768
s->pid = assign_servant(s->devname, &slot_msg_wrapper, &slot_msg_arg);
771
while (!(quorum_write(successful_delivery) ||
772
(servants_finished == servant_count))) {
773
sig = sigwaitinfo(&procmask, &sinfo);
774
DBGPRINT("get signal %d\n", sig);
775
if (sig == SIGCHLD) {
776
/* Reap every child that has already exited, without blocking. */
while ((pid = waitpid(-1, &status, WNOHANG))) {
777
if (pid == -1 && errno == ECHILD) {
780
DBGPRINT("process %d finished\n", pid);
782
/* Only a normal exit with status 0 counts as a delivery. */
if (WIFEXITED(status)
783
&& WEXITSTATUS(status) == 0) {
784
DBGPRINT("exit with %d\n",
785
WEXITSTATUS(status));
786
successful_delivery++;
791
DBGPRINT("signal %d handled\n", sig);
793
if (quorum_write(successful_delivery)) {
796
fprintf(stderr, "Message is not delivered via more then a half of devices\n");
801
/*
 * Dump the on-disk header of every configured device.
 * Each device is opened with open_device() and passed to header_dump(); a
 * banner is printed on stdout before and after each dump.
 * NOTE(review): error handling and the return value are not visible here.
 */
int dump_headers(void)
804
struct servants_list_item *s = servants_leader;
807
for (s = servants_leader; s; s = s->next) {
808
fprintf(stdout, "==Dumping header on disk %s\n", s->devname);
809
devfd = open_device(s->devname);
812
rc = header_dump(devfd);
816
fprintf(stdout, "==Header on disk %s is dumped\n", s->devname);
821
int main(int argc, char **argv, char **envp)
826
if ((cmdname = strrchr(argv[0], '/')) == NULL) {
832
cl_log_set_entity(cmdname);
833
cl_log_enable_stderr(0);
834
cl_log_set_facility(LOG_DAEMON);
838
while ((c = getopt(argc, argv, "DRWhvw:d:n:1:2:3:4:5:t:")) != -1) {
841
/* Ignore for historical reasons */
850
watchdog_set_timeout = 0;
856
watchdogdev = optarg;
859
recruit_servant(optarg, 0);
862
local_uname = optarg;
865
timeout_watchdog = atoi(optarg);
868
timeout_allocate = atoi(optarg);
871
timeout_loop = atoi(optarg);
874
timeout_msgwait = atoi(optarg);
877
timeout_watchdog_warn = atoi(optarg);
880
servant_restart_interval = atoi(optarg);
892
if (servant_count < 1 || servant_count > 3) {
893
fprintf(stderr, "You must specify 1 to 3 devices via the -d option.\n");
898
/* There must at least be one command following the options: */
899
if ((argc - optind) < 1) {
900
fprintf(stderr, "Not enough arguments.\n");
905
if (init_set_proc_title(argc, argv, envp) < 0) {
906
fprintf(stderr, "Allocation of proc title failed.");
912
if (strcmp(argv[optind], "create") == 0) {
913
exit_status = init_devices();
914
} else if (strcmp(argv[optind], "dump") == 0) {
915
exit_status = dump_headers();
916
} else if (strcmp(argv[optind], "allocate") == 0) {
917
exit_status = allocate_slots(argv[optind + 1]);
918
} else if (strcmp(argv[optind], "list") == 0) {
919
exit_status = list_slots();
920
} else if (strcmp(argv[optind], "message") == 0) {
921
exit_status = messenger(argv[optind + 1], argv[optind + 2]);
922
} else if (strcmp(argv[optind], "ping") == 0) {
923
exit_status = ping_via_slots(argv[optind + 1]);
924
} else if (strcmp(argv[optind], "watch") == 0) {
925
exit_status = inquisitor();
931
if (exit_status < 0) {