Skip to content

Commit cca781e

Browse files
committed
Fix: sbd watchdog rebooting upon restart of pacemaker-remote
1 parent 5ec38cf commit cca781e

File tree

3 files changed

+31
-6
lines changed

3 files changed

+31
-6
lines changed

src/sbd-cluster.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,15 @@ static crm_cluster_t cluster;
5858
static gboolean sbd_remote_check(gpointer user_data);
5959
static long unsigned int find_pacemaker_remote(void);
6060
static void sbd_membership_destroy(gpointer user_data);
61+
/*
 * Set to true by trigger_wait_for_pacemaker_remote_lost() (the SIGUSR2
 * handler). While set, sbd_remote_check() calls signal_exitreq() instead of
 * marking the servant unclean in its "connection lost" branch.
 */
static bool wait_for_pacemaker_remote_lost = false;
6162

63+
static void signal_exitreq(void)
64+
{
65+
union sigval signal_value;
66+
pid_t ppid = getppid();
67+
68+
sigqueue(ppid, SIG_EXITREQ, signal_value);
69+
}
6270

6371
#if SUPPORT_PLUGIN
6472
static void
@@ -675,6 +683,10 @@ sbd_remote_check(gpointer user_data)
675683
set_servant_health(pcmk_health_online, LOG_INFO,
676684
"Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid);
677685
} else {
686+
if (wait_for_pacemaker_remote_lost) {
687+
signal_exitreq();
688+
return true;
689+
}
678690
set_servant_health(pcmk_health_unclean, LOG_WARNING,
679691
"Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid);
680692
}
@@ -742,6 +754,16 @@ cluster_shutdown(int nsig)
742754
clean_up(0);
743755
}
744756

757+
static void
758+
trigger_wait_for_pacemaker_remote_lost(int nsig)
759+
{
760+
/* if we've never seen pacemaker_remoted request exit immeditely */
761+
if ((remoted_pid <= 0) || !remote_node) {
762+
signal_exitreq();
763+
}
764+
wait_for_pacemaker_remote_lost = true;
765+
}
766+
745767
int
746768
servant_cluster(const char *diskname, int mode, const void* argp)
747769
{
@@ -761,6 +783,7 @@ servant_cluster(const char *diskname, int mode, const void* argp)
761783

762784
mainloop_add_signal(SIGTERM, cluster_shutdown);
763785
mainloop_add_signal(SIGINT, cluster_shutdown);
786+
mainloop_add_signal(SIGUSR2, trigger_wait_for_pacemaker_remote_lost);
764787

765788
g_main_loop_run(mainloop);
766789
g_main_loop_unref(mainloop);

src/sbd-inquisitor.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -248,14 +248,14 @@ void servants_start(void)
248248
}
249249
}
250250

251-
void servants_kill(void)
251+
void servants_kill(int sig)
252252
{
253253
struct servants_list_item *s;
254254
union sigval svalue;
255255

256256
for (s = servants_leader; s; s = s->next) {
257257
if (s->pid != 0)
258-
sigqueue(s->pid, SIGKILL, svalue);
258+
sigqueue(s->pid, sig, svalue);
259259
}
260260
}
261261

@@ -536,7 +536,7 @@ void inquisitor_child(void)
536536
clock_gettime(CLOCK_MONOTONIC, &t_now);
537537

538538
if (sig == SIG_EXITREQ || sig == SIGTERM) {
539-
servants_kill();
539+
servants_kill(SIGKILL);
540540
watchdog_close(true);
541541
exiting = 1;
542542
} else if (sig == SIGCHLD) {
@@ -610,6 +610,8 @@ void inquisitor_child(void)
610610
if (exiting)
611611
continue;
612612
servants_start();
613+
} else if (sig == SIGUSR2) {
614+
servants_kill(SIGUSR2);
613615
}
614616

615617
if (exiting) {
@@ -718,7 +720,7 @@ void inquisitor_child(void)
718720
*/
719721
cl_log(LOG_DEBUG, "Decoupling");
720722
if (inquisitor_decouple() < 0) {
721-
servants_kill();
723+
servants_kill(SIGKILL);
722724
exiting = 1;
723725
continue;
724726
} else {
@@ -734,7 +736,7 @@ void inquisitor_child(void)
734736
/* We're still being watched by our
735737
* parent. We don't fence, but exit. */
736738
cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up.");
737-
servants_kill();
739+
servants_kill(SIGKILL);
738740
exiting = 1;
739741
continue;
740742
}

src/sbd_remote.service.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Type=forking
1111
PIDFile=@runstatedir@/sbd.pid
1212
EnvironmentFile=-@CONFIGDIR@/sbd
1313
ExecStart=@sbindir@/sbd $SBD_OPTS -p @runstatedir@/sbd.pid watch
14-
ExecStop=@bindir@/kill -TERM $MAINPID
14+
ExecStop=@bindir@/kill -USR2 $MAINPID
1515

1616
# Could this benefit from exit codes for restart?
1717
# Does this need to be set to msgwait * 1.2?

0 commit comments

Comments
 (0)