From: Ronnie Sahlberg Date: Tue, 1 Mar 2011 01:09:42 +0000 (+1100) Subject: If/when the recovery daemon terminates unexpectedly, try to restart it again from... X-Git-Tag: ctdb-1.2.40~204 X-Git-Url: http://git.samba.org/?p=ctdb.git;a=commitdiff_plain;h=0e65b08c669e76b35b7a1e0ac95422e40c76d982 If/when the recovery daemon terminates unexpectedly, try to restart it again from the main daemon instead of just shutting down the main daemon too. While it does not address the reason for the recovery daemon shutting down, it reduces the impact of such issues and makes the system more robust. --- diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c index b82f0e77..13dafa7e 100644 --- a/server/ctdb_recoverd.c +++ b/server/ctdb_recoverd.c @@ -70,6 +70,7 @@ struct ctdb_recoverd { #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0) #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0) +static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data); /* ban a node for a period of time @@ -3518,18 +3519,12 @@ static void ctdb_check_recd(struct event_context *ev, struct timed_event *te, struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context); if (kill(ctdb->recoverd_pid, 0) != 0) { - DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid)); + DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. 
Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid)); - ctdb_stop_recoverd(ctdb); - ctdb_stop_keepalive(ctdb); - ctdb_stop_monitoring(ctdb); - ctdb_release_all_ips(ctdb); - if (ctdb->methods != NULL) { - ctdb->methods->shutdown(ctdb); - } - ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN); + event_add_timed(ctdb->ev, ctdb, timeval_zero(), + ctdb_restart_recd, ctdb); - exit(10); + return; } event_add_timed(ctdb->ev, ctdb, @@ -3631,3 +3626,13 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb) DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n")); kill(ctdb->recoverd_pid, SIGTERM); } + +static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + + DEBUG(DEBUG_ERR,("Restarting recovery daemon\n")); + ctdb_stop_recoverd(ctdb); + ctdb_start_recoverd(ctdb); +}