From: Ronnie Sahlberg Date: Tue, 1 Mar 2011 01:09:42 +0000 (+1100) Subject: If/when the recovery daemon terminates unexpectedly, try to restart it again from... X-Git-Url: http://git.samba.org/?p=sahlberg%2Fctdb.git;a=commitdiff_plain;h=939e7c925edf14e89f94f7084f35e3992cab08ce If/when the recovery daemon terminates unexpectedly, try to restart it again from the main daemon instead of just shutting down the main daemon too. While it does not address the reason for the recovery daemon shutting down, it reduces the impact of such issues and makes the system more robust. --- diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c index b82f0e77..13dafa7e 100644 --- a/server/ctdb_recoverd.c +++ b/server/ctdb_recoverd.c @@ -70,6 +70,7 @@ struct ctdb_recoverd { #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0) #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0) +static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data); /* ban a node for a period of time @@ -3518,18 +3519,12 @@ static void ctdb_check_recd(struct event_context *ev, struct timed_event *te, struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context); if (kill(ctdb->recoverd_pid, 0) != 0) { - DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid)); + DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. 
Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid)); - ctdb_stop_recoverd(ctdb); - ctdb_stop_keepalive(ctdb); - ctdb_stop_monitoring(ctdb); - ctdb_release_all_ips(ctdb); - if (ctdb->methods != NULL) { - ctdb->methods->shutdown(ctdb); - } - ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN); + event_add_timed(ctdb->ev, ctdb, timeval_zero(), + ctdb_restart_recd, ctdb); - exit(10); + return; } event_add_timed(ctdb->ev, ctdb, @@ -3631,3 +3626,13 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb) DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n")); kill(ctdb->recoverd_pid, SIGTERM); } + +static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, + struct timeval t, void *private_data) +{ + struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); + + DEBUG(DEBUG_ERR,("Restarting recovery daemon\n")); + ctdb_stop_recoverd(ctdb); + ctdb_start_recoverd(ctdb); +}