When starting up ctdbd, wait until all initial recoveries have finished
author Ronnie Sahlberg <ronniesahlberg@gmail.com>
Tue, 1 Dec 2009 02:19:58 +0000 (13:19 +1100)
committer Ronnie Sahlberg <ronniesahlberg@gmail.com>
Tue, 1 Dec 2009 02:19:58 +0000 (13:19 +1100)
and until we have gone through a full re-recovery timeout without any further
recoveries being triggered, before we start up the services and start monitoring
the node.

server/ctdb_monitor.c

index 437b9d894dfaeae7cdfed7982cd510462ab6e79f..f4223772b6c8cef06f8d8aff0d7111fe31b2f18a 100644 (file)
@@ -204,6 +204,51 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p
 }
 
 
+/*
+  timed-event handler: re-arms itself every second until initial recoveries have
+  finished, then schedules ctdb_check_health. private_data is the ctdb_context
+ */
+static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_event *te, 
+                             struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+       DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
+       /* the vnn map has no valid generation yet: check again later */
+       if (ctdb->vnn_map->generation == INVALID_GENERATION) {
+               DEBUG(DEBUG_NOTICE,(__location__ " generation is INVALID. Wait one more second\n"));
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                    timeval_current_ofs(1, 0), 
+                                    ctdb_wait_until_recovered, ctdb);
+               return;
+       }
+       /* recovery mode is not CTDB_RECOVERY_NORMAL: check again later */
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                    timeval_current_ofs(1, 0), 
+                                    ctdb_wait_until_recovered, ctdb);
+               return;
+       }
+
+       /* retry while less than rerecovery_timeout + 3 (presumably seconds) has passed since last_recovery_finished */
+       if (timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
+               DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));
+
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                    timeval_current_ofs(1, 0), 
+                                    ctdb_wait_until_recovered, ctdb);
+               return;
+       }
+
+       /* all checks passed. NOTE(review): no event_add_timed result in this function is checked */
+       DEBUG(DEBUG_NOTICE,(__location__ " Recoveries finished. Running the \"startup\" event.\n"));
+       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                            timeval_current_ofs(1, 0), 
+                            ctdb_check_health, ctdb);
+}
+
+
 /*
   see if the event scripts think we are healthy
  */
@@ -301,8 +346,6 @@ void ctdb_stop_monitoring(struct ctdb_context *ctdb)
  */
 void ctdb_start_monitoring(struct ctdb_context *ctdb)
 {
-       struct timed_event *te;
-
        if (ctdb->monitor != NULL) {
                return;
        }
@@ -315,10 +358,9 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb)
        ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
        CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
 
-       te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
                             timeval_current_ofs(1, 0), 
-                            ctdb_check_health, ctdb);
-       CTDB_NO_MEMORY_FATAL(ctdb, te);
+                            ctdb_wait_until_recovered, ctdb);
 
        ctdb->monitor->monitoring_mode  = CTDB_MONITORING_ACTIVE;
        DEBUG(DEBUG_NOTICE,("Monitoring has been started\n"));