recoverd: When starting a takeover run disable IP verification
author: Martin Schwenke <martin@meltin.net>
Thu, 30 Aug 2012 23:34:17 +0000 (09:34 +1000)
committer: Martin Schwenke <martin@meltin.net>
Thu, 11 Oct 2012 01:10:45 +0000 (12:10 +1100)
Disable for TakeoverTimeout seconds.

Otherwise the recovery daemon can get overzealous and start trying
to add/delete addresses that it thinks are missing but where the
eventscript just hasn't finished.  This didn't use to matter so much
but it is more important now that concurrent takeip/releaseip/updateip
generate errors - we want to avoid spamming the log.

Signed-off-by: Martin Schwenke <martin@meltin.net>
server/ctdb_recoverd.c
server/ctdb_takeover.c

index 02ce69fd23a4ab8748f94ae314781595ab36acd0..1153a40c704c2c3f72413f796c81cd76cadbaba2 100644 (file)
@@ -2174,6 +2174,12 @@ static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
        }
 
        timeout = *((uint32_t *)data.dptr);
+
+       if (timeout == 0) {
+               DEBUG(DEBUG_NOTICE,("Reenabling ip check\n"));
+               return;
+       }
+               
        DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
 
        rec->ip_check_disable_ctx = talloc_new(rec);
index b3e98d5f7aa3aaf812611085c7076152b611e80a..4fcddd1cbbe324b49d56f5200b9038ab950ebd48 100644 (file)
@@ -2161,6 +2161,7 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
        struct client_async_data *async_data;
        struct ctdb_client_control_state *state;
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       uint32_t disable_timeout;
 
        /*
         * ip failover is completely disabled, just send out the 
@@ -2197,6 +2198,19 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
        /* Do the IP reassignment calculations */
        ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
 
+       /* The recovery daemon does regular sanity checks of the IPs.
+        * However, sometimes it is overzealous and thinks changes are
+        * required when they're already underway.  This stops the
+        * checks for a while before we start moving IPs.
+        */
+       disable_timeout = ctdb->tunable.takeover_timeout;
+       data.dptr  = (uint8_t*)&disable_timeout;
+       data.dsize = sizeof(disable_timeout);
+       if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
+                                    CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
+               DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
+       }
+
        /* now tell all nodes to delete any alias that they should not
           have.  This will be a NOOP on nodes that don't currently
           hold the given alias */