ctdb-recoverd: Factor out recovery master validation
authorMartin Schwenke <martin@meltin.net>
Tue, 27 Oct 2015 05:43:07 +0000 (16:43 +1100)
committerAmitay Isaacs <amitay@samba.org>
Mon, 16 Nov 2015 10:47:44 +0000 (11:47 +0100)
Starting to untangle cluster management, database recovery and public
IP allocation.  This is a non-trivial subset of the cluster management
code that runs in the recovery daemon on all nodes.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Autobuild-User(master): Amitay Isaacs <amitay@samba.org>
Autobuild-Date(master): Mon Nov 16 11:47:45 CET 2015 on sn-devel-104

ctdb/server/ctdb_recoverd.c

index dd6e8b93d0b22836bb7b31921be429aa6bf78541..3f0cffc36d32783ec3969ed6ab09ff0f4362fa62 100644 (file)
@@ -3330,12 +3330,98 @@ static int update_recovery_lock_file(struct ctdb_context *ctdb)
        return 0;
 }
 
+static enum monitor_result validate_recovery_master(struct ctdb_recoverd *rec,
+                                                   TALLOC_CTX *mem_ctx)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       uint32_t pnn = ctdb_get_pnn(ctdb);
+       struct ctdb_node_map_old *nodemap = rec->nodemap;
+       struct ctdb_node_map_old *recmaster_nodemap = NULL;
+       int ret;
+
+       /* When recovery daemon is started, recmaster is set to
+        * "unknown" so it knows to start an election.
+        */
+       if (rec->recmaster == CTDB_UNKNOWN_PNN) {
+               DEBUG(DEBUG_NOTICE,
+                     ("Initial recovery master set - forcing election\n"));
+               return MONITOR_ELECTION_NEEDED;
+       }
+
+       /*
+        * If the current recmaster does not have CTDB_CAP_RECMASTER,
+        * but we have, then force an election and try to become the new
+        * recmaster.
+        */
+       if (!ctdb_node_has_capabilities(rec->caps,
+                                       rec->recmaster,
+                                       CTDB_CAP_RECMASTER) &&
+           (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
+           !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
+               DEBUG(DEBUG_ERR,
+                     (" Current recmaster node %u does not have CAP_RECMASTER,"
+                      " but we (node %u) have - force an election\n",
+                      rec->recmaster, pnn));
+               return MONITOR_ELECTION_NEEDED;
+       }
+
+       /* Verify that the master node has not been deleted.  This
+        * should not happen because a node should always be shutdown
+        * before being deleted, causing a new master to be elected
+        * before now.  However, if something strange has happened
+        * then checking here will ensure we don't index beyond the
+        * end of the nodemap array. */
+       if (rec->recmaster >= nodemap->num) {
+               DEBUG(DEBUG_ERR,
+                     ("Recmaster node %u has been deleted. Force election\n",
+                      rec->recmaster));
+               return MONITOR_ELECTION_NEEDED;
+       }
+
+       /* if recovery master is disconnected/deleted we must elect a new recmaster */
+       if (nodemap->nodes[rec->recmaster].flags &
+           (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
+               DEBUG(DEBUG_NOTICE,
+                     ("Recmaster node %u is disconnected/deleted. Force election\n",
+                      rec->recmaster));
+               return MONITOR_ELECTION_NEEDED;
+       }
+
+       /* get nodemap from the recovery master to check if it is inactive */
+       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
+                                  mem_ctx, &recmaster_nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,
+                     (__location__
+                      " Unable to get nodemap from recovery master %u\n",
+                         rec->recmaster));
+               return MONITOR_FAILED;
+       }
+
+
+       if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
+           (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
+               DEBUG(DEBUG_NOTICE,
+                     ("Recmaster node %u is inactive. Force election\n",
+                      rec->recmaster));
+               /*
+                * update our nodemap to carry the recmaster's notion of
+                * its own flags, so that we don't keep freezing the
+                * inactive recmaster node...
+                */
+               nodemap->nodes[rec->recmaster].flags =
+                       recmaster_nodemap->nodes[rec->recmaster].flags;
+               return MONITOR_ELECTION_NEEDED;
+       }
+
+       return MONITOR_OK;
+}
+
 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                      TALLOC_CTX *mem_ctx)
 {
        uint32_t pnn;
        struct ctdb_node_map_old *nodemap=NULL;
-       struct ctdb_node_map_old *recmaster_nodemap=NULL;
        struct ctdb_node_map_old **remote_nodemaps=NULL;
        struct ctdb_vnn_map *vnnmap=NULL;
        struct ctdb_vnn_map *remote_vnnmap=NULL;
@@ -3468,81 +3554,16 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                return;
        }
 
-       /* When recovery daemon is started, recmaster is set to
-        * "unknown" so it knows to start an election.
-        */
-       if (rec->recmaster == CTDB_UNKNOWN_PNN) {
-               DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
-               force_election(rec, pnn, nodemap);
-               return;
-       }
-
-       /*
-        * If the current recmaster does not have CTDB_CAP_RECMASTER,
-        * but we have, then force an election and try to become the new
-        * recmaster.
-        */
-       if (!ctdb_node_has_capabilities(rec->caps,
-                                       rec->recmaster,
-                                       CTDB_CAP_RECMASTER) &&
-           (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
-           !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
-               DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
-                                 " but we (node %u) have - force an election\n",
-                                 rec->recmaster, pnn));
-               force_election(rec, pnn, nodemap);
-               return;
-       }
-
-       /* Verify that the master node has not been deleted.  This
-        * should not happen because a node should always be shutdown
-        * before being deleted, causing a new master to be elected
-        * before now.  However, if something strange has happened
-        * then checking here will ensure we don't index beyond the
-        * end of the nodemap array. */
-       if (rec->recmaster >= nodemap->num) {
-               DEBUG(DEBUG_ERR,
-                     ("Recmaster node %u has been deleted. Force election\n",
-                      rec->recmaster));
-               force_election(rec, pnn, nodemap);
+       switch (validate_recovery_master(rec, mem_ctx)) {
+       case MONITOR_RECOVERY_NEEDED:
+               /* can not happen */
                return;
-       }
-
-       /* if recovery master is disconnected/deleted we must elect a new recmaster */
-       if (nodemap->nodes[rec->recmaster].flags &
-           (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
-               DEBUG(DEBUG_NOTICE,
-                     ("Recmaster node %u is disconnected/deleted. Force election\n",
-                      rec->recmaster));
+       case MONITOR_ELECTION_NEEDED:
                force_election(rec, pnn, nodemap);
                return;
-       }
-
-       /* get nodemap from the recovery master to check if it is inactive */
-       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
-                                  mem_ctx, &recmaster_nodemap);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR,
-                     (__location__
-                      " Unable to get nodemap from recovery master %u\n",
-                         rec->recmaster));
-               return;
-       }
-
-
-       if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
-           (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
-               DEBUG(DEBUG_NOTICE,
-                     ("Recmaster node %u is inactive. Force election\n",
-                      rec->recmaster));
-               /*
-                * update our nodemap to carry the recmaster's notion of
-                * its own flags, so that we don't keep freezing the
-                * inactive recmaster node...
-                */
-               nodemap->nodes[rec->recmaster].flags =
-                       recmaster_nodemap->nodes[rec->recmaster].flags;
-               force_election(rec, pnn, nodemap);
+       case MONITOR_OK:
+               break;
+       case MONITOR_FAILED:
                return;
        }