ctdb:recoverd: fix endless banning due to non-frozen DBs.
author Michael Adam <obnox@samba.org>
Tue, 31 May 2016 23:19:43 +0000 (01:19 +0200)
committer Michael Adam <obnox@samba.org>
Wed, 1 Jun 2016 01:49:46 +0000 (03:49 +0200)
When the banned node got marked RECOVERY_ACTIVE, but
freezing the DBs failed (e.g. if banning happened
while recovery was set to active but the DBs were not
frozen), then the freezing will never be tried again,
and the node will keep banning itself indefinitely,
until ctdbd is restarted.

This is a regression from 4.3, introduced with

b4357a79d916b1f8ade8fa78563fbef0ce670aa9

and

d8f3b490bbb691c9916eed0df5b980c1aef23c85

This change lets the main loop in the banned case keep
trying to freeze the DBs, hence avoiding the endless loop.
Note that we currently have no means to tell in the
recovery daemon whether the DBs are frozen, so we
send the freeze control each time.

Signed-off-by: Michael Adam <obnox@samba.org>
ctdb/server/ctdb_recoverd.c

index 09940dc32f881b05a277b5299510fa9bf576e8f0..001d32e19df4694817e3946b9254cb4086556eca 100644 (file)
@@ -3542,7 +3542,9 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                        DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
                }
                if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
-                       DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
+                       DEBUG(DEBUG_ERR, ("Node is stopped or banned but "
+                             "recovery mode is not active. "
+                             "Activate recovery.\n"));
 
                        ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
                        if (ret != 0) {
@@ -3550,11 +3552,20 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 
                                return;
                        }
-                       ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
-                       if (ret != 0) {
-                               DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
-                               return;
-                       }
+               }
+
+               /*
+                * Make sure that the databases get frozen or we will
+                * never come out of banning!
+                * We currently have no way of telling whether freezing
+                * has completed here in the recovery daemon, so we just
+                * send the freeze out unconditionally. A banned node
+                * does not have anything useful to do anyways...
+                */
+               ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
+                       return;
                }
 
                /* If this node is stopped or banned then it is not the recovery