ctdb-recovery: Ban a node that causes recovery failure
authorMartin Schwenke <martin@meltin.net>
Mon, 29 Oct 2018 03:33:08 +0000 (14:33 +1100)
committerAmitay Isaacs <amitay@samba.org>
Mon, 5 Nov 2018 05:52:33 +0000 (06:52 +0100)
... instead of applying banning credits.

There have been a couple of cases where recovery repeatedly takes just
over 2 minutes to fail.  Therefore, banning credits expire between
failures and a continuously problematic node is never banned,
resulting in endless recoveries.  This is because it takes 2
applications of banning credits before a node is banned, which
generally involves 2 recovery failures.

The recovery helper makes up to 3 attempts to recover each database
during a single run.  If a node causes 3 failures then this is really
equivalent to 3 recovery failures in the model that existed before the
recovery helper added retries.  In that case the node would have been
banned after 2 failures.

So, instead of applying banning credits to the "most failing" node,
simply ban it directly from the recovery helper.

If multiple nodes are causing recovery failures then this can cause a
node to be banned more quickly than it might otherwise have been, even
pre-recovery-helper.  However, 90 seconds (i.e. 3 failures) is a long
time to be in recovery, so banning earlier seems like the best
approach.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=13670

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Autobuild-User(master): Amitay Isaacs <amitay@samba.org>
Autobuild-Date(master): Mon Nov  5 06:52:33 CET 2018 on sn-devel-144

ctdb/server/ctdb_recovery_helper.c

index 7495eb3a6741d82b09030557cb0526a902378f40..7fdcc2e5a29ea0414bb8fe84a44e6fa9415b3cba 100644 (file)
@@ -2571,22 +2571,28 @@ static void recovery_db_recovery_done(struct tevent_req *subreq)
 
                /* If pulling database fails multiple times */
                if (max_credits >= NUM_RETRIES) {
-                       struct ctdb_req_message message;
-
-                       D_ERR("Assigning banning credits to node %u\n",
-                             max_pnn);
-
-                       message.srvid = CTDB_SRVID_BANNING;
-                       message.data.pnn = max_pnn;
-
-                       subreq = ctdb_client_message_send(
-                                       state, state->ev, state->client,
-                                       ctdb_client_pnn(state->client),
-                                       &message);
+                       struct ctdb_ban_state ban_state = {
+                               .pnn = max_pnn,
+                               .time = state->tun_list->recovery_ban_period,
+                       };
+
+                       D_ERR("Banning node %u for %u seconds\n",
+                             ban_state.pnn,
+                             ban_state.time);
+
+                       ctdb_req_control_set_ban_state(&request,
+                                                      &ban_state);
+                       subreq = ctdb_client_control_send(state,
+                                                         state->ev,
+                                                         state->client,
+                                                         ban_state.pnn,
+                                                         TIMEOUT(),
+                                                         &request);
                        if (tevent_req_nomem(subreq, req)) {
                                return;
                        }
-                       tevent_req_set_callback(subreq, recovery_failed_done,
+                       tevent_req_set_callback(subreq,
+                                               recovery_failed_done,
                                                req);
                } else {
                        tevent_req_error(req, EIO);
@@ -2609,15 +2615,25 @@ static void recovery_failed_done(struct tevent_req *subreq)
 {
        struct tevent_req *req = tevent_req_callback_data(
                subreq, struct tevent_req);
+       struct recovery_state *state = tevent_req_data(
+               req, struct recovery_state);
+       struct ctdb_reply_control *reply;
        int ret;
        bool status;
 
-       status = ctdb_client_message_recv(subreq, &ret);
+       status = ctdb_client_control_recv(subreq, &ret, state, &reply);
        TALLOC_FREE(subreq);
        if (! status) {
-               D_ERR("failed to assign banning credits, ret=%d\n", ret);
+               D_ERR("failed to ban node, ret=%d\n", ret);
+               goto done;
+       }
+
+       ret = ctdb_reply_control_set_ban_state(reply);
+       if (ret != 0) {
+               D_ERR("control SET_BAN_STATE failed, ret=%d\n", ret);
        }
 
+done:
        tevent_req_error(req, EIO);
 }