recoverd: Track the nodes that fail takeover run and set culprit count

author Amitay Isaacs <amitay@gmail.com>

Tue, 23 Oct 2012 05:23:12 +0000 (16:23 +1100)

committer Amitay Isaacs <amitay@gmail.com>

Thu, 22 Nov 2012 02:02:26 +0000 (13:02 +1100)
author Amitay Isaacs <amitay@gmail.com>
Tue, 23 Oct 2012 05:23:12 +0000 (16:23 +1100)
committer Amitay Isaacs <amitay@gmail.com>
Thu, 22 Nov 2012 02:02:26 +0000 (13:02 +1100)
diff --git a/include/ctdb_private.h b/include/ctdb_private.h

index e3a9a1577ada71504f7893bf23b3daa21371520a..32e8c6880b64e766c8256e2b89ce2fe5493261a7 100644 (file)
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -1118,6 +1118,8 @@ int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
                       const ctdb_sock_addr *src,
                       uint32_t seq, uint32_t ack, int rst);
  
+typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data);
+
  int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist);
  int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
                               const char *iface,
@@ -1125,7 +1127,7 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
  int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script);
  int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir);
  int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script);
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap);
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, client_async_callback fail_callback, void *callback_data);
  
  int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id, 
                                 TDB_DATA indata);
@@ -1264,8 +1266,6 @@ int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
  int ctdb_set_child_logging(struct ctdb_context *ctdb);
  void ctdb_lockdown_memory(struct ctdb_context *ctdb);
  
-typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data);
-
  struct client_async_data {
         enum ctdb_controls opcode;
         bool dont_log_errors;
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c

index 4963c3f78b7b7f6ae9068d9f874c70f7cae0f434..336a9a793f9e64d5e63a9480207794adc919187a 100644 (file)
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -1344,6 +1344,21 @@ static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
  }
  
  
+/*
+ * Called for every node that failed to execute ctdb_takeover_run(): marks that
+ * node as a culprit and sets the flag that requests another takeover run.
+ */
+static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
+
+       ctdb_set_culprit(rec, node_pnn);
+       rec->need_takeover_run = true;
+}
+
+
  /*
    we are the recmaster, and recovery is needed - start a recovery run
   */
@@ -1631,7 +1646,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
                 return -1;
         }
         rec->need_takeover_run = false;
-       ret = ctdb_takeover_run(ctdb, nodemap);
+       ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
                 rec->need_takeover_run = true;
@@ -2046,7 +2061,7 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb
                 rec->need_takeover_run = true;
         }
         if (ret == 0) {
-               ret = ctdb_takeover_run(ctdb, rec->nodemap);
+               ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
                         rec->need_takeover_run = true;
@@ -3412,9 +3427,18 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                         return;
                 }
  
-               ret = ctdb_takeover_run(ctdb, nodemap);
+               /* If the takeover run fails, the offending nodes are
+                * assigned ban culprit counts and the takeover is
+                * retried. A node that fails repeatedly will eventually
+                * get banned.
+                *
+                * If rec->need_takeover_run is not set to true on this
+                * failure, monitoring stays disabled cluster-wide (via the
+                * startrecovery eventscript) and is never re-enabled.
+                */
+               ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
                 if (ret != 0) {
-                       DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
                         return;
                 }
  
diff --git a/server/ctdb_takeover.c b/server/ctdb_takeover.c

index 9cbafaf595acd9e878f885cda155ca727802b415..e17b481ef0bfc3dabfa28c3984f8f1cfab2e8e5c 100644 (file)
--- a/server/ctdb_takeover.c
+++ b/server/ctdb_takeover.c
@@ -1933,7 +1933,8 @@ finished:
  /*
    make any IP alias changes for public addresses that are necessary 
   */
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+                     client_async_callback fail_callback, void *callback_data)
  {
         int i;
         struct ctdb_public_ip ip;
@@ -1965,6 +1966,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
         async_data = talloc_zero(tmp_ctx, struct client_async_data);
         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
  
+       async_data->fail_callback = fail_callback;
+       async_data->callback_data = callback_data;
+
         for (i=0;i<nodemap->num;i++) {
                 /* don't talk to unconnected nodes, but do talk to banned nodes */
                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
@@ -2022,6 +2026,10 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
         /* tell all nodes to get their own IPs */
         async_data = talloc_zero(tmp_ctx, struct client_async_data);
         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+
+       async_data->fail_callback = fail_callback;
+       async_data->callback_data = callback_data;
+
         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                 if (tmp_ip->pnn == -1) {
                         /* this IP won't be taken over */
@@ -2074,8 +2082,8 @@ ipreallocated:
         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
                                       nodes, 0, TAKEOVER_TIMEOUT(),
                                       false, data,
-                                     NULL, NULL,
-                                     NULL) != 0) {
+                                     NULL, fail_callback,
+                                     callback_data) != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
         }
author	Amitay Isaacs <amitay@gmail.com>
	Tue, 23 Oct 2012 05:23:12 +0000 (16:23 +1100)
committer	Amitay Isaacs <amitay@gmail.com>
	Thu, 22 Nov 2012 02:02:26 +0000 (13:02 +1100)
include/ctdb_private.h		patch | blob | history
server/ctdb_recoverd.c		patch | blob | history
server/ctdb_takeover.c		patch | blob | history