recoverd: Log node that causes takeover run to fail
author Martin Schwenke <martin@meltin.net>
Fri, 31 May 2013 04:55:07 +0000 (14:55 +1000)
committer Amitay Isaacs <amitay@gmail.com>
Wed, 14 Aug 2013 06:15:13 +0000 (16:15 +1000)
Extend takeover_fail_callback() to just log (and not do any ban
processing) when the callback data is NULL.  Always call
ctdb_takeover_run() with the callback so that useful errors are always
logged.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Pair-programmed-with: Amitay Isaacs <amitay@gmail.com>
(cherry picked from commit c429394afbabaee09f9216dc743419adddf523ea)

server/ctdb_recoverd.c

index 15d7bbe04a50d8972908ad9427a5535506669307..9f2c71ca639f496859ef481cad02f66d31c399b2 100644 (file)
@@ -1483,12 +1483,16 @@ static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
  */
 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 {
-       struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+       DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
 
-       DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
+       if (callback_data != NULL) {
+               struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 
-       ctdb_set_culprit(rec, node_pnn);
-       rec->need_takeover_run = true;
+               DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
+
+               ctdb_set_culprit(rec, node_pnn);
+               rec->need_takeover_run = true;
+       }
 }
 
 
@@ -1798,7 +1802,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
                return -1;
        }
        rec->need_takeover_run = false;
-       ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
+       ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, NULL);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
                rec->need_takeover_run = true;
@@ -2117,7 +2121,7 @@ static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event
 
        DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
 
-       ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
+       ret = ctdb_takeover_run(ctdb, rec->nodemap, takeover_fail_callback, NULL);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
                rec->need_takeover_run = true;
@@ -2264,7 +2268,7 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb
                rec->need_takeover_run = true;
        }
        if (ret == 0) {
-               ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
+               ret = ctdb_takeover_run(ctdb, rec->nodemap, takeover_fail_callback, NULL);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
                        rec->need_takeover_run = true;