ctdb-ipalloc: Do not use node count or PNNs from CTDB context
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
index dc5ce7da47cabfb9fbc6a3ec6c24613232c9a9a6..bb548910dd04224bdd18ffb759a31e419a31ae83 100644 (file)
@@ -423,8 +423,6 @@ static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
        TDB_DATA data;
 
        if (status != 0) {
-               struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
-       
                if (status == -ETIME) {
                        ctdb_ban_self(ctdb);
                }
@@ -433,7 +431,6 @@ static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
                                 ctdb_vnn_iface_string(state->vnn)));
                ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
 
-               node->flags |= NODE_FLAGS_UNHEALTHY;
                talloc_free(state);
                return;
        }
@@ -1243,16 +1240,12 @@ create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_s
        TALLOC_FREE(ctdb->ip_tree);
        ctdb->ip_tree = trbt_create(ctdb, 0);
 
-       for (i=0; i < ctdb->num_nodes; i++) {
-
-               if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
-                       continue;
-               }
+       if (ipalloc_state->known_public_ips == NULL) {
+               DEBUG(DEBUG_ERR, ("Known public IPs not set\n"));
+               return NULL;
+       }
 
-               /* there were no public ips for this node */
-               if (ipalloc_state->known_public_ips == NULL) {
-                       continue;
-               }
+       for (i=0; i < ipalloc_state->num; i++) {
 
                public_ips = &ipalloc_state->known_public_ips[i];
 
@@ -1263,7 +1256,7 @@ create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_s
                        CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
                        /* Do not use information about IP addresses hosted
                         * on other nodes, it may not be accurate */
-                       if (public_ips->ip[j].pnn == ctdb->nodes[i]->pnn) {
+                       if (public_ips->ip[j].pnn == i) {
                                tmp_ip->pnn = public_ips->ip[j].pnn;
                        } else {
                                tmp_ip->pnn = -1;
@@ -1554,16 +1547,12 @@ fail:
 
 struct takeover_callback_data {
        uint32_t num_nodes;
-       bool *node_failed;
-       client_async_callback fail_callback;
-       void *fail_callback_data;
+       unsigned int *fail_count; /* zero-initialised array of num_nodes counters; takeover_run_fail_callback increments the entry at the failing node's PNN */
 };
 
 static struct takeover_callback_data *
 takeover_callback_data_init(TALLOC_CTX *mem_ctx,
-                           uint32_t num_nodes,
-                           client_async_callback fail_callback,
-                           void *callback_data)
+                           uint32_t num_nodes)
 {
        static struct takeover_callback_data *takeover_data;
 
@@ -1573,17 +1562,15 @@ takeover_callback_data_init(TALLOC_CTX *mem_ctx,
                return NULL;
        }
 
-       takeover_data->node_failed = talloc_zero_array(takeover_data,
-                                                      bool, num_nodes);
-       if (takeover_data->node_failed == NULL) {
+       takeover_data->fail_count = talloc_zero_array(takeover_data,
+                                                     unsigned int, num_nodes);
+       if (takeover_data->fail_count == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
                talloc_free(takeover_data);
                return NULL;
        }
 
        takeover_data->num_nodes = num_nodes;
-       takeover_data->fail_callback = fail_callback;
-       takeover_data->fail_callback_data = callback_data;
 
        return takeover_data;
 }
@@ -1601,12 +1588,47 @@ static void takeover_run_fail_callback(struct ctdb_context *ctdb,
                return;
        }
 
-       if (!cd->node_failed[node_pnn]) {
+       if (cd->fail_count[node_pnn] == 0) {
                DEBUG(DEBUG_ERR,
                      ("Node %u failed the takeover run\n", node_pnn));
-               cd->node_failed[node_pnn] = true;
-               cd->fail_callback(ctdb, node_pnn, res, outdata,
-                                 cd->fail_callback_data);
+       }
+
+       cd->fail_count[node_pnn]++;
+}
+
+static void takeover_run_process_failures(struct ctdb_context *ctdb,
+                                         struct takeover_callback_data *tcd) /* Pick the node with the highest recorded fail count and broadcast its number on CTDB_SRVID_BANNING. */
+{
+       unsigned int max_fails = 0; /* highest fail count seen so far; 0 means no node failed */
+       uint32_t max_pnn = -1; /* converts to UINT32_MAX; only used below once max_fails > 0 has replaced it */
+       uint32_t i;
+
+       for (i = 0; i < tcd->num_nodes; i++) {
+               if (tcd->fail_count[i] > max_fails) { /* strict '>' keeps the lowest index when counts tie */
+                       max_pnn = i;
+                       max_fails = tcd->fail_count[i];
+               }
+       }
+
+       if (max_fails > 0) { /* nothing is logged or sent when every counter is zero */
+               int ret;
+               TDB_DATA data;
+
+               DEBUG(DEBUG_ERR,
+                     ("Sending banning credits to %u with fail count %u\n",
+                      max_pnn, max_fails));
+
+               data.dptr = (uint8_t *)&max_pnn; /* payload is the selected node number as raw bytes of a local variable */
+               data.dsize = sizeof(uint32_t);
+               ret = ctdb_client_send_message(ctdb,
+                                              CTDB_BROADCAST_CONNECTED,
+                                              CTDB_SRVID_BANNING,
+                                              data);
+               if (ret != 0) { /* a send failure is only logged; the function returns void, so callers cannot observe it */
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to set banning credits for node %u\n",
+                              max_pnn));
+               }
+       }
+}
 
@@ -1639,8 +1661,7 @@ static void takeover_run_fail_callback(struct ctdb_context *ctdb,
  * - Send IPREALLOCATED to all nodes (with backward compatibility hack)
  */
 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
-                     uint32_t *force_rebalance_nodes,
-                     client_async_callback fail_callback, void *callback_data)
+                     uint32_t *force_rebalance_nodes)
 {
        int i, ret;
        struct ctdb_public_ip ip;
@@ -1660,9 +1681,7 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
         * following steps will cause an early return, so this can be
         * reused for each of those steps without re-initialising. */
        takeover_data = takeover_callback_data_init(tmp_ctx,
-                                                   nodemap->num,
-                                                   fail_callback,
-                                                   callback_data);
+                                                   nodemap->num);
        if (takeover_data == NULL) {
                talloc_free(tmp_ctx);
                return -1;
@@ -1774,9 +1793,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                }
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
-               talloc_free(tmp_ctx);
-               return -1;
+               DEBUG(DEBUG_ERR,
+                     ("Async control CTDB_CONTROL_RELEASE_IP failed\n"));
+               goto fail;
        }
        talloc_free(async_data);
 
@@ -1815,9 +1834,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                ctdb_client_async_add(async_data, state);
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
-               talloc_free(tmp_ctx);
-               return -1;
+               DEBUG(DEBUG_ERR,
+                     ("Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
+               goto fail;
        }
 
 ipreallocated:
@@ -1837,10 +1856,16 @@ ipreallocated:
        if (ret != 0) {
                DEBUG(DEBUG_ERR,
                      ("Async CTDB_CONTROL_IPREALLOCATED control failed\n"));
+               goto fail;
        }
 
        talloc_free(tmp_ctx);
        return ret;
+
+fail:
+       takeover_run_process_failures(ctdb, takeover_data);
+       talloc_free(tmp_ctx);
+       return -1;
 }