ctdb-daemon: Move interface addition into interface parsing
[metze/samba/wip.git] / ctdb / server / ctdb_takeover.c
index cf67278ab2c4333cf0bae4bc3ea97153d057ab9a..f34803217f0b3b79c28e5920eda7199a2929bc5f 100644 (file)
@@ -55,13 +55,47 @@ struct ctdb_interface {
        uint32_t references;
 };
 
-static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
+/* state associated with a public ip address */
+struct ctdb_vnn {
+       struct ctdb_vnn *prev, *next;
+
+       struct ctdb_interface *iface;
+       const char **ifaces;
+       ctdb_sock_addr public_address;
+       uint8_t public_netmask_bits;
+
+       /* the node number that is serving this public address, if any.
+          If no node serves this ip it is set to -1 */
+       int32_t pnn;
+
+       /* List of clients to tickle for this public address */
+       struct ctdb_tcp_array *tcp_array;
+
+       /* whether we need to update the other nodes with changes to our list
+          of connected clients */
+       bool tcp_update_needed;
+
+       /* a context to hang sending gratious arp events off */
+       TALLOC_CTX *takeover_ctx;
+
+       /* Set to true any time an update to this VNN is in flight.
+          This helps to avoid races. */
+       bool update_in_flight;
+
+       /* If CTDB_CONTROL_DEL_PUBLIC_IP is received for this IP
+        * address then this flag is set.  It will be deleted in the
+        * release IP callback. */
+       bool delete_pending;
+};
+
+static const char *iface_string(const struct ctdb_interface *iface)
 {
-       if (vnn->iface) {
-               return vnn->iface->name;
-       }
+       return (iface != NULL ? iface->name : "__none__");
+}
 
-       return "__none__";
+static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
+{
+       return iface_string(vnn->iface);
 }
 
 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
@@ -82,9 +116,16 @@ static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
 
        /* create a new structure for this interface */
        i = talloc_zero(ctdb, struct ctdb_interface);
-       CTDB_NO_MEMORY_FATAL(ctdb, i);
+       if (i == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return -1;
+       }
        i->name = talloc_strdup(i, iface);
-       CTDB_NO_MEMORY(ctdb, i->name);
+       if (i->name == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               talloc_free(i);
+               return -1;
+       }
 
        i->link_up = true;
 
@@ -400,12 +441,6 @@ static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
        return 0;
 }
 
-struct takeover_callback_state {
-       struct ctdb_req_control_old *c;
-       ctdb_sock_addr *addr;
-       struct ctdb_vnn *vnn;
-};
-
 struct ctdb_do_takeip_state {
        struct ctdb_req_control_old *c;
        struct ctdb_vnn *vnn;
@@ -423,8 +458,6 @@ static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
        TDB_DATA data;
 
        if (status != 0) {
-               struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
-       
                if (status == -ETIME) {
                        ctdb_ban_self(ctdb);
                }
@@ -433,7 +466,6 @@ static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
                                 ctdb_vnn_iface_string(state->vnn)));
                ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
 
-               node->flags |= NODE_FLAGS_UNHEALTHY;
                talloc_free(state);
                return;
        }
@@ -498,7 +530,7 @@ static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
        state = talloc(vnn, struct ctdb_do_takeip_state);
        CTDB_NO_MEMORY(ctdb, state);
 
-       state->c = talloc_steal(ctdb, c);
+       state->c = NULL;
        state->vnn   = vnn;
 
        vnn->update_in_flight = true;
@@ -527,6 +559,7 @@ static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
                return -1;
        }
 
+       state->c = talloc_steal(ctdb, c);
        return 0;
 }
 
@@ -550,10 +583,11 @@ static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
                if (status == -ETIME) {
                        ctdb_ban_self(ctdb);
                }
-               DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
-                       ctdb_addr_to_str(&state->vnn->public_address),
-                       state->old->name,
-                       ctdb_vnn_iface_string(state->vnn)));
+               DEBUG(DEBUG_ERR,
+                     ("Failed update of IP %s from interface %s to %s\n",
+                      ctdb_addr_to_str(&state->vnn->public_address),
+                      iface_string(state->old),
+                      ctdb_vnn_iface_string(state->vnn)));
 
                /*
                 * All we can do is reset the old interface
@@ -601,6 +635,7 @@ static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
        int ret;
        struct ctdb_do_updateip_state *state;
        struct ctdb_interface *old = vnn->iface;
+       const char *old_name = iface_string(old);
        const char *new_name;
 
        if (vnn->update_in_flight) {
@@ -614,16 +649,17 @@ static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ret = ctdb_vnn_assign_iface(ctdb, vnn);
        if (ret != 0) {
-               DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
-                                "assin a usable interface (old iface '%s')\n",
+               DEBUG(DEBUG_ERR,("Update of IP %s/%u failed to "
+                                "assign a usable interface (old iface '%s')\n",
                                 ctdb_addr_to_str(&vnn->public_address),
                                 vnn->public_netmask_bits,
-                                old->name));
+                                old_name));
                return -1;
        }
 
        new_name = ctdb_vnn_iface_string(vnn);
-       if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
+       if (old_name != NULL && new_name != NULL &&
+           strcmp(old_name, new_name) == 0) {
                /* A benign update from one interface onto itself.
                 * no need to run the eventscripts in this case, just return
                 * success.
@@ -635,7 +671,7 @@ static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
        state = talloc(vnn, struct ctdb_do_updateip_state);
        CTDB_NO_MEMORY(ctdb, state);
 
-       state->c = talloc_steal(ctdb, c);
+       state->c = NULL;
        state->old = old;
        state->vnn = vnn;
 
@@ -646,7 +682,7 @@ static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
                            "interface %s to %s\n",
                            ctdb_addr_to_str(&vnn->public_address),
                            vnn->public_netmask_bits,
-                           old->name,
+                           old_name,
                            new_name));
 
        ret = ctdb_event_script_callback(ctdb,
@@ -655,18 +691,20 @@ static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
                                         state,
                                         CTDB_EVENT_UPDATE_IP,
                                         "%s %s %s %u",
-                                        state->old->name,
+                                        old_name,
                                         new_name,
                                         ctdb_addr_to_str(&vnn->public_address),
                                         vnn->public_netmask_bits);
        if (ret != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
-                                ctdb_addr_to_str(&vnn->public_address),
-                                old->name, new_name));
+               DEBUG(DEBUG_ERR,
+                     ("Failed update IP %s from interface %s to %s\n",
+                      ctdb_addr_to_str(&vnn->public_address),
+                      old_name, new_name));
                talloc_free(state);
                return -1;
        }
 
+       state->c = talloc_steal(ctdb, c);
        return 0;
 }
 
@@ -733,19 +771,6 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
                return -1;
        }
 
-       if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
-               DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
-               have_ip = false;
-       }
-
-
-       if (vnn->iface == NULL && have_ip) {
-               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
-                                 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
-                                ctdb_addr_to_str(&vnn->public_address)));
-               return 0;
-       }
-
        if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
                DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
                                  "and we have it on iface[%s], but it was assigned to node %d"
@@ -757,12 +782,16 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
        }
 
        if (vnn->pnn == -1 && have_ip) {
-               vnn->pnn = ctdb->pnn;
-               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
-                                 "and we already have it on iface[%s], update local daemon\n",
-                                ctdb_addr_to_str(&vnn->public_address),
-                                 ctdb_vnn_iface_string(vnn)));
-               return 0;
+               /* This will cause connections to be reset and
+                * reestablished.  However, this is a very unusual
+                * situation and doing this will completely repair the
+                * inconsistency in the VNN.
+                */
+               DEBUG(DEBUG_WARNING,
+                     (__location__
+                      " Doing updateip for IP %s already on an interface\n",
+                      ctdb_addr_to_str(&vnn->public_address)));
+               do_updateip = true;
        }
 
        if (vnn->iface) {
@@ -820,15 +849,47 @@ static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
        talloc_free(vnn);
 }
 
+static struct ctdb_vnn *release_ip_post(struct ctdb_context *ctdb,
+                                       struct ctdb_vnn *vnn,
+                                       ctdb_sock_addr *addr)
+{
+       TDB_DATA data;
+
+       /* Send a message to all clients of this node telling them
+        * that the cluster has been reconfigured and they should
+        * close any connections on this IP address
+        */
+       data.dptr = (uint8_t *)ctdb_addr_to_str(addr);
+       data.dsize = strlen((char *)data.dptr)+1;
+       DEBUG(DEBUG_INFO, ("Sending RELEASE_IP message for %s\n", data.dptr));
+       ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
+
+       ctdb_vnn_unassign_iface(ctdb, vnn);
+
+       /* Process the IP if it has been marked for deletion */
+       if (vnn->delete_pending) {
+               do_delete_ip(ctdb, vnn);
+               return NULL;
+       }
+
+       return vnn;
+}
+
+struct release_ip_callback_state {
+       struct ctdb_req_control_old *c;
+       ctdb_sock_addr *addr;
+       struct ctdb_vnn *vnn;
+       uint32_t target_pnn;
+};
+
 /*
   called when releaseip event finishes
  */
-static void release_ip_callback(struct ctdb_context *ctdb, int status, 
+static void release_ip_callback(struct ctdb_context *ctdb, int status,
                                void *private_data)
 {
-       struct takeover_callback_state *state = 
-               talloc_get_type(private_data, struct takeover_callback_state);
-       TDB_DATA data;
+       struct release_ip_callback_state *state =
+               talloc_get_type(private_data, struct release_ip_callback_state);
 
        if (status == -ETIME) {
                ctdb_ban_self(ctdb);
@@ -846,31 +907,15 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
                }
        }
 
-       /* send a message to all clients of this node telling them
-          that the cluster has been reconfigured and they should
-          release any sockets on this IP */
-       data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
-       CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
-       data.dsize = strlen((char *)data.dptr)+1;
-
-       DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
-
-       ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
-
-       ctdb_vnn_unassign_iface(ctdb, state->vnn);
-
-       /* Process the IP if it has been marked for deletion */
-       if (state->vnn->delete_pending) {
-               do_delete_ip(ctdb, state->vnn);
-               state->vnn = NULL;
-       }
+       state->vnn->pnn = state->target_pnn;
+       state->vnn = release_ip_post(ctdb, state->vnn, state->addr);
 
        /* the control succeeded */
        ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
        talloc_free(state);
 }
 
-static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
+static int ctdb_releaseip_destructor(struct release_ip_callback_state *state)
 {
        if (state->vnn != NULL) {
                state->vnn->update_in_flight = false;
@@ -887,10 +932,10 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                                bool *async_reply)
 {
        int ret;
-       struct takeover_callback_state *state;
+       struct release_ip_callback_state *state;
        struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
        struct ctdb_vnn *vnn;
-       char *iface;
+       const char *iface;
 
        /* update our vnn list */
        vnn = find_public_ip_vnn(ctdb, &pip->addr);
@@ -899,16 +944,20 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                        ctdb_addr_to_str(&pip->addr)));
                return 0;
        }
-       vnn->pnn = pip->pnn;
 
        /* stop any previous arps */
        talloc_free(vnn->takeover_ctx);
        vnn->takeover_ctx = NULL;
 
-       /* Some ctdb tool commands (e.g. moveip) send
-        * lazy multicast to drop an IP from any node that isn't the
-        * intended new node.  The following causes makes ctdbd ignore
-        * a release for any address it doesn't host.
+       /* RELEASE_IP controls are sent to all nodes that should not
+        * be hosting a particular IP.  This serves 2 purposes.  The
+        * first is to help resolve any inconsistencies.  If a node
+        * does unexpectly host an IP then it will be released.  The
+        * 2nd is to use a "redundant release" to tell non-takeover
+        * nodes where an IP is moving to.  This is how "ctdb ip" can
+        * report the (likely) location of an IP by only asking the
+        * local node.  Redundant releases need to update the PNN but
+        * are otherwise ignored.
         */
        if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
                if (!ctdb_sys_have_ip(&pip->addr)) {
@@ -916,6 +965,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                                ctdb_addr_to_str(&pip->addr),
                                vnn->public_netmask_bits,
                                ctdb_vnn_iface_string(vnn)));
+                       vnn->pnn = pip->pnn;
                        ctdb_vnn_unassign_iface(ctdb, vnn);
                        return 0;
                }
@@ -924,6 +974,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                        DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
                                           ctdb_addr_to_str(&pip->addr),
                                           vnn->public_netmask_bits));
+                       vnn->pnn = pip->pnn;
                        return 0;
                }
        }
@@ -941,7 +992,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                return -1;
        }
 
-       iface = strdup(ctdb_vnn_iface_string(vnn));
+       iface = ctdb_vnn_iface_string(vnn);
 
        DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
                ctdb_addr_to_str(&pip->addr),
@@ -949,24 +1000,23 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                iface,
                pip->pnn));
 
-       state = talloc(ctdb, struct takeover_callback_state);
+       state = talloc(ctdb, struct release_ip_callback_state);
        if (state == NULL) {
                ctdb_set_error(ctdb, "Out of memory at %s:%d",
                               __FILE__, __LINE__);
-               free(iface);
                return -1;
        }
 
-       state->c = talloc_steal(state, c);
-       state->addr = talloc(state, ctdb_sock_addr);       
+       state->c = NULL;
+       state->addr = talloc(state, ctdb_sock_addr);
        if (state->addr == NULL) {
                ctdb_set_error(ctdb, "Out of memory at %s:%d",
                               __FILE__, __LINE__);
-               free(iface);
                talloc_free(state);
                return -1;
        }
        *state->addr = pip->addr;
+       state->target_pnn = pip->pnn;
        state->vnn   = vnn;
 
        vnn->update_in_flight = true;
@@ -979,7 +1029,6 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                                         iface,
                                         ctdb_addr_to_str(&pip->addr),
                                         vnn->public_netmask_bits);
-       free(iface);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
                        ctdb_addr_to_str(&pip->addr),
@@ -990,6 +1039,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
 
        /* tell the control that we will be reply asynchronously */
        *async_reply = true;
+       state->c = talloc_steal(state, c);
        return 0;
 }
 
@@ -1002,39 +1052,67 @@ static int ctdb_add_public_address(struct ctdb_context *ctdb,
        uint32_t num = 0;
        char *tmp;
        const char *iface;
-       int i;
        int ret;
 
-       tmp = strdup(ifaces);
-       for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
-               if (!ctdb_sys_check_iface_exists(iface)) {
-                       DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
-                       free(tmp);
-                       return -1;
-               }
-       }
-       free(tmp);
-
-       /* Verify that we don't have an entry for this ip yet */
-       for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+       /* Verify that we don't have an entry for this IP yet */
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
                if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
-                       DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
-                               ctdb_addr_to_str(addr)));
+                       DEBUG(DEBUG_ERR,
+                             ("Duplicate public IP address '%s'\n",
+                              ctdb_addr_to_str(addr)));
                        return -1;
-               }               
+               }
        }
 
-       /* create a new vnn structure for this ip address */
+       /* Create a new VNN structure for this IP address */
        vnn = talloc_zero(ctdb, struct ctdb_vnn);
-       CTDB_NO_MEMORY_FATAL(ctdb, vnn);
+       if (vnn == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return -1;
+       }
        vnn->ifaces = talloc_array(vnn, const char *, num + 2);
+       if (vnn->ifaces == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               talloc_free(vnn);
+               return -1;
+       }
        tmp = talloc_strdup(vnn, ifaces);
-       CTDB_NO_MEMORY_FATAL(ctdb, tmp);
+       if (tmp == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               talloc_free(vnn);
+               return -1;
+       }
        for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
+               if (!ctdb_sys_check_iface_exists(iface)) {
+                       DEBUG(DEBUG_ERR,
+                             ("Unknown interface %s for public address %s\n",
+                              iface, ctdb_addr_to_str(addr)));
+                       talloc_free(vnn);
+                       return -1;
+               }
+
+               ret = ctdb_add_local_iface(ctdb, iface);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to add interface '%s' "
+                              "for public address %s\n",
+                              iface, ctdb_addr_to_str(addr)));
+                       talloc_free(vnn);
+                       return -1;
+               }
+
                vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
-               CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
+               if (vnn->ifaces == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+                       talloc_free(vnn);
+                       return -1;
+               }
                vnn->ifaces[num] = talloc_strdup(vnn, iface);
-               CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
+               if (vnn->ifaces[num] == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+                       talloc_free(vnn);
+                       return -1;
+               }
                num++;
        }
        talloc_free(tmp);
@@ -1042,23 +1120,6 @@ static int ctdb_add_public_address(struct ctdb_context *ctdb,
        vnn->public_address      = *addr;
        vnn->public_netmask_bits = mask;
        vnn->pnn                 = -1;
-       if (check_address) {
-               if (ctdb_sys_have_ip(addr)) {
-                       DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
-                       vnn->pnn = ctdb->pnn;
-               }
-       }
-
-       for (i=0; vnn->ifaces[i]; i++) {
-               ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
-               if (ret != 0) {
-                       DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
-                                          "for public_address[%s]\n",
-                                          vnn->ifaces[i], ctdb_addr_to_str(addr)));
-                       talloc_free(vnn);
-                       return -1;
-               }
-       }
 
        DLIST_ADD(ctdb->vnn, vnn);
 
@@ -1132,170 +1193,60 @@ int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
        return 0;
 }
 
-static void *add_ip_callback(void *parm, void *data)
-{
-       struct public_ip_list *this_ip = parm;
-       struct public_ip_list *prev_ip = data;
-
-       if (prev_ip == NULL) {
-               return parm;
-       }
-       if (this_ip->pnn == -1) {
-               this_ip->pnn = prev_ip->pnn;
-       }
-
-       return parm;
-}
-
-static int getips_count_callback(void *param, void *data)
-{
-       struct public_ip_list **ip_list = (struct public_ip_list **)param;
-       struct public_ip_list *new_ip = (struct public_ip_list *)data;
-
-       new_ip->next = *ip_list;
-       *ip_list     = new_ip;
-       return 0;
-}
-
-static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
-                                      struct ctdb_public_ip_list *ips,
-                                      uint32_t pnn);
-
-static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
-                                        struct ipalloc_state *ipalloc_state,
-                                        struct ctdb_node_map_old *nodemap)
+static struct ctdb_public_ip_list *
+ctdb_fetch_remote_public_ips(struct ctdb_context *ctdb,
+                            TALLOC_CTX *mem_ctx,
+                            struct ctdb_node_map_old *nodemap,
+                            uint32_t public_ip_flags)
 {
-       int j;
-       int ret;
+       int j, ret;
        struct ctdb_public_ip_list_old *ip_list;
+       struct ctdb_public_ip_list *public_ips;
 
-       if (ipalloc_state->num != nodemap->num) {
-               DEBUG(DEBUG_ERR,
-                     (__location__
-                      " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
-                      ipalloc_state->num, nodemap->num));
-               return -1;
+       public_ips = talloc_zero_array(mem_ctx,
+                                      struct ctdb_public_ip_list,
+                                      nodemap->num);
+       if (public_ips == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return NULL;
        }
 
-       for (j=0; j<nodemap->num; j++) {
+       for (j = 0; j < nodemap->num; j++) {
                if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                        continue;
                }
 
-               /* Retrieve the list of known public IPs from the node */
-               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
-                                       TAKEOVER_TIMEOUT(),
-                                       j,
-                                       ipalloc_state->known_public_ips,
-                                       0,
-                                       &ip_list);
+               /* Retrieve the list of public IPs from the
+                * node. Flags says whether it is known or
+                * available. */
+               ret = ctdb_ctrl_get_public_ips_flags(
+                       ctdb, TAKEOVER_TIMEOUT(), j, public_ips,
+                       public_ip_flags, &ip_list);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR,
-                             ("Failed to read known public IPs from node: %u\n",
-                              j));
-                       return -1;
+                             ("Failed to read public IPs from node: %u\n", j));
+                       talloc_free(public_ips);
+                       return NULL;
                }
-               ipalloc_state->known_public_ips[j].num = ip_list->num;
-               /* This could be copied and freed.  However, ip_list
-                * is allocated off ipalloc_state->known_public_ips,
-                * so this is a safe hack.  This will go away in a
-                * while anyway... */
-               ipalloc_state->known_public_ips[j].ip = &ip_list->ips[0];
-
-               if (ctdb->do_checkpublicip) {
-                       verify_remote_ip_allocation(
-                               ctdb,
-                               &ipalloc_state->known_public_ips[j],
-                               j);
-               }
-
-               /* Retrieve the list of available public IPs from the node */
-               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
-                                       TAKEOVER_TIMEOUT(),
-                                       j,
-                                       ipalloc_state->available_public_ips,
-                                       CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
-                                       &ip_list);
-               if (ret != 0) {
-                       DEBUG(DEBUG_ERR,
-                             ("Failed to read available public IPs from node: %u\n",
-                              j));
-                       return -1;
-               }
-               ipalloc_state->available_public_ips[j].num = ip_list->num;
-               /* This could be copied and freed.  However, ip_list
-                * is allocated off ipalloc_state->available_public_ips,
-                * so this is a safe hack.  This will go away in a
-                * while anyway... */
-               ipalloc_state->available_public_ips[j].ip = &ip_list->ips[0];
-       }
-
-       return 0;
-}
-
-static struct public_ip_list *
-create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
-{
-       int i, j;
-       struct public_ip_list *ip_list;
-       struct ctdb_public_ip_list *public_ips;
-
-       TALLOC_FREE(ctdb->ip_tree);
-       ctdb->ip_tree = trbt_create(ctdb, 0);
-
-       for (i=0; i < ctdb->num_nodes; i++) {
-
-               if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+               public_ips[j].num = ip_list->num;
+               if (ip_list->num == 0) {
+                       talloc_free(ip_list);
                        continue;
                }
-
-               /* there were no public ips for this node */
-               if (ipalloc_state->known_public_ips == NULL) {
-                       continue;
-               }
-
-               public_ips = &ipalloc_state->known_public_ips[i];
-
-               for (j=0; j < public_ips->num; j++) {
-                       struct public_ip_list *tmp_ip;
-
-                       tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
-                       CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
-                       /* Do not use information about IP addresses hosted
-                        * on other nodes, it may not be accurate */
-                       if (public_ips->ip[j].pnn == ctdb->nodes[i]->pnn) {
-                               tmp_ip->pnn = public_ips->ip[j].pnn;
-                       } else {
-                               tmp_ip->pnn = -1;
-                       }
-                       tmp_ip->addr = public_ips->ip[j].addr;
-                       tmp_ip->next = NULL;
-
-                       trbt_insertarray32_callback(ctdb->ip_tree,
-                               IP_KEYLEN, ip_key(&public_ips->ip[j].addr),
-                               add_ip_callback,
-                               tmp_ip);
-               }
-       }
-
-       ip_list = NULL;
-       trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
-
-       return ip_list;
-}
-
-static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
-{
-       int i;
-
-       for (i=0;i<nodemap->num;i++) {
-               if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
-                       /* Found one completely healthy node */
-                       return false;
+               public_ips[j].ip = talloc_zero_array(public_ips,
+                                                    struct ctdb_public_ip,
+                                                    ip_list->num);
+               if (public_ips[j].ip == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+                       talloc_free(public_ips);
+                       return NULL;
                }
+               memcpy(public_ips[j].ip, &ip_list->ips[0],
+                      sizeof(struct ctdb_public_ip) * ip_list->num);
+               talloc_free(ip_list);
        }
 
-       return true;
+       return public_ips;
 }
 
 struct get_tunable_callback_data {
@@ -1410,61 +1361,34 @@ static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
        return tvals;
 }
 
-/* Set internal flags for IP allocation:
- *   Clear ip flags
- *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
- *   Set NOIPHOST ip flag for each INACTIVE node
- *   if all nodes are disabled:
- *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
- *   else
- *     Set NOIPHOST ip flags for disabled nodes
- */
-static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
-                                struct ctdb_node_map_old *nodemap,
-                                uint32_t *tval_noiptakeover,
-                                uint32_t *tval_noiphostonalldisabled)
+static struct ctdb_node_map *
+ctdb_node_map_old_to_new(TALLOC_CTX *mem_ctx,
+                        const struct ctdb_node_map_old *old)
 {
-       int i;
-
-       for (i=0;i<nodemap->num;i++) {
-               /* Can not take IPs on node with NoIPTakeover set */
-               if (tval_noiptakeover[i] != 0) {
-                       ipalloc_state->noiptakeover[i] = true;
-               }
+       struct ctdb_node_map *new;
 
-               /* Can not host IPs on INACTIVE node */
-               if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
-                       ipalloc_state->noiphost[i] = true;
-               }
+       new = talloc(mem_ctx, struct ctdb_node_map);
+       if (new == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return NULL;
        }
+       new->num = old->num;
+       new->node = talloc_zero_array(new,
+                                     struct ctdb_node_and_flags, new->num);
+       memcpy(new->node, &old->nodes[0],
+              sizeof(struct ctdb_node_and_flags) * new->num);
 
-       if (all_nodes_are_disabled(nodemap)) {
-               /* If all nodes are disabled, can not host IPs on node
-                * with NoIPHostOnAllDisabled set
-                */
-               for (i=0;i<nodemap->num;i++) {
-                       if (tval_noiphostonalldisabled[i] != 0) {
-                               ipalloc_state->noiphost[i] = true;
-                       }
-               }
-       } else {
-               /* If some nodes are not disabled, then can not host
-                * IPs on DISABLED node
-                */
-               for (i=0;i<nodemap->num;i++) {
-                       if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
-                               ipalloc_state->noiphost[i] = true;
-                       }
-               }
-       }
+       return new;
 }
 
+
 static bool set_ipflags(struct ctdb_context *ctdb,
                        struct ipalloc_state *ipalloc_state,
                        struct ctdb_node_map_old *nodemap)
 {
        uint32_t *tval_noiptakeover;
        uint32_t *tval_noiphostonalldisabled;
+       struct ctdb_node_map *new;
 
        tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
                                                   "NoIPTakeover", 0);
@@ -1480,85 +1404,64 @@ static bool set_ipflags(struct ctdb_context *ctdb,
                return false;
        }
 
-       set_ipflags_internal(ipalloc_state, nodemap,
+       new = ctdb_node_map_old_to_new(ipalloc_state, nodemap);
+       if (new == NULL) {
+               return false;
+       }
+
+       ipalloc_set_node_flags(ipalloc_state, new,
                             tval_noiptakeover,
                             tval_noiphostonalldisabled);
 
        talloc_free(tval_noiptakeover);
        talloc_free(tval_noiphostonalldisabled);
+       talloc_free(new);
 
        return true;
 }
 
-static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
-                                                TALLOC_CTX *mem_ctx)
+static enum ipalloc_algorithm
+determine_algorithm(const struct ctdb_tunable_list *tunables)
 {
-       struct ipalloc_state *ipalloc_state =
-               talloc_zero(mem_ctx, struct ipalloc_state);
-       if (ipalloc_state == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
-               return NULL;
+       if (1 == tunables->lcp2_public_ip_assignment) {
+               return IPALLOC_LCP2;
+       } else if (1 == tunables->deterministic_public_ips) {
+               return IPALLOC_DETERMINISTIC;
+       } else {
+               return IPALLOC_NONDETERMINISTIC;
        }
+}
 
-       ipalloc_state->num = ctdb->num_nodes;
+struct takeover_callback_data {
+       uint32_t num_nodes;
+       unsigned int *fail_count;
+};
 
-       ipalloc_state->known_public_ips =
-               talloc_zero_array(ipalloc_state,
-                                 struct ctdb_public_ip_list,
-                                 ipalloc_state->num);
-       if (ipalloc_state->known_public_ips == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
-               goto fail;
-       }
+static struct takeover_callback_data *
+takeover_callback_data_init(TALLOC_CTX *mem_ctx,
+                           uint32_t num_nodes)
+{
+       static struct takeover_callback_data *takeover_data;
 
-       ipalloc_state->available_public_ips =
-               talloc_zero_array(ipalloc_state,
-                                 struct ctdb_public_ip_list,
-                                 ipalloc_state->num);
-       if (ipalloc_state->available_public_ips == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
-               goto fail;
-       }
-       ipalloc_state->noiptakeover =
-               talloc_zero_array(ipalloc_state,
-                                 bool,
-                                 ipalloc_state->num);
-       if (ipalloc_state->noiptakeover == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
-               goto fail;
-       }
-       ipalloc_state->noiphost =
-               talloc_zero_array(ipalloc_state,
-                                 bool,
-                                 ipalloc_state->num);
-       if (ipalloc_state->noiphost == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
-               goto fail;
+       takeover_data = talloc_zero(mem_ctx, struct takeover_callback_data);
+       if (takeover_data == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return NULL;
        }
 
-       if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
-               ipalloc_state->algorithm = IPALLOC_LCP2;
-       } else if (1 == ctdb->tunable.deterministic_public_ips) {
-               ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
-       } else {
-               ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
+       takeover_data->fail_count = talloc_zero_array(takeover_data,
+                                                     unsigned int, num_nodes);
+       if (takeover_data->fail_count == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               talloc_free(takeover_data);
+               return NULL;
        }
 
-       ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
+       takeover_data->num_nodes = num_nodes;
 
-       return ipalloc_state;
-fail:
-       talloc_free(ipalloc_state);
-       return NULL;
+       return takeover_data;
 }
 
-struct takeover_callback_data {
-       bool *node_failed;
-       client_async_callback fail_callback;
-       void *fail_callback_data;
-       struct ctdb_node_map_old *nodemap;
-};
-
 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
                                       uint32_t node_pnn, int32_t res,
                                       TDB_DATA outdata, void *callback_data)
@@ -1567,15 +1470,52 @@ static void takeover_run_fail_callback(struct ctdb_context *ctdb,
                talloc_get_type_abort(callback_data,
                                      struct takeover_callback_data);
 
-       if (node_pnn >= cd->nodemap->num) {
+       if (node_pnn >= cd->num_nodes) {
                DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
                return;
        }
 
-       if (!cd->node_failed[node_pnn]) {
-               cd->node_failed[node_pnn] = true;
-               cd->fail_callback(ctdb, node_pnn, res, outdata,
-                                 cd->fail_callback_data);
+       if (cd->fail_count[node_pnn] == 0) {
+               DEBUG(DEBUG_ERR,
+                     ("Node %u failed the takeover run\n", node_pnn));
+       }
+
+       cd->fail_count[node_pnn]++;
+}
+
+static void takeover_run_process_failures(struct ctdb_context *ctdb,
+                                         struct takeover_callback_data *tcd)
+{
+       unsigned int max_fails = 0;
+       uint32_t max_pnn = -1;
+       uint32_t i;
+
+       for (i = 0; i < tcd->num_nodes; i++) {
+               if (tcd->fail_count[i] > max_fails) {
+                       max_pnn = i;
+                       max_fails = tcd->fail_count[i];
+               }
+       }
+
+       if (max_fails > 0) {
+               int ret;
+               TDB_DATA data;
+
+               DEBUG(DEBUG_ERR,
+                     ("Sending banning credits to %u with fail count %u\n",
+                      max_pnn, max_fails));
+
+               data.dptr = (uint8_t *)&max_pnn;
+               data.dsize = sizeof(uint32_t);
+               ret = ctdb_client_send_message(ctdb,
+                                              CTDB_BROADCAST_CONNECTED,
+                                              CTDB_SRVID_BANNING,
+                                              data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to set banning credits for node %u\n",
+                              max_pnn));
+               }
        }
 }
 
@@ -1583,33 +1523,30 @@ static void takeover_run_fail_callback(struct ctdb_context *ctdb,
  * Recalculate the allocation of public IPs to nodes and have the
  * nodes host their allocated addresses.
  *
- * - Allocate memory for IP allocation state, including per node
- *   arrays
- * - Populate IP allocation algorithm in IP allocation state
- * - Populate local value of tunable NoIPFailback in IP allocation
-     state - this is really a cluster-wide configuration variable and
-     only the value form the master node is used
- * - Retrieve tunables NoIPTakeover and NoIPHostOnAllDisabled from all
- *   connected nodes - this is done separately so tunable values can
- *   be faked in unit testing
- * - Populate NoIPTakover tunable in IP allocation state
- * - Populate NoIPHost in IP allocation state, derived from node flags
- *   and NoIPHostOnAllDisabled tunable
- * - Retrieve and populate known and available IP lists in IP
- *   allocation state
- * - If no available IP addresses then early exit
- * - Build list of (known IPs, currently assigned node)
- * - Populate list of nodes to force rebalance - internal structure,
- *   currently no way to fetch, only used by LCP2 for nodes that have
- *   had new IP addresses added
+ * - Initialise IP allocation state.  Pass:
+     + algorithm to be used;
+     + whether IP rebalancing ("failback") should be done (this uses a
+       cluster-wide configuration variable and only the value form the
+       master node is used); and
+ *   + list of nodes to force rebalance (internal structure, currently
+ *     no way to fetch, only used by LCP2 for nodes that have had new
+ *     IP addresses added).
+ * - Set IP flags for IP allocation based on node map and tunables
+ *   NoIPTakeover/NoIPHostOnAllDisabled from all connected nodes
+ *   (tunable fetching done separately so values can be faked in unit
+ *   testing)
+ * - Retrieve known and available IP addresses (done separately so
+ *   values can be faked in unit testing)
+ * - Use ipalloc_set_public_ips() to set known and available IP
+     addresses for allocation
+ * - If cluster can't host IP addresses then early exit
  * - Run IP allocation algorithm
  * - Send RELEASE_IP to all nodes for IPs they should not host
  * - Send TAKE_IP to all nodes for IPs they should host
  * - Send IPREALLOCATED to all nodes (with backward compatibility hack)
  */
 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
-                     uint32_t *force_rebalance_nodes,
-                     client_async_callback fail_callback, void *callback_data)
+                     uint32_t *force_rebalance_nodes)
 {
        int i, ret;
        struct ctdb_public_ip ip;
@@ -1621,8 +1558,23 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
        struct ctdb_client_control_state *state;
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        struct ipalloc_state *ipalloc_state;
+       struct ctdb_public_ip_list *known_ips, *available_ips;
        struct takeover_callback_data *takeover_data;
-       bool can_host_ips;
+
+       /* Initialise fail callback data to be used with
+        * takeover_run_fail_callback().  A failure in any of the
+        * following steps will cause an early return, so this can be
+        * reused for each of those steps without re-initialising. */
+       takeover_data = takeover_callback_data_init(tmp_ctx,
+                                                   nodemap->num);
+       if (takeover_data == NULL) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* Default timeout for early jump to IPREALLOCATED.  See below
+        * for explanation of 3 times... */
+       timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);
 
        /*
         * ip failover is completely disabled, just send out the 
@@ -1632,67 +1584,62 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                goto ipreallocated;
        }
 
-       ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
+       ipalloc_state = ipalloc_state_init(tmp_ctx, ctdb->num_nodes,
+                                          determine_algorithm(&ctdb->tunable),
+                                          (ctdb->tunable.no_ip_failback != 0),
+                                          force_rebalance_nodes);
        if (ipalloc_state == NULL) {
                talloc_free(tmp_ctx);
                return -1;
        }
 
        if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
-               DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
+               DEBUG(DEBUG_ERR,
+                     ("Failed to set IP flags - aborting takeover run\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
 
        /* Fetch known/available public IPs from each active node */
-       ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
-       if (ret != 0) {
+       /* Fetch lists of known public IPs from all nodes */
+       known_ips = ctdb_fetch_remote_public_ips(ctdb, ipalloc_state,
+                                                nodemap, 0);
+       if (known_ips == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to read known public IPs\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       available_ips = ctdb_fetch_remote_public_ips(
+               ctdb, ipalloc_state, nodemap,
+               CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE);
+       if (available_ips == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to read available public IPs\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
 
-       /* Short-circuit IP allocation if no node has available IPs */
-       can_host_ips = false;
-       for (i=0; i < ipalloc_state->num; i++) {
-               if (ipalloc_state->available_public_ips[i].num != 0) {
-                       can_host_ips = true;
-               }
+       if (! ipalloc_set_public_ips(ipalloc_state, known_ips, available_ips)) {
+               DEBUG(DEBUG_ERR, ("Failed to set public IPs\n"));
+               talloc_free(tmp_ctx);
+               return -1;
        }
-       if (!can_host_ips) {
+
+       if (! ipalloc_can_host_ips(ipalloc_state)) {
                DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
                goto ipreallocated;
        }
 
-       /* since nodes only know about those public addresses that
-          can be served by that particular node, no single node has
-          a full list of all public addresses that exist in the cluster.
-          Walk over all node structures and create a merged list of
-          all public addresses that exist in the cluster.
-
-          keep the tree of ips around as ctdb->ip_tree
-       */
-       all_ips = create_merged_ip_list(ctdb, ipalloc_state);
-       ipalloc_state->all_ips = all_ips;
-
-       ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
-
        /* Do the IP reassignment calculations */
-       ipalloc(ipalloc_state);
+       all_ips = ipalloc(ipalloc_state);
+       if (all_ips == NULL) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
 
        /* Now tell all nodes to release any public IPs should not
         * host.  This will be a NOOP on nodes that don't currently
         * hold the given IP.
         */
-       takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
-       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
-
-       takeover_data->node_failed = talloc_zero_array(tmp_ctx,
-                                                      bool, nodemap->num);
-       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
-       takeover_data->fail_callback = fail_callback;
-       takeover_data->fail_callback_data = callback_data;
-       takeover_data->nodemap = nodemap;
-
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
 
@@ -1701,6 +1648,20 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
 
        ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
 
+       /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
+        * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
+        * seconds.  However, RELEASE_IP can take longer due to TCP
+        * connection killing, so sometimes needs more time.
+        * Therefore, use a cumulative timeout of TakeoverTimeout * 3
+        * seconds across all 3 stages.  No explicit expiry checks are
+        * needed before each stage because tevent is smart enough to
+        * fire the timeouts even if they are in the past.  Initialise
+        * this here so it explicitly covers the stages we're
+        * interested in but, in particular, not the time taken by the
+        * ipalloc().
+        */
+       timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);
+
        /* Send a RELEASE_IP to all nodes that should not be hosting
         * each IP.  For each IP, all but one of these will be
         * redundant.  However, the redundant ones are used to tell
@@ -1723,7 +1684,6 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                        ip.pnn  = tmp_ip->pnn;
                        ip.addr = tmp_ip->addr;
 
-                       timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ip);
                        data.dptr  = (uint8_t *)&ip;
                        state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
@@ -1740,9 +1700,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                }
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
-               talloc_free(tmp_ctx);
-               return -1;
+               DEBUG(DEBUG_ERR,
+                     ("Async control CTDB_CONTROL_RELEASE_IP failed\n"));
+               goto fail;
        }
        talloc_free(async_data);
 
@@ -1754,8 +1714,8 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
 
-       async_data->fail_callback = fail_callback;
-       async_data->callback_data = callback_data;
+       async_data->fail_callback = takeover_run_fail_callback;
+       async_data->callback_data = takeover_data;
 
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
@@ -1766,7 +1726,6 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                ip.pnn  = tmp_ip->pnn;
                ip.addr = tmp_ip->addr;
 
-               timeout = TAKEOVER_TIMEOUT();
                data.dsize = sizeof(ip);
                data.dptr  = (uint8_t *)&ip;
                state = ctdb_control_send(ctdb, tmp_ip->pnn,
@@ -1781,9 +1740,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodem
                ctdb_client_async_add(async_data, state);
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
-               talloc_free(tmp_ctx);
-               return -1;
+               DEBUG(DEBUG_ERR,
+                     ("Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
+               goto fail;
        }
 
 ipreallocated:
@@ -1796,17 +1755,23 @@ ipreallocated:
         */
        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
        ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
-                                       nodes, 0, TAKEOVER_TIMEOUT(),
+                                       nodes, 0, timeout,
                                        false, tdb_null,
-                                       NULL, fail_callback,
-                                       &callback_data);
+                                       NULL, takeover_run_fail_callback,
+                                       takeover_data);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,
                      ("Async CTDB_CONTROL_IPREALLOCATED control failed\n"));
+               goto fail;
        }
 
        talloc_free(tmp_ctx);
        return ret;
+
+fail:
+       takeover_run_process_failures(ctdb, takeover_data);
+       talloc_free(tmp_ctx);
+       return -1;
 }
 
 
@@ -2186,22 +2151,21 @@ void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
 
 void ctdb_release_all_ips(struct ctdb_context *ctdb)
 {
-       struct ctdb_vnn *vnn;
+       struct ctdb_vnn *vnn, *next;
        int count = 0;
-       TDB_DATA data;
 
        if (ctdb->tunable.disable_ip_failover == 1) {
                return;
        }
 
-       for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = next) {
+               /* vnn can be freed below in release_ip_post() */
+               next = vnn->next;
+
                if (!ctdb_sys_have_ip(&vnn->public_address)) {
                        ctdb_vnn_unassign_iface(ctdb, vnn);
                        continue;
                }
-               if (!vnn->iface) {
-                       continue;
-               }
 
                /* Don't allow multiple releases at once.  Some code,
                 * particularly ctdb_tickle_sentenced_connections() is
@@ -2223,21 +2187,26 @@ void ctdb_release_all_ips(struct ctdb_context *ctdb)
                                    ctdb_vnn_iface_string(vnn)));
 
                ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
-                                 ctdb_vnn_iface_string(vnn),
-                                 ctdb_addr_to_str(&vnn->public_address),
-                                 vnn->public_netmask_bits);
-
-               data.dptr = (uint8_t *)talloc_strdup(
-                               vnn, ctdb_addr_to_str(&vnn->public_address));
-               if (data.dptr != NULL) {
-                       data.dsize = strlen((char *)data.dptr) + 1;
-                       ctdb_daemon_send_message(ctdb, ctdb->pnn,
-                                                CTDB_SRVID_RELEASE_IP, data);
-                       talloc_free(data.dptr);
+                                      ctdb_vnn_iface_string(vnn),
+                                      ctdb_addr_to_str(&vnn->public_address),
+                                      vnn->public_netmask_bits);
+               /* releaseip timeouts are converted to success, so to
+                * detect failures just check if the IP address is
+                * still there...
+                */
+               if (ctdb_sys_have_ip(&vnn->public_address)) {
+                       DEBUG(DEBUG_ERR,
+                             (__location__
+                              " IP address %s not released\n",
+                              ctdb_addr_to_str(&vnn->public_address)));
+                       vnn->update_in_flight = false;
+                       continue;
                }
 
-               ctdb_vnn_unassign_iface(ctdb, vnn);
-               vnn->update_in_flight = false;
+               vnn = release_ip_post(ctdb, vnn, &vnn->public_address);
+               if (vnn != NULL) {
+                       vnn->update_in_flight = false;
+               }
                count++;
        }
 
@@ -2437,7 +2406,7 @@ int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
                return 0;
        }
 
-       DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
+       DEBUG(DEBUG_ERR,
              ("iface[%s] has changed it's link status %s => %s\n",
               iface->name,
               iface->link_up?"up":"down",
@@ -2791,29 +2760,7 @@ int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA inda
        return 0;
 }
 
-struct delete_ip_callback_state {
-       struct ctdb_req_control_old *c;
-};
-
-/*
-  called when releaseip event finishes for del_public_address
- */
-static void delete_ip_callback(struct ctdb_context *ctdb,
-                              int32_t status, TDB_DATA data,
-                              const char *errormsg,
-                              void *private_data)
-{
-       struct delete_ip_callback_state *state =
-               talloc_get_type(private_data, struct delete_ip_callback_state);
-
-       /* If release failed then fail. */
-       ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
-       talloc_free(private_data);
-}
-
-int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
-                                       struct ctdb_req_control_old *c,
-                                       TDB_DATA indata, bool *async_reply)
+int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
 {
        struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
        struct ctdb_vnn *vnn;
@@ -2840,49 +2787,13 @@ int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
        for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
                if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
                        if (vnn->pnn == ctdb->pnn) {
-                               struct delete_ip_callback_state *state;
-                               struct ctdb_public_ip *ip;
-                               TDB_DATA data;
-                               int ret;
-
+                               /* This IP is currently being hosted.
+                                * Defer the deletion until the next
+                                * takeover run. "ctdb reloadips" will
+                                * always cause a takeover run.  "ctdb
+                                * delip" will now need an explicit
+                                * "ctdb ipreallocated" afterwards. */
                                vnn->delete_pending = true;
-
-                               state = talloc(ctdb,
-                                              struct delete_ip_callback_state);
-                               CTDB_NO_MEMORY(ctdb, state);
-                               state->c = c;
-
-                               ip = talloc(state, struct ctdb_public_ip);
-                               if (ip == NULL) {
-                                       DEBUG(DEBUG_ERR,
-                                             (__location__ " Out of memory\n"));
-                                       talloc_free(state);
-                                       return -1;
-                               }
-                               ip->pnn = -1;
-                               ip->addr = pub->addr;
-
-                               data.dsize = sizeof(struct ctdb_public_ip);
-                               data.dptr = (unsigned char *)ip;
-
-                               ret = ctdb_daemon_send_control(ctdb,
-                                                              ctdb_get_pnn(ctdb),
-                                                              0,
-                                                              CTDB_CONTROL_RELEASE_IP,
-                                                              0, 0,
-                                                              data,
-                                                              delete_ip_callback,
-                                                              state);
-                               if (ret == -1) {
-                                       DEBUG(DEBUG_ERR,
-                                             (__location__ "Unable to send "
-                                              "CTDB_CONTROL_RELEASE_IP\n"));
-                                       talloc_free(state);
-                                       return -1;
-                               }
-
-                               state->c = talloc_steal(state, c);
-                               *async_reply = true;
                        } else {
                                /* This IP is not hosted on the
                                 * current node so just delete it
@@ -2955,82 +2866,6 @@ int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
 }
 
 
-/* This function is called from the recovery daemon to verify that a remote
-   node has the expected ip allocation.
-   This is verified against ctdb->ip_tree
-*/
-static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
-                                      struct ctdb_public_ip_list *ips,
-                                      uint32_t pnn)
-{
-       struct public_ip_list *tmp_ip;
-       int i;
-
-       if (ctdb->ip_tree == NULL) {
-               /* don't know the expected allocation yet, assume remote node
-                  is correct. */
-               return 0;
-       }
-
-       if (ips == NULL) {
-               return 0;
-       }
-
-       for (i=0; i<ips->num; i++) {
-               tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ip[i].addr));
-               if (tmp_ip == NULL) {
-                       DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ip[i].addr)));
-                       return -1;
-               }
-
-               if (tmp_ip->pnn == -1 || ips->ip[i].pnn == -1) {
-                       continue;
-               }
-
-               if (tmp_ip->pnn != ips->ip[i].pnn) {
-                       DEBUG(DEBUG_ERR,
-                             ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
-                              pnn,
-                              ctdb_addr_to_str(&ips->ip[i].addr),
-                              ips->ip[i].pnn, tmp_ip->pnn));
-                       return -1;
-               }
-       }
-
-       return 0;
-}
-
-int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
-{
-       struct public_ip_list *tmp_ip;
-
-       /* IP tree is never built if DisableIPFailover is set */
-       if (ctdb->tunable.disable_ip_failover != 0) {
-               return 0;
-       }
-
-       if (ctdb->ip_tree == NULL) {
-               DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
-               return -1;
-       }
-
-       tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
-       if (tmp_ip == NULL) {
-               DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
-               return -1;
-       }
-
-       DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
-       tmp_ip->pnn = ip->pnn;
-
-       return 0;
-}
-
-void clear_ip_assignment_tree(struct ctdb_context *ctdb)
-{
-       TALLOC_FREE(ctdb->ip_tree);
-}
-
 struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;
        struct ctdb_req_control_old *c;