recoverd: Do not send "ipreallocated" event to stopped nodes
[ctdb.git] / server / ctdb_takeover.c
index 9a18377723e824543aea958eae3b9f9488016133..1e8dc757c5dafc0a26f4a59bc51a5ec9f862dff4 100644 (file)
@@ -3,6 +3,7 @@
 
    Copyright (C) Ronnie Sahlberg  2007
    Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -18,7 +19,7 @@
    along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 #include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
 #include "lib/tdb/include/tdb.h"
 #include "lib/util/dlinklist.h"
 #include "system/network.h"
@@ -65,7 +66,7 @@ static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
        CTDB_NO_MEMORY_FATAL(ctdb, i);
        i->name = talloc_strdup(i, iface);
        CTDB_NO_MEMORY(ctdb, i->name);
-       i->link_up = true;
+       i->link_up = false;
 
        DLIST_ADD(ctdb->ifaces, i);
 
@@ -169,6 +170,31 @@ static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
        }
 }
 
+/* Report whether this public address has at least one interface with
+ * link up: either the assigned one or any configured candidate. */
+static bool ctdb_vnn_available(struct ctdb_context *ctdb,
+                              struct ctdb_vnn *vnn)
+{
+       int idx = 0;
+
+       /* Fast path: the currently assigned interface is usable. */
+       if (vnn->iface != NULL && vnn->iface->link_up) {
+               return true;
+       }
+
+       /* Otherwise scan the NULL-terminated candidate name list. */
+       while (vnn->ifaces[idx] != NULL) {
+               struct ctdb_iface *cand = ctdb_find_iface(ctdb, vnn->ifaces[idx]);
+
+               idx++;
+               if (cand != NULL && cand->link_up) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
 struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        uint32_t count;
@@ -248,73 +274,274 @@ static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *
                        ctdb_control_send_arp, arp);
 }
 
+/* Schedule ctdb_control_send_arp for vnn; 0 on success, -1 if talloc fails. */
+static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
+                                      struct ctdb_vnn *vnn)
+{
+       struct ctdb_takeover_arp *arp;
+       struct ctdb_tcp_array *tcparray;
+
+       if (!vnn->takeover_ctx) {
+               vnn->takeover_ctx = talloc_new(vnn);
+               if (!vnn->takeover_ctx) {
+                       return -1;
+               }
+       }
+       arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
+       if (!arp) {
+               return -1;
+       }
+
+       arp->ctdb = ctdb;
+       arp->addr = vnn->public_address;
+       arp->vnn  = vnn;
+
+       tcparray = vnn->tcp_array;
+       if (tcparray) {
+               /* add all of the known tcp connections for this IP to the
+                  list of tcp connections to send tickle acks for */
+               arp->tcparray = talloc_steal(arp, tcparray);
+
+               vnn->tcp_array = NULL;
+               vnn->tcp_update_needed = true;
+       }
+       /* zero timeout; NOTE(review): event_add_timed's result is not checked */
+       event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
+                       timeval_zero(), ctdb_control_send_arp, arp);
+
+       return 0;
+}
+
 struct takeover_callback_state {
        struct ctdb_req_control *c;
        ctdb_sock_addr *addr;
        struct ctdb_vnn *vnn;
 };
 
+struct ctdb_do_takeip_state {
+       struct ctdb_req_control *c;     /* control answered by ctdb_do_takeip_callback */
+       struct ctdb_vnn *vnn;           /* public address being taken over */
+};
+
 /*
   called when takeip event finishes
  */
-static void takeover_ip_callback(struct ctdb_context *ctdb, int status, 
-                                void *private_data)
+static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
+                                   void *private_data)
 {
-       struct takeover_callback_state *state = 
-               talloc_get_type(private_data, struct takeover_callback_state);
-       struct ctdb_takeover_arp *arp;
-       struct ctdb_tcp_array *tcparray;
+       struct ctdb_do_takeip_state *state =
+               talloc_get_type(private_data, struct ctdb_do_takeip_state);
+       int32_t ret;
+       TDB_DATA data;
 
        if (status != 0) {
+               struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+               /* node is this daemon's own entry; it is marked unhealthy below */
                if (status == -ETIME) {
                        ctdb_ban_self(ctdb);
                }
                DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
-                       ctdb_addr_to_str(state->addr),
-                       ctdb_vnn_iface_string(state->vnn)));
+                                ctdb_addr_to_str(&state->vnn->public_address),
+                                ctdb_vnn_iface_string(state->vnn)));
                ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+               /* the error reply above is sent before the flag changes */
+               node->flags |= NODE_FLAGS_UNHEALTHY;
                talloc_free(state);
                return;
        }
 
-       if (!state->vnn->takeover_ctx) {
-               state->vnn->takeover_ctx = talloc_new(state->vnn);
-               if (!state->vnn->takeover_ctx) {
-                       goto failed;
-               }
+       ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
+       if (ret != 0) {
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               talloc_free(state);
+               return;
        }
 
-       arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
-       if (!arp) goto failed;
-       
-       arp->ctdb = ctdb;
-       arp->addr = *state->addr;
-       arp->vnn  = state->vnn;
-
-       tcparray = state->vnn->tcp_array;
-       if (tcparray) {
-               /* add all of the known tcp connections for this IP to the
-                  list of tcp connections to send tickle acks for */
-               arp->tcparray = talloc_steal(arp, tcparray);
+       data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
+       data.dsize = strlen((char *)data.dptr) + 1;
+       DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
 
-               state->vnn->tcp_array = NULL;
-               state->vnn->tcp_update_needed = true;
-       }
+       ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
 
-       event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx, 
-                       timeval_zero(), ctdb_control_send_arp, arp);
 
        /* the control succeeded */
        ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
        talloc_free(state);
        return;
+}
 
-failed:
-       ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+/*
+  take over an ip address: assign an interface, then run CTDB_EVENT_TAKE_IP.
+  Returns 0 when ctdb_event_script_callback() succeeds (ctdb_do_takeip_callback
+  then replies to c), -1 on any failure.
+ */
+static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
+                             struct ctdb_req_control *c,
+                             struct ctdb_vnn *vnn)
+{
+       int ret;
+       struct ctdb_do_takeip_state *state;
+
+       ret = ctdb_vnn_assign_iface(ctdb, vnn);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
+                                "assign a usable interface\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                vnn->public_netmask_bits));
+               return -1;
+       }
+
+       state = talloc(vnn, struct ctdb_do_takeip_state);
+       CTDB_NO_MEMORY(ctdb, state);
+       /* re-parent the request onto ctdb; the callback replies through it */
+       state->c = talloc_steal(ctdb, c);
+       state->vnn   = vnn;
+       DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
+                           ctdb_addr_to_str(&vnn->public_address),
+                           vnn->public_netmask_bits,
+                           ctdb_vnn_iface_string(vnn)));
+
+       ret = ctdb_event_script_callback(ctdb,
+                                        state,
+                                        ctdb_do_takeip_callback,
+                                        state,
+                                        false,
+                                        CTDB_EVENT_TAKE_IP,
+                                        "%s %s %u",
+                                        ctdb_vnn_iface_string(vnn),
+                                        ctdb_addr_to_str(&vnn->public_address),
+                                        vnn->public_netmask_bits);
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+                       ctdb_addr_to_str(&vnn->public_address),
+                       ctdb_vnn_iface_string(vnn)));
+               talloc_free(state);
+               return -1;
+       }
+
+       return 0;
+}
+
+struct ctdb_do_updateip_state {
+       struct ctdb_req_control *c;     /* control answered by ctdb_do_updateip_callback */
+       struct ctdb_iface *old;         /* interface held before the move; restored on failure */
+       struct ctdb_vnn *vnn;           /* public address being moved */
+};
+
+/*
+  called when updateip event finishes; every path replies to state->c and frees state
+ */
+static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
+                                     void *private_data)
+{
+       struct ctdb_do_updateip_state *state =
+               talloc_get_type(private_data, struct ctdb_do_updateip_state);
+       int32_t ret;
+
+       if (status != 0) {
+               if (status == -ETIME) {
+                       ctdb_ban_self(ctdb);
+               }
+               DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
+                       ctdb_addr_to_str(&state->vnn->public_address),
+                       state->old->name,
+                       ctdb_vnn_iface_string(state->vnn)));
+
+               /*
+                * All we can do is reset the old interface
+                * and let the next run fix it
+                */
+               ctdb_vnn_unassign_iface(ctdb, state->vnn);
+               state->vnn->iface = state->old;
+               state->vnn->iface->references++;
+
+               ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+               talloc_free(state);
+               return;
+       }
+       /* event succeeded: schedule the ARP announcement for the address */
+       ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
+       if (ret != 0) {
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               talloc_free(state);
+               return;
+       }
+
+       /* the control succeeded */
+       ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
        talloc_free(state);
        return;
 }
 
+/*
+  update (move) an ip address: re-pick the interface and, unless it is the
+  same one by name, run CTDB_EVENT_UPDATE_IP.  Returns 0 on success, -1 on error.
+ */
+static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               struct ctdb_vnn *vnn)
+{
+       int ret;
+       struct ctdb_do_updateip_state *state;
+       struct ctdb_iface *old = vnn->iface;
+       char *new_name;
+       /* NOTE(review): old is dereferenced below, so vnn->iface must be set */
+       ctdb_vnn_unassign_iface(ctdb, vnn);
+       ret = ctdb_vnn_assign_iface(ctdb, vnn);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
+                                "assign a usable interface (old iface '%s')\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                vnn->public_netmask_bits,
+                                old->name));
+               return -1;
+       }
+
+       new_name = ctdb_vnn_iface_string(vnn);
+       if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
+               /* A benign update from one interface onto itself.
+                * no need to run the eventscripts in this case, just return
+                * success.
+                */
+               ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
+               return 0;
+       }
+
+       state = talloc(vnn, struct ctdb_do_updateip_state);
+       CTDB_NO_MEMORY(ctdb, state);
+       state->c = talloc_steal(ctdb, c);
+       state->old = old;
+       state->vnn = vnn;
+
+       DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
+                           "interface %s to %s\n",
+                           ctdb_addr_to_str(&vnn->public_address),
+                           vnn->public_netmask_bits,
+                           old->name,
+                           new_name));
+
+       ret = ctdb_event_script_callback(ctdb,
+                                        state,
+                                        ctdb_do_updateip_callback,
+                                        state,
+                                        false,
+                                        CTDB_EVENT_UPDATE_IP,
+                                        "%s %s %s %u",
+                                        state->old->name,
+                                        new_name,
+                                        ctdb_addr_to_str(&vnn->public_address),
+                                        vnn->public_netmask_bits);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                old->name, new_name));
+               talloc_free(state);
+               return -1;
+       }
+
+       return 0;
+}
+
 /*
   Find the vnn of the node that has a public ip address
   returns -1 if the address is not known as a public address
@@ -335,69 +562,117 @@ static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_
 /*
   take over an ip address
  */
-int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
                                 struct ctdb_req_control *c,
-                                TDB_DATA indata, 
+                                TDB_DATA indata,
                                 bool *async_reply)
 {
        int ret;
-       struct takeover_callback_state *state;
        struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
        struct ctdb_vnn *vnn;
+       bool have_ip = false;
+       bool do_updateip = false;
+       bool do_takeip = false;
+       struct ctdb_iface *best_iface = NULL;
+
+       if (pip->pnn != ctdb->pnn) {
+               DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
+                                "with pnn %d, but we're node %d\n",
+                                ctdb_addr_to_str(&pip->addr),
+                                pip->pnn, ctdb->pnn));
+               return -1;
+       }
 
        /* update out vnn list */
        vnn = find_public_ip_vnn(ctdb, &pip->addr);
        if (vnn == NULL) {
-               DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n", 
+               DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
                        ctdb_addr_to_str(&pip->addr)));
                return 0;
        }
-       vnn->pnn = pip->pnn;
 
-       /* if our kernel already has this IP, do nothing */
-       if (ctdb_sys_have_ip(&pip->addr)) {
-               return 0;
+       have_ip = ctdb_sys_have_ip(&pip->addr);
+       best_iface = ctdb_vnn_best_iface(ctdb, vnn);
+       if (best_iface == NULL) {
+               DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
+                                "a usable interface (old %s, have_ip %d)\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                vnn->public_netmask_bits,
+                                ctdb_vnn_iface_string(vnn),
+                                have_ip));
+               return -1;
        }
 
-       ret = ctdb_vnn_assign_iface(ctdb, vnn);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
-                                "assin a usable interface\n",
-                                ctdb_addr_to_str(&pip->addr),
-                                vnn->public_netmask_bits));
-               return -1;
+       if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
+               DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
+               have_ip = false;
        }
 
-       state = talloc(vnn, struct takeover_callback_state);
-       CTDB_NO_MEMORY(ctdb, state);
+       if (vnn->iface == NULL && have_ip) {
+               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+                                 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
+                                ctdb_addr_to_str(&vnn->public_address)));
+               return 0;
+       }
 
-       state->c = talloc_steal(ctdb, c);
-       state->addr = talloc(ctdb, ctdb_sock_addr);
-       CTDB_NO_MEMORY(ctdb, state->addr);
+       if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
+               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+                                 "and we have it on iface[%s], but it was assigned to node %d"
+                                 "and we are node %d, banning ourself\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
+               ctdb_ban_self(ctdb);
+               return -1;
+       }
 
-       *state->addr = pip->addr;
-       state->vnn   = vnn;
+       if (vnn->pnn == -1 && have_ip) {
+               vnn->pnn = ctdb->pnn;
+               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+                                 "and we already have it on iface[%s], update local daemon\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                 ctdb_vnn_iface_string(vnn)));
+               return 0;
+       }
 
-       DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n", 
-               ctdb_addr_to_str(&pip->addr),
-               vnn->public_netmask_bits, 
-               ctdb_vnn_iface_string(vnn)));
+       if (vnn->iface) {
+               if (vnn->iface->link_up) {
+                       /* only move when the rebalance gains something */
+                       if (vnn->iface->references > (best_iface->references + 1)) {
+                               do_updateip = true;
+                       }
+               } else if (vnn->iface != best_iface) {
+                       do_updateip = true;
+               }
+       }
 
-       ret = ctdb_event_script_callback(ctdb, 
-                                        state, takeover_ip_callback, state,
-                                        false,
-                                        CTDB_EVENT_TAKE_IP,
-                                        "%s %s %u",
-                                        ctdb_vnn_iface_string(vnn),
-                                        ctdb_addr_to_str(&pip->addr),
-                                        vnn->public_netmask_bits);
+       if (!have_ip) {
+               if (do_updateip) {
+                       ctdb_vnn_unassign_iface(ctdb, vnn);
+                       do_updateip = false;
+               }
+               do_takeip = true;
+       }
 
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+       if (do_takeip) {
+               ret = ctdb_do_takeip(ctdb, c, vnn);
+               if (ret != 0) {
+                       return -1;
+               }
+       } else if (do_updateip) {
+               ret = ctdb_do_updateip(ctdb, c, vnn);
+               if (ret != 0) {
+                       return -1;
+               }
+       } else {
+               /*
+                * The interface is up and the kernel knows the ip
+                * => do nothing
+                */
+               DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
                        ctdb_addr_to_str(&pip->addr),
+                       vnn->public_netmask_bits,
                        ctdb_vnn_iface_string(vnn)));
-               talloc_free(state);
-               return -1;
+               return 0;
        }
 
        /* tell ctdb_control.c that we will be replying asynchronously */
@@ -524,7 +799,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
        vnn->takeover_ctx = NULL;
 
        if (!ctdb_sys_have_ip(&pip->addr)) {
-               DEBUG(DEBUG_NOTICE,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
+               DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
                        ctdb_addr_to_str(&pip->addr),
                        vnn->public_netmask_bits, 
                        ctdb_vnn_iface_string(vnn)));
@@ -532,7 +807,14 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                return 0;
        }
 
-       DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%u\n", 
+       if (vnn->iface == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
+                                "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
+                                ctdb_addr_to_str(&vnn->public_address)));
+               return 0;
+       }
+
+       DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
                ctdb_addr_to_str(&pip->addr),
                vnn->public_netmask_bits, 
                ctdb_vnn_iface_string(vnn),
@@ -625,6 +907,10 @@ static int ctdb_add_public_address(struct ctdb_context *ctdb,
        vnn->public_address      = *addr;
        vnn->public_netmask_bits = mask;
        vnn->pnn                 = -1;
+       if (ctdb_sys_have_ip(addr)) {
+               DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
+               vnn->pnn = ctdb->pnn;
+       }
 
        for (i=0; vnn->ifaces[i]; i++) {
                ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
@@ -635,6 +921,9 @@ static int ctdb_add_public_address(struct ctdb_context *ctdb,
                        talloc_free(vnn);
                        return -1;
                }
+               if (i == 0) {
+                       vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
+               }
        }
 
        DLIST_ADD(ctdb->vnn, vnn);
@@ -723,6 +1012,7 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
                              const char *ip)
 {
        struct ctdb_vnn *svnn;
+       struct ctdb_iface *cur = NULL;
        bool ok;
        int ret;
 
@@ -751,6 +1041,14 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
                return -1;
        }
 
+       /* assume the single public ip interface is initially "good" */
+       cur = ctdb_find_iface(ctdb, iface);
+       if (cur == NULL) {
+               DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
+               return -1;
+       }
+       cur->link_up = true;
+
        ret = ctdb_vnn_assign_iface(ctdb, svnn);
        if (ret != 0) {
                talloc_free(svnn);
@@ -761,13 +1059,6 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
        return 0;
 }
 
-struct ctdb_public_ip_list {
-       struct ctdb_public_ip_list *next;
-       uint32_t pnn;
-       ctdb_sock_addr addr;
-};
-
-
 /* Given a physical node, return the number of
    public addresses that is currently assigned to this node.
 */
@@ -795,7 +1086,7 @@ static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn,
        struct ctdb_all_public_ips *public_ips;
        int i;
 
-       public_ips = ctdb->nodes[pnn]->public_ips;
+       public_ips = ctdb->nodes[pnn]->available_public_ips;
 
        if (public_ips == NULL) {
                return -1;
@@ -889,6 +1180,16 @@ static uint32_t *ip_key(ctdb_sock_addr *ip)
 
 static void *add_ip_callback(void *parm, void *data)
 {
+       struct ctdb_public_ip_list *this_ip = parm; 
+       struct ctdb_public_ip_list *prev_ip = data; 
+
+       if (prev_ip == NULL) {
+               return parm;
+       }
+       if (this_ip->pnn == -1) {
+               this_ip->pnn = prev_ip->pnn;
+       }
+
        return parm;
 }
 
@@ -901,18 +1202,21 @@ void getips_count_callback(void *param, void *data)
        *ip_list     = new_ip;
 }
 
-struct ctdb_public_ip_list *
-create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
+static struct ctdb_public_ip_list *
+create_merged_ip_list(struct ctdb_context *ctdb)
 {
        int i, j;
        struct ctdb_public_ip_list *ip_list;
        struct ctdb_all_public_ips *public_ips;
-       trbt_tree_t *ip_tree;
 
-       ip_tree = trbt_create(tmp_ctx, 0);
+       if (ctdb->ip_tree != NULL) {
+               talloc_free(ctdb->ip_tree);
+               ctdb->ip_tree = NULL;
+       }
+       ctdb->ip_tree = trbt_create(ctdb, 0);
 
        for (i=0;i<ctdb->num_nodes;i++) {
-               public_ips = ctdb->nodes[i]->public_ips;
+               public_ips = ctdb->nodes[i]->known_public_ips;
 
                if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
                        continue;
@@ -926,13 +1230,13 @@ create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
                for (j=0;j<public_ips->num;j++) {
                        struct ctdb_public_ip_list *tmp_ip; 
 
-                       tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
+                       tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
                        CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
                        tmp_ip->pnn  = public_ips->ips[j].pnn;
                        tmp_ip->addr = public_ips->ips[j].addr;
                        tmp_ip->next = NULL;
 
-                       trbt_insertarray32_callback(ip_tree,
+                       trbt_insertarray32_callback(ctdb->ip_tree,
                                IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
                                add_ip_callback,
                                tmp_ip);
@@ -940,102 +1244,124 @@ create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
        }
 
        ip_list = NULL;
-       trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
+       trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
 
        return ip_list;
 }
 
-/*
-  make any IP alias changes for public addresses that are necessary 
+/*
+ * This is the length of the longest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes.  The implementation means that all
+ * addresses end up being 128 bits long.
+ * Not static, so we can easily link it into a unit test.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
  */
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
 {
-       int i, num_healthy, retries;
-       struct ctdb_public_ip ip;
-       struct ctdb_public_ipv4 ipv4;
-       uint32_t mask;
-       struct ctdb_public_ip_list *all_ips, *tmp_ip;
-       int maxnode, maxnum=0, minnode, minnum=0, num;
-       TDB_DATA data;
-       struct timeval timeout;
-       struct client_async_data *async_data;
-       struct ctdb_client_control_state *state;
-       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
-
+       uint32_t ip1_k[IP_KEYLEN];
+       uint32_t *t;
+       int i;
+       uint32_t x;
 
-       ZERO_STRUCT(ip);
+       uint32_t distance = 0;
 
-       /* Count how many completely healthy nodes we have */
-       num_healthy = 0;
-       for (i=0;i<nodemap->num;i++) {
-               if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
-                       num_healthy++;
+       memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+       t = ip_key(ip2);
+       for (i=0; i<IP_KEYLEN; i++) {
+               x = ip1_k[i] ^ t[i];
+               if (x == 0) {
+                       distance += 32;
+               } else {
+                       /* Count leading zeroes of x (unsigned mask: no UB).
+                        * NOTE(review): no break, later words still count.
+                        */
+                       while ((x & ((uint32_t)1 << 31)) == 0) {
+                               x <<= 1;
+                               distance += 1;
+                       }
                }
        }
 
-       if (num_healthy > 0) {
-               /* We have healthy nodes, so only consider them for 
-                  serving public addresses
-               */
-               mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
-       } else {
-               /* We didnt have any completely healthy nodes so
-                  use "disabled" nodes as a fallback
-               */
-               mask = NODE_FLAGS_INACTIVE;
-       }
-
-       /* since nodes only know about those public addresses that
-          can be served by that particular node, no single node has
-          a full list of all public addresses that exist in the cluster.
-          Walk over all node structures and create a merged list of
-          all public addresses that exist in the cluster.
-       */
-       all_ips = create_merged_ip_list(ctdb, tmp_ctx);
+       return distance;
+}
 
-       /* If we want deterministic ip allocations, i.e. that the ip addresses
-          will always be allocated the same way for a specific set of
-          available/unavailable nodes.
-       */
-       if (1 == ctdb->tunable.deterministic_public_ips) {              
-               DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
-               for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
-                       tmp_ip->pnn = i%nodemap->num;
-               }
-       }
+/* Sum of squared ip_distance() values between the given IP and each
+   entry of ips that is assigned to node pnn.  The ips argument is
+   generally the all_ips variable used in the main part of the algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+                          struct ctdb_public_ip_list *ips,
+                          int pnn)
+{
+       struct ctdb_public_ip_list *t;
+       uint32_t d;
 
+       uint32_t sum = 0;
 
-       /* mark all public addresses with a masked node as being served by
-          node -1
-       */
-       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
-               if (tmp_ip->pnn == -1) {
+       for (t=ips; t != NULL; t=t->next) {
+               if (t->pnn != pnn) {
                        continue;
                }
-               if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
-                       tmp_ip->pnn = -1;
+
+               /* Optimisation: We never calculate the distance
+                * between an address and itself.  This allows us to
+                * calculate the effect of removing an address from a
+                * node by simply calculating the distance between
+                * that address and all of the existing addresses.
+                * Moreover, we assume that we're only ever dealing
+                * with addresses from all_ips so we can identify an
+                * address via a pointer rather than doing a more
+                * expensive address comparison. */
+               if (&(t->addr) == ip) {
+                       continue;
                }
+
+               d = ip_distance(ip, &(t->addr));
+               sum += d * d;  /* Cheaper than pulling in math.h :-) */
        }
 
-       /* verify that the assigned nodes can serve that public ip
-          and set it to -1 if not
-       */
-       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
-               if (tmp_ip->pnn == -1) {
+       return sum;
+}
+
+/* Return the LCP2 imbalance metric for node pnn: for every address
+   assigned to it, add ip_distance_2_sum() against the later list entries.
+ * Not static, so we can easily link it into a unit test.
+ */
+uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
+{
+       struct ctdb_public_ip_list *t;
+
+       uint32_t imbalance = 0;
+
+       for (t=all_ips; t!=NULL; t=t->next) {
+               if (t->pnn != pnn) {
                        continue;
                }
-               if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
-                       /* this node can not serve this ip. */
-                       tmp_ip->pnn = -1;
-               }
+               /* Pass the rest of the IPs rather than the whole
+                  all_ips input list, so each pair is visited once.
+               */
+               imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
        }
 
+       return imbalance;
+}
+
+/* Allocate any unassigned IPs just by looping through the IPs and
+ * finding the best node for each.
+ * Not static, so we can easily link it into a unit test.
+ */
+void basic_allocate_unassigned(struct ctdb_context *ctdb,
+                              struct ctdb_node_map *nodemap,
+                              uint32_t mask,
+                              struct ctdb_public_ip_list *all_ips)
+{
+       struct ctdb_public_ip_list *tmp_ip;
 
-       /* now we must redistribute all public addresses with takeover node
-          -1 among the nodes available
-       */
-       retries = 0;
-try_again:
        /* loop over all ip's and find a physical node to cover for 
           each unassigned ip.
        */
@@ -1047,26 +1373,26 @@ try_again:
                        }
                }
        }
+}
 
-       /* If we dont want ips to fail back after a node becomes healthy
-          again, we wont even try to reallocat the ip addresses so that
-          they are evenly spread out.
-          This can NOT be used at the same time as DeterministicIPs !
-       */
-       if (1 == ctdb->tunable.no_ip_failback) {
-               if (1 == ctdb->tunable.deterministic_public_ips) {
-                       DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
-               }
-               goto finished;
-       }
-
+/* Basic non-deterministic rebalancing algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+bool basic_failback(struct ctdb_context *ctdb,
+                   struct ctdb_node_map *nodemap,
+                   uint32_t mask,
+                   struct ctdb_public_ip_list *all_ips,
+                   int num_ips,
+                   int *retries)
+{
+       int i;
+       int maxnode, maxnum=0, minnode, minnum=0, num;
+       struct ctdb_public_ip_list *tmp_ip;
 
-       /* now, try to make sure the ip adresses are evenly distributed
-          across the node.
-          for each ip address, loop over all nodes that can serve this
-          ip and make sure that the difference between the node
-          serving the most and the node serving the least ip's are not greater
-          than 1.
+       /* for each ip address, loop over all nodes that can serve
+          this ip and make sure that the difference between the node
+          serving the most and the node serving the least ip's are
+          not greater than 1.
        */
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
@@ -1116,37 +1442,482 @@ try_again:
                        continue;
                }
 
-               /* If we want deterministic IPs then dont try to reallocate 
-                  them to spread out the load.
-               */
-               if (1 == ctdb->tunable.deterministic_public_ips) {
+               /* If we want deterministic IPs then dont try to reallocate 
+                  them to spread out the load.
+               */
+               if (1 == ctdb->tunable.deterministic_public_ips) {
+                       continue;
+               }
+
+               /* if the spread between the smallest and largest coverage by
+                  a node is >=2 we steal one of the ips from the node with
+                  most coverage to even things out a bit.
+                  try to do this a limited number of times since we dont
+                  want to spend too much time balancing the ip coverage.
+               */
+               if ( (maxnum > minnum+1)
+                    && (*retries < (num_ips + 5)) ){
+                       struct ctdb_public_ip_list *tmp;
+
+                       /* mark one of maxnode's vnn's as unassigned and try
+                          again
+                       */
+                       for (tmp=all_ips;tmp;tmp=tmp->next) {
+                               if (tmp->pnn == maxnode) {
+                                       tmp->pnn = -1;
+                                       (*retries)++;
+                                       return true;
+                               }
+                       }
+               }
+       }
+
+       return false;
+}
+
+/* Set up the per-node state used by the LCP2 algorithm.
+ *
+ * Two arrays with one entry per node in nodemap are allocated on
+ * tmp_ctx and handed back through lcp2_imbalances and newly_healthy:
+ *  - (*lcp2_imbalances)[n] is lcp2_imbalance(all_ips, n);
+ *  - (*newly_healthy)[n] is true when node n has none of the flags in
+ *    mask set and no address in all_ips is currently assigned to it.
+ *
+ * NOTE(review): ctdb_takeover_run_core() passes a talloc_new() context
+ * as tmp_ctx although it is declared as a struct ctdb_context and is
+ * also given to CTDB_NO_MEMORY_FATAL - confirm that this is intended.
+ *
+ * Kept in its own function, and not static, so that it can be linked
+ * into a unit test.
+ */
+void lcp2_init(struct ctdb_context * tmp_ctx,
+              struct ctdb_node_map * nodemap,
+              uint32_t mask,
+              struct ctdb_public_ip_list *all_ips,
+              uint32_t **lcp2_imbalances,
+              bool **newly_healthy)
+{
+       struct ctdb_public_ip_list *ip;
+       uint32_t *imbalances;
+       bool *healthy;
+       int pnn;
+
+       *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
+       *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
+
+       healthy = *newly_healthy;
+       imbalances = *lcp2_imbalances;
+
+       /* Start by treating every node without a masked flag as a
+        * candidate, and record its current imbalance. */
+       for (pnn = 0; pnn < nodemap->num; pnn++) {
+               imbalances[pnn] = lcp2_imbalance(all_ips, pnn);
+               healthy[pnn] = ((nodemap->nodes[pnn].flags & mask) == 0);
+       }
+
+       /* A node that already holds an address must have been healthy
+        * before, so it is not "newly" healthy. */
+       for (ip = all_ips; ip != NULL; ip = ip->next) {
+               if (ip->pnn == -1) {
+                       continue;
+               }
+               healthy[ip->pnn] = false;
+       }
+}
+
+/* Hand out every unassigned address (pnn == -1) using LCP2.
+ *
+ * Each pass looks at all unassigned addresses and all nodes that can
+ * serve them and that have none of the flags in mask set, picks the
+ * address/node pair with the smallest ip_distance_2_sum() cost (the
+ * first one found wins a tie), assigns it and adds the cost to that
+ * node's entry in lcp2_imbalances.  Passes repeat until nothing is
+ * left unassigned or no pair could be found; in the latter case a
+ * warning is logged for every address that stays unassigned.
+ *
+ * Not static, so we can easily link it into a unit test.
+ */
+void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
+                             struct ctdb_node_map *nodemap,
+                             uint32_t mask,
+                             struct ctdb_public_ip_list *all_ips,
+                             uint32_t *lcp2_imbalances)
+{
+       struct ctdb_public_ip_list *ip;
+       struct ctdb_public_ip_list *best_ip;
+       int node, best_node;
+       uint32_t cost, best_cost, new_imbl;
+       bool have_unassigned;
+
+       do {
+               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+               DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+               best_node = -1;
+               best_cost = 0;
+               best_ip = NULL;
+
+               /* Scan every (unassigned address, usable node) pair. */
+               for (ip = all_ips; ip != NULL; ip = ip->next) {
+                       if (ip->pnn != -1) {
+                               continue;
+                       }
+
+                       for (node = 0; node < nodemap->num; node++) {
+                               /* Skip nodes that cannot serve this
+                                * address or that are masked out. */
+                               if (can_node_serve_ip(ctdb, node, ip) != 0 ||
+                                   (nodemap->nodes[node].flags & mask) != 0) {
+                                       continue;
+                               }
+
+                               cost = ip_distance_2_sum(&(ip->addr), all_ips, node);
+                               new_imbl = lcp2_imbalances[node] + cost;
+                               DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
+                                                  ctdb_addr_to_str(&(ip->addr)),
+                                                  node,
+                                                  new_imbl - lcp2_imbalances[node]));
+
+                               /* Keep the current best unless this
+                                * pair is strictly cheaper. */
+                               if (best_node != -1 && cost >= best_cost) {
+                                       continue;
+                               }
+
+                               best_node = node;
+                               best_cost = cost;
+                               best_ip = ip;
+                       }
+               }
+
+               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+               /* Commit the cheapest pair found in this pass, if any. */
+               if (best_node != -1) {
+                       best_ip->pnn = best_node;
+                       lcp2_imbalances[best_node] += best_cost;
+                       DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+                                         ctdb_addr_to_str(&(best_ip->addr)),
+                                         best_node,
+                                         best_cost));
+               }
+
+               /* Is there still anything left to place? */
+               have_unassigned = false;
+               for (ip = all_ips; ip != NULL; ip = ip->next) {
+                       if (ip->pnn == -1) {
+                               have_unassigned = true;
+                               break;
+                       }
+               }
+       } while (have_unassigned && best_node != -1);
+
+       /* Nothing more can be placed: report whatever is left over. */
+       if (!have_unassigned) {
+               return;
+       }
+       for (ip = all_ips; ip != NULL; ip = ip->next) {
+               if (ip->pnn == -1) {
+                       DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
+                                            ctdb_addr_to_str(&ip->addr)));
+               }
+       }
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.
+ *
+ * srcnode is the node to take an address from and candimbl is the
+ * imbalance recorded for it.  Only nodes flagged in newly_healthy
+ * that can serve the address are considered as destinations.  When a
+ * suitable move is found the address is reassigned, the
+ * lcp2_imbalances entries of both nodes are updated and true is
+ * returned; otherwise nothing is changed and false is returned.
+ *
+ * Not static, so we can easily link it into a unit test.
+ */
+bool lcp2_failback_candidate(struct ctdb_context *ctdb,
+                            struct ctdb_node_map *nodemap,
+                            struct ctdb_public_ip_list *all_ips,
+                            int srcnode,
+                            uint32_t candimbl,
+                            uint32_t *lcp2_imbalances,
+                            bool *newly_healthy)
+{
+       int dstnode, mindstnode;
+       uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
+       uint32_t minsrcimbl, mindstimbl;
+       struct ctdb_public_ip_list *minip;
+       struct ctdb_public_ip_list *tmp_ip;
+
+       /* Find an IP and destination node that best reduces imbalance. */
+       minip = NULL;
+       minsrcimbl = 0;
+       mindstnode = -1;
+       mindstimbl = 0;
+
+       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+       DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
+
+       for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
+               /* Only consider addresses on srcnode. */
+               if (tmp_ip->pnn != srcnode) {
+                       continue;
+               }
+
+               /* What is this IP address costing the source node? */
+               srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
+               srcimbl = candimbl - srcdsum;
+
+               /* Consider what this IP address would cost each
+                * potential destination node.  Destination nodes are
+                * limited to those that are newly healthy, since we
+                * don't want to do gratuitous failover of IPs just to
+                * make minor balance improvements.
+                */
+               for (dstnode=0; dstnode < nodemap->num; dstnode++) {
+                       if (! newly_healthy[dstnode]) {
+                               continue;
+                       }
+                       /* only check nodes that can actually serve this ip */
+                       if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
+                               /* no it couldn't, so skip to the next node */
+                               continue;
+                       }
+
+                       dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
+                       dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+                       DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+                                          srcnode, srcimbl - lcp2_imbalances[srcnode],
+                                          ctdb_addr_to_str(&(tmp_ip->addr)),
+                                          dstnode, dstimbl - lcp2_imbalances[dstnode]));
+
+                       /* The move must leave the destination below the
+                        * source's current imbalance and cost less there
+                        * than it does here; among such moves keep the
+                        * one with the lowest combined imbalance.
+                        */
+                       if ((dstimbl < candimbl) && (dstdsum < srcdsum) &&
+                           ((mindstnode == -1) ||
+                            ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+                               minip = tmp_ip;
+                               minsrcimbl = srcimbl;
+                               mindstnode = dstnode;
+                               mindstimbl = dstimbl;
+                       }
+               }
+       }
+       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+       if (mindstnode != -1) {
+               /* We found a move that makes things better... */
+               DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
+                                 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+                                 ctdb_addr_to_str(&(minip->addr)),
+                                 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+               /* Record the imbalance computed for the address that
+                * was actually chosen (minsrcimbl).  srcimbl only
+                * holds the value for the last address the scan
+                * looked at, which need not be the one being moved.
+                */
+               lcp2_imbalances[srcnode] = minsrcimbl;
+               lcp2_imbalances[mindstnode] = mindstimbl;
+               minip->pnn = mindstnode;
+
+               return true;
+       }
+
+       return false;
+}
+
+/* Pairs an LCP2 imbalance value with the node it belongs to, so that
+ * an array of these can be sorted by imbalance without losing track
+ * of which node each value came from.
+ */
+struct lcp2_imbalance_pnn {
+       /* imbalance value recorded for the node */
+       uint32_t imbalance;
+       /* number (node map index) of the node */
+       int pnn;
+};
+
+/* qsort() comparison for struct lcp2_imbalance_pnn that orders the
+ * entries by descending imbalance: returns -1 when a has the larger
+ * imbalance, 1 when b has, and 0 when they are equal.
+ */
+int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+       const struct lcp2_imbalance_pnn *lhs = a;
+       const struct lcp2_imbalance_pnn *rhs = b;
+
+       if (lhs->imbalance == rhs->imbalance) {
+               return 0;
+       }
+
+       return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ *
+ * Source nodes are tried in order of decreasing imbalance until
+ * lcp2_failback_candidate() manages to move an address or a node
+ * with zero imbalance is reached.  Returns true if an address was
+ * moved, false otherwise (including when no node is flagged in
+ * newly_healthy).  The mask argument is not used here.
+ *
+ * Not static, so we can easily link it into a unit test.
+ */
+bool lcp2_failback(struct ctdb_context *ctdb,
+                  struct ctdb_node_map *nodemap,
+                  uint32_t mask,
+                  struct ctdb_public_ip_list *all_ips,
+                  uint32_t *lcp2_imbalances,
+                  bool *newly_healthy)
+{
+       int i, num_newly_healthy;
+       struct lcp2_imbalance_pnn * lips;
+       bool ret;
+
+       /* It is only worth continuing if we have suitable target
+        * nodes to transfer IPs to.  This check is much cheaper than
+        * continuing on...
+        */
+       num_newly_healthy = 0;
+       for (i = 0; i < nodemap->num; i++) {
+               if (newly_healthy[i]) {
+                       num_newly_healthy++;
+               }
+       }
+       if (num_newly_healthy == 0) {
+               return false;
+       }
+
+       /* Put the imbalances and nodes into an array, sort them and
+        * iterate through candidates.  Usually the 1st one will be
+        * used, so this doesn't cost much...
+        */
+       lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
+       /* The array is written to straight away, so an allocation
+        * failure must not go unnoticed.
+        */
+       CTDB_NO_MEMORY_FATAL(ctdb, lips);
+       for (i = 0; i < nodemap->num; i++) {
+               lips[i].imbalance = lcp2_imbalances[i];
+               lips[i].pnn = i;
+       }
+       qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
+             lcp2_cmp_imbalance_pnn);
+
+       ret = false;
+       for (i = 0; i < nodemap->num; i++) {
+               /* This means that all nodes had 0 or 1 addresses, so
+                * can't be imbalanced.
+                */
+               if (lips[i].imbalance == 0) {
+                       break;
+               }
+
+               if (lcp2_failback_candidate(ctdb,
+                                           nodemap,
+                                           all_ips,
+                                           lips[i].pnn,
+                                           lips[i].imbalance,
+                                           lcp2_imbalances,
+                                           newly_healthy)) {
+                       ret = true;
+                       break;
+               }
+       }
+
+       talloc_free(lips);
+       return ret;
+}
+
+/* The calculation part of the IP allocation algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+void ctdb_takeover_run_core(struct ctdb_context *ctdb,
+                           struct ctdb_node_map *nodemap,
+                           struct ctdb_public_ip_list **all_ips_p)
+{
+       int i, num_healthy, retries, num_ips;
+       uint32_t mask;
+       struct ctdb_public_ip_list *all_ips, *tmp_ip;
+       uint32_t *lcp2_imbalances;
+       bool *newly_healthy;
+
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+       /* Count how many completely healthy nodes we have */
+       num_healthy = 0;
+       for (i=0;i<nodemap->num;i++) {
+               if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
+                       num_healthy++;
+               }
+       }
+
+       if (num_healthy > 0) {
+               /* We have healthy nodes, so only consider them for 
+                  serving public addresses
+               */
+               mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
+       } else {
+               /* We didnt have any completely healthy nodes so
+                  use "disabled" nodes as a fallback
+               */
+               mask = NODE_FLAGS_INACTIVE;
+       }
+
+       /* since nodes only know about those public addresses that
+          can be served by that particular node, no single node has
+          a full list of all public addresses that exist in the cluster.
+          Walk over all node structures and create a merged list of
+          all public addresses that exist in the cluster.
+
+          keep the tree of ips around as ctdb->ip_tree
+       */
+       all_ips = create_merged_ip_list(ctdb);
+       *all_ips_p = all_ips; /* minimal code changes */
+
+       /* Count how many ips we have */
+       num_ips = 0;
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               num_ips++;
+       }
+
+       /* If we want deterministic ip allocations, i.e. that the ip addresses
+          will always be allocated the same way for a specific set of
+          available/unavailable nodes.
+       */
+       if (1 == ctdb->tunable.deterministic_public_ips) {              
+               DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+               for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
+                       tmp_ip->pnn = i%nodemap->num;
+               }
+       }
+
+
+       /* mark all public addresses with a masked node as being served by
+          node -1
+       */
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               if (tmp_ip->pnn == -1) {
+                       continue;
+               }
+               if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
+                       tmp_ip->pnn = -1;
+               }
+       }
+
+       /* verify that the assigned nodes can serve that public ip
+          and set it to -1 if not
+       */
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               if (tmp_ip->pnn == -1) {
                        continue;
                }
+               if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
+                       /* this node can not serve this ip. */
+                       tmp_ip->pnn = -1;
+               }
+       }
 
-               /* if the spread between the smallest and largest coverage by
-                  a node is >=2 we steal one of the ips from the node with
-                  most coverage to even things out a bit.
-                  try to do this at most 5 times  since we dont want to spend
-                  too much time balancing the ip coverage.
-               */
-               if ( (maxnum > minnum+1)
-                 && (retries < 5) ){
-                       struct ctdb_public_ip_list *tmp;
+        if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+               lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
+       }
 
-                       /* mark one of maxnode's vnn's as unassigned and try
-                          again
-                       */
-                       for (tmp=all_ips;tmp;tmp=tmp->next) {
-                               if (tmp->pnn == maxnode) {
-                                       tmp->pnn = -1;
-                                       retries++;
-                                       goto try_again;
-                               }
-                       }
+       /* now we must redistribute all public addresses with takeover node
+          -1 among the nodes available
+       */
+       retries = 0;
+try_again:
+       if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+               lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
+       } else {
+               basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
+       }
+
+       /* If we dont want ips to fail back after a node becomes healthy
+          again, we wont even try to reallocat the ip addresses so that
+          they are evenly spread out.
+          This can NOT be used at the same time as DeterministicIPs !
+       */
+       if (1 == ctdb->tunable.no_ip_failback) {
+               if (1 == ctdb->tunable.deterministic_public_ips) {
+                       DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
                }
+               goto finished;
        }
 
 
+       /* now, try to make sure the ip adresses are evenly distributed
+          across the node.
+       */
+       if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+               if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
+                       goto try_again;
+               }
+       } else {
+               if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
+                       goto try_again;
+               }
+       }
+
        /* finished distributing the public addresses, now just send the 
           info out to the nodes
        */
@@ -1156,12 +1927,48 @@ finished:
           or -1 if there is no node that can cover this ip
        */
 
+       return;
+}
+
+/*
+  make any IP alias changes for public addresses that are necessary 
+ */
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+                     client_async_callback fail_callback, void *callback_data)
+{
+       int i;
+       struct ctdb_public_ip ip;
+       struct ctdb_public_ipv4 ipv4;
+       uint32_t *nodes;
+       struct ctdb_public_ip_list *all_ips, *tmp_ip;
+       TDB_DATA data;
+       struct timeval timeout;
+       struct client_async_data *async_data;
+       struct ctdb_client_control_state *state;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+       /*
+        * ip failover is completely disabled, just send out the 
+        * ipreallocated event.
+        */
+       if (ctdb->tunable.disable_ip_failover != 0) {
+               goto ipreallocated;
+       }
+
+       ZERO_STRUCT(ip);
+
+       /* Do the IP reassignment calculations */
+       ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
+
        /* now tell all nodes to delete any alias that they should not
           have.  This will be a NOOP on nodes that don't currently
           hold the given alias */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
 
+       async_data->fail_callback = fail_callback;
+       async_data->callback_data = callback_data;
+
        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */
                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
@@ -1219,6 +2026,10 @@ finished:
        /* tell all nodes to get their own IPs */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+
+       async_data->fail_callback = fail_callback;
+       async_data->callback_data = callback_data;
+
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        /* this IP won't be taken over */
@@ -1262,6 +2073,27 @@ finished:
                return -1;
        }
 
+ipreallocated:
+       /*
+        * Tell all connected, but not stopped (since they are in
+        * recovery and will reject the event), nodes to run
+        * eventscripts to process the "ipreallocated" event.  This
+        * can do a lot of things, including restarting services to
+        * reconfigure them if public IPs have moved.  Once upon a
+        * time this event only used to update natwg.
+        */
+       data.dptr  = discard_const("ipreallocated");
+       data.dsize = strlen((char *)data.dptr) + 1; 
+       nodes = list_of_nodes(ctdb, nodemap, tmp_ctx,
+                             NODE_FLAGS_DISCONNECTED|NODE_FLAGS_STOPPED, -1);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
+                                     nodes, 0, TAKEOVER_TIMEOUT(),
+                                     false, data,
+                                     NULL, fail_callback,
+                                     callback_data) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
+       }
+
        talloc_free(tmp_ctx);
        return 0;
 }
@@ -1295,7 +2127,7 @@ int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
        struct ctdb_control_tcp_addr new_addr;
        struct ctdb_control_tcp_addr *tcp_sock = NULL;
        struct ctdb_tcp_list *tcp;
-       struct ctdb_control_tcp_vnn t;
+       struct ctdb_tcp_connection t;
        int ret;
        TDB_DATA data;
        struct ctdb_client_ip *ip;
@@ -1375,8 +2207,8 @@ int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
 
        DLIST_ADD(client->tcp_list, tcp);
 
-       t.src  = tcp_sock->src;
-       t.dest = tcp_sock->dest;
+       t.src_addr = tcp_sock->src;
+       t.dst_addr = tcp_sock->dest;
 
        data.dptr = (uint8_t *)&t;
        data.dsize = sizeof(t);
@@ -1432,22 +2264,24 @@ static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
        return NULL;
 }
 
+
+
 /*
   called by a daemon to inform us of a TCP connection that one of its
   clients managing that should tickled with an ACK when IP takeover is
   done
  */
-int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
+int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
 {
-       struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
+       struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
        struct ctdb_tcp_array *tcparray;
        struct ctdb_tcp_connection tcp;
        struct ctdb_vnn *vnn;
 
-       vnn = find_public_ip_vnn(ctdb, &p->dest);
+       vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
        if (vnn == NULL) {
                DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
-                       ctdb_addr_to_str(&p->dest)));
+                       ctdb_addr_to_str(&p->dst_addr)));
 
                return -1;
        }
@@ -1467,16 +2301,20 @@ int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
                tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
                CTDB_NO_MEMORY(ctdb, tcparray->connections);
 
-               tcparray->connections[tcparray->num].src_addr = p->src;
-               tcparray->connections[tcparray->num].dst_addr = p->dest;
+               tcparray->connections[tcparray->num].src_addr = p->src_addr;
+               tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
                tcparray->num++;
+
+               if (tcp_update_needed) {
+                       vnn->tcp_update_needed = true;
+               }
                return 0;
        }
 
 
        /* Do we already have this tickle ?*/
-       tcp.src_addr = p->src;
-       tcp.dst_addr = p->dest;
+       tcp.src_addr = p->src_addr;
+       tcp.dst_addr = p->dst_addr;
        if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
                DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
                        ctdb_addr_to_str(&tcp.dst_addr),
@@ -1492,8 +2330,8 @@ int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
        CTDB_NO_MEMORY(ctdb, tcparray->connections);
 
        vnn->tcp_array = tcparray;
-       tcparray->connections[tcparray->num].src_addr = p->src;
-       tcparray->connections[tcparray->num].dst_addr = p->dest;
+       tcparray->connections[tcparray->num].src_addr = p->src_addr;
+       tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
        tcparray->num++;
                                
        DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
@@ -1501,6 +2339,10 @@ int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
                ntohs(tcp.dst_addr.ip.sin_port),
                vnn->pnn));
 
+       if (tcp_update_needed) {
+               vnn->tcp_update_needed = true;
+       }
+
        return 0;
 }
 
@@ -1568,6 +2410,20 @@ static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tc
 }
 
 
+/*
+  called by a daemon to tell us that a TCP connection used by one of
+  its clients is no longer needed in the tickle database.  indata
+  carries the struct ctdb_tcp_connection to remove; always returns 0.
+ */
+int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       ctdb_remove_tcp_connection(ctdb,
+                                  (struct ctdb_tcp_connection *)indata.dptr);
+
+       return 0;
+}
+
+
 /*
   called when a daemon restarts - send all tickes for all public addresses
   we are serving immediately to the new node.
@@ -1627,6 +2483,11 @@ int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
        int i, num, len;
        struct ctdb_all_public_ips *ips;
        struct ctdb_vnn *vnn;
+       bool only_available = false;
+
+       if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
+               only_available = true;
+       }
 
        /* count how many public ip structures we have */
        num = 0;
@@ -1639,16 +2500,21 @@ int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
        ips = talloc_zero_size(outdata, len);
        CTDB_NO_MEMORY(ctdb, ips);
 
-       outdata->dsize = len;
-       outdata->dptr  = (uint8_t *)ips;
-
-       ips->num = num;
        i = 0;
        for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+               if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
+                       continue;
+               }
                ips->ips[i].pnn  = vnn->pnn;
                ips->ips[i].addr = vnn->public_address;
                i++;
        }
+       ips->num = i;
+       len = offsetof(struct ctdb_all_public_ips, ips) +
+               i*sizeof(struct ctdb_public_ip);
+
+       outdata->dsize = len;
+       outdata->dptr  = (uint8_t *)ips;
 
        return 0;
 }
@@ -1695,6 +2561,166 @@ int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb,
        return 0;
 }
 
+int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
+                                       struct ctdb_req_control *c,
+                                       TDB_DATA indata,
+                                       TDB_DATA *outdata)
+{
+       int i, num, len;
+       ctdb_sock_addr *addr;
+       struct ctdb_control_public_ip_info *info;
+       struct ctdb_vnn *vnn;
+       /* indata carries the address to look up; the reply is built in outdata */
+       addr = (ctdb_sock_addr *)indata.dptr;
+
+       vnn = find_public_ip_vnn(ctdb, addr);
+       if (vnn == NULL) {
+               /* if it is not a public ip, it could be our 'single ip' */
+               if (ctdb->single_ip_vnn) {
+                       if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
+                               vnn = ctdb->single_ip_vnn;
+                       }
+               }
+       }
+       if (vnn == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
+                                "'%s'not a public address\n",
+                                ctdb_addr_to_str(addr)));
+               return -1;
+       }
+
+       /* count the NULL-terminated vnn->ifaces list to size the reply */
+       num = 0;
+       for (;vnn->ifaces[num];) {
+               num++;
+       }
+
+       len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
+               num*sizeof(struct ctdb_control_iface_info);
+       info = talloc_zero_size(outdata, len);
+       CTDB_NO_MEMORY(ctdb, info);
+
+       info->ip.addr = vnn->public_address;
+       info->ip.pnn = vnn->pnn;
+       info->active_idx = 0xFFFFFFFF; /* kept if no entry matches vnn->iface */
+       /* fill one entry per interface name; an unknown name fails the control */
+       for (i=0; vnn->ifaces[i]; i++) {
+               struct ctdb_iface *cur;
+
+               cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
+               if (cur == NULL) {
+                       DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
+                                          vnn->ifaces[i]));
+                       return -1;
+               }
+               if (vnn->iface == cur) {
+                       info->active_idx = i;
+               }
+               strcpy(info->ifaces[i].name, cur->name); /* NOTE(review): unbounded copy - confirm name fits */
+               info->ifaces[i].link_state = cur->link_up;
+               info->ifaces[i].references = cur->references;
+       }
+       info->num = i;
+       len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
+               i*sizeof(struct ctdb_control_iface_info);
+       /* hand the buffer (allocated as a talloc child of outdata) to the caller */
+       outdata->dsize = len;
+       outdata->dptr  = (uint8_t *)info;
+
+       return 0;
+}
+
+int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               TDB_DATA *outdata)
+{
+       int i, num, len;
+       struct ctdb_control_get_ifaces *ifaces;
+       struct ctdb_iface *cur;
+
+       /* count the entries on the ctdb->ifaces list to size the reply */
+       num = 0;
+       for (cur=ctdb->ifaces;cur;cur=cur->next) {
+               num++;
+       }
+
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
+               num*sizeof(struct ctdb_control_iface_info);
+       ifaces = talloc_zero_size(outdata, len);
+       CTDB_NO_MEMORY(ctdb, ifaces);
+       /* copy name, link state and reference count of every interface */
+       i = 0;
+       for (cur=ctdb->ifaces;cur;cur=cur->next) {
+               strcpy(ifaces->ifaces[i].name, cur->name); /* NOTE(review): unbounded copy - confirm name fits */
+               ifaces->ifaces[i].link_state = cur->link_up;
+               ifaces->ifaces[i].references = cur->references;
+               i++;
+       }
+       ifaces->num = i;
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
+               i*sizeof(struct ctdb_control_iface_info);
+       /* hand the buffer (allocated as a talloc child of outdata) to the caller */
+       outdata->dsize = len;
+       outdata->dptr  = (uint8_t *)ifaces;
+
+       return 0;
+}
+
+int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
+                                   struct ctdb_req_control *c,
+                                   TDB_DATA indata)
+{
+       struct ctdb_control_iface_info *info;
+       struct ctdb_iface *iface;
+       bool link_up = false;
+       /* indata is read as a struct ctdb_control_iface_info describing the request */
+       info = (struct ctdb_control_iface_info *)indata.dptr;
+       /* require a NUL byte at name[CTDB_IFACE_SIZE]; otherwise log and fail */
+       if (info->name[CTDB_IFACE_SIZE] != '\0') {
+               int len = strnlen(info->name, CTDB_IFACE_SIZE);
+               DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
+                                 len, len, info->name));
+               return -1;
+       }
+       /* link_state must be exactly 0 (down) or 1 (up) */
+       switch (info->link_state) {
+       case 0:
+               link_up = false;
+               break;
+       case 1:
+               link_up = true;
+               break;
+       default:
+               DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
+                                 (unsigned int)info->link_state));
+               return -1;
+       }
+       /* the request must carry references == 0 */
+       if (info->references != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
+                                 (unsigned int)info->references));
+               return -1;
+       }
+       /* fail if ctdb_find_iface() has no interface for this name */
+       iface = ctdb_find_iface(ctdb, info->name);
+       if (iface == NULL) {
+               return -1;
+       }
+       /* nothing to log or store when the link state is unchanged */
+       if (link_up == iface->link_up) {
+               return 0;
+       }
+       /* a change up => down is logged at DEBUG_ERR, down => up at DEBUG_NOTICE */
+       DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
+             ("iface[%s] has changed it's link status %s => %s\n",
+              iface->name,
+              iface->link_up?"up":"down",
+              link_up?"up":"down"));
+
+       iface->link_up = link_up;
+       return 0;
+}
+
 
 /* 
    structure containing the listening socket and the list of tcp connections
@@ -1820,7 +2846,8 @@ static void tickle_connection_traverse(void *param, void *data)
 
        /* have tried too many times, just give up */
        if (con->count >= 5) {
-               talloc_free(con);
+               /* can't delete in traverse: reparent to delete_cons */
+               talloc_steal(param, con);
                return;
        }
 
@@ -1840,11 +2867,13 @@ static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct t
                                              struct timeval t, void *private_data)
 {
        struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
-
+       void *delete_cons = talloc_new(NULL);
 
        /* loop over all connections sending tickle ACKs */
-       trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, NULL);
+       trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
 
+       /* now we've finished traverse, it's safe to do deletion. */
+       talloc_free(delete_cons);
 
        /* If there are no more connections to kill we can remove the
           entire killtcp structure
@@ -1866,7 +2895,25 @@ static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct t
  */
 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
 {
+       struct ctdb_vnn *tmpvnn;
+
+       /* verify that killtcp->vnn is still on the ctdb->vnn list */
+       for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
+               if (tmpvnn == killtcp->vnn) {
+                       break;
+               }
+       }
+       /* not on the list: return without dereferencing killtcp->vnn */
+       if (tmpvnn == NULL) {
+               return 0;
+       }
+       /* only clear the vnn's back-pointer if it still refers to this killtcp */
+       if (killtcp->vnn->killtcp != killtcp) {
+               return 0;
+       }
+
        killtcp->vnn->killtcp = NULL;
+
        return 0;
 }
 
@@ -1921,7 +2968,7 @@ static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
           a new structure
         */
        if (killtcp == NULL) {
-               killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
+               killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
                CTDB_NO_MEMORY(ctdb, killtcp);
 
                killtcp->vnn         = vnn;
@@ -1967,8 +3014,9 @@ static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
 
        if (killtcp->fde == NULL) {
                killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
-                                           EVENT_FD_READ | EVENT_FD_AUTOCLOSE, 
+                                           EVENT_FD_READ,
                                            capture_tcp_handler, killtcp);
+               tevent_fd_set_auto_close(killtcp->fde);
 
                /* We also need to set up some events to tickle all these connections
                   until they are all reset
@@ -2351,12 +3399,17 @@ int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA inda
                        TALLOC_CTX *mem_ctx;
 
                        DLIST_REMOVE(ctdb->vnn, vnn);
-                       if (vnn->iface == NULL) {
+                       if (vnn->pnn != ctdb->pnn) {
+                               if (vnn->iface != NULL) {
+                                       ctdb_vnn_unassign_iface(ctdb, vnn);
+                               }
                                talloc_free(vnn);
                                return 0;
                        }
+                       vnn->pnn = -1;
 
                        mem_ctx = talloc_new(ctdb);
+                       talloc_steal(mem_ctx, vnn);
                        ret = ctdb_event_script_callback(ctdb, 
                                         mem_ctx, delete_ip_callback, mem_ctx,
                                         false,
@@ -2365,8 +3418,9 @@ int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA inda
                                         ctdb_vnn_iface_string(vnn),
                                         ctdb_addr_to_str(&vnn->public_address),
                                         vnn->public_netmask_bits);
-                       ctdb_vnn_unassign_iface(ctdb, vnn);
-                       talloc_free(vnn);
+                       if (vnn->iface != NULL) {
+                               ctdb_vnn_unassign_iface(ctdb, vnn);
+                       }
                        if (ret != 0) {
                                return -1;
                        }
@@ -2377,3 +3431,62 @@ int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA inda
        return -1;
 }
 
+/* Called from the recovery daemon to verify that the ip allocation reported
+   by a remote node (ips) agrees with ctdb->ip_tree. Returns 0 when consistent
+   or when nothing can be checked, -1 when the ips should be reallocated.
+*/
+int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
+{
+       struct ctdb_public_ip_list *tmp_ip; 
+       int i;
+
+       if (ctdb->ip_tree == NULL) {
+               /* dont know the expected allocation yet, assume remote node
+                  is correct. */
+               return 0;
+       }
+       /* no list from the remote node: nothing to compare */
+       if (ips == NULL) {
+               return 0;
+       }
+
+       for (i=0; i<ips->num; i++) {
+               tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
+               if (tmp_ip == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
+                       return -1;
+               }
+               /* a pnn of -1 on either side is never treated as a mismatch */
+               if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
+                       continue;
+               }
+               /* the remote node and ip_tree name different nodes for this address */
+               if (tmp_ip->pnn != ips->ips[i].pnn) {
+                       DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
+{
+       struct ctdb_public_ip_list *tmp_ip; 
+       /* store ip->pnn in the ctdb->ip_tree entry for ip->addr; -1 if there is no tree or no entry */
+       if (ctdb->ip_tree == NULL) {
+               DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
+               return -1;
+       }
+
+       tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
+       if (tmp_ip == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
+               return -1;
+       }
+       /* log the old and new pnn before overwriting the tree entry */
+       DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
+       tmp_ip->pnn = ip->pnn;
+
+       return 0;
+}