ctdb-ipalloc: Split IP allocation into its own build subsystem
authorMartin Schwenke <martin@meltin.net>
Mon, 23 Nov 2015 05:18:16 +0000 (16:18 +1100)
committerAmitay Isaacs <amitay@samba.org>
Wed, 13 Jan 2016 19:18:20 +0000 (20:18 +0100)
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/server/ctdb_takeover.c
ctdb/server/ipalloc.c [new file with mode: 0644]
ctdb/server/ipalloc.h [new file with mode: 0644]
ctdb/server/ipalloc_common.c [new file with mode: 0644]
ctdb/server/ipalloc_deterministic.c [new file with mode: 0644]
ctdb/server/ipalloc_lcp2.c [new file with mode: 0644]
ctdb/server/ipalloc_nondeterministic.c [new file with mode: 0644]
ctdb/server/ipalloc_private.h [new file with mode: 0644]
ctdb/tests/src/ctdbd_test.c
ctdb/wscript

index 227bd162d441c8d89504e4c4e8b3ace3120c90d4..c49009cf22d192cc5fa62826dcf4641d3aa47e52 100644 (file)
 #include "common/common.h"
 #include "common/logging.h"
 
+#include "server/ipalloc.h"
 
 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
 
 #define CTDB_ARP_INTERVAL 1
 #define CTDB_ARP_REPEAT   3
 
-/* Flags used in IP allocation algorithms. */
-enum ipalloc_algorithm {
-       IPALLOC_DETERMINISTIC,
-       IPALLOC_NONDETERMINISTIC,
-       IPALLOC_LCP2,
-};
-
-struct ipalloc_state {
-       uint32_t num;
-
-       /* Arrays with data for each node */
-       struct ctdb_public_ip_list_old **known_public_ips;
-       struct ctdb_public_ip_list_old **available_public_ips;
-       bool *noiptakeover;
-       bool *noiphost;
-
-       struct public_ip_list *all_ips;
-       enum ipalloc_algorithm algorithm;
-       uint32_t no_ip_failback;
-       uint32_t *force_rebalance_nodes;
-};
-
 struct ctdb_interface {
        struct ctdb_interface *prev, *next;
        const char *name;
@@ -1249,138 +1228,6 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
        return 0;
 }
 
-struct public_ip_list {
-       struct public_ip_list *next;
-       uint32_t pnn;
-       ctdb_sock_addr addr;
-};
-
-/* Given a physical node, return the number of
-   public addresses that is currently assigned to this node.
-*/
-static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
-{
-       int num=0;
-
-       for (;ips;ips=ips->next) {
-               if (ips->pnn == pnn) {
-                       num++;
-               }
-       }
-       return num;
-}
-
-
-/* Can the given node host the given IP: is the public IP known to the
- * node and is NOIPHOST unset?
-*/
-static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
-                            int32_t pnn,
-                            struct public_ip_list *ip)
-{
-       struct ctdb_public_ip_list_old *public_ips;
-       int i;
-
-       if (ipalloc_state->noiphost[pnn]) {
-               return false;
-       }
-
-       public_ips = ipalloc_state->available_public_ips[pnn];
-
-       if (public_ips == NULL) {
-               return false;
-       }
-
-       for (i=0; i<public_ips->num; i++) {
-               if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
-                       /* yes, this node can serve this public ip */
-                       return true;
-               }
-       }
-
-       return false;
-}
-
-static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
-                                int32_t pnn,
-                                struct public_ip_list *ip)
-{
-       if (ipalloc_state->noiptakeover[pnn]) {
-               return false;
-       }
-
-       return can_node_host_ip(ipalloc_state, pnn, ip);
-}
-
-/* search the node lists list for a node to takeover this ip.
-   pick the node that currently are serving the least number of ips
-   so that the ips get spread out evenly.
-*/
-static int find_takeover_node(struct ipalloc_state *ipalloc_state,
-                             struct public_ip_list *ip)
-{
-       int pnn, min=0, num;
-       int i, numnodes;
-
-       numnodes = ipalloc_state->num;
-       pnn    = -1;
-       for (i=0; i<numnodes; i++) {
-               /* verify that this node can serve this ip */
-               if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
-                       /* no it couldnt   so skip to the next node */
-                       continue;
-               }
-
-               num = node_ip_coverage(i, ipalloc_state->all_ips);
-               /* was this the first node we checked ? */
-               if (pnn == -1) {
-                       pnn = i;
-                       min  = num;
-               } else {
-                       if (num < min) {
-                               pnn = i;
-                               min  = num;
-                       }
-               }
-       }
-       if (pnn == -1) {
-               DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
-                       ctdb_addr_to_str(&ip->addr)));
-
-               return -1;
-       }
-
-       ip->pnn = pnn;
-       return 0;
-}
-
-#define IP_KEYLEN      4
-static uint32_t *ip_key(ctdb_sock_addr *ip)
-{
-       static uint32_t key[IP_KEYLEN];
-
-       bzero(key, sizeof(key));
-
-       switch (ip->sa.sa_family) {
-       case AF_INET:
-               key[3]  = htonl(ip->ip.sin_addr.s_addr);
-               break;
-       case AF_INET6: {
-               uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
-               key[0]  = htonl(s6_a32[0]);
-               key[1]  = htonl(s6_a32[1]);
-               key[2]  = htonl(s6_a32[2]);
-               key[3]  = htonl(s6_a32[3]);
-               break;
-       }
-       default:
-               DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
-               return key;
-       }
-
-       return key;
-}
-
 static void *add_ip_callback(void *parm, void *data)
 {
        struct public_ip_list *this_ip = parm;
@@ -1518,679 +1365,6 @@ create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_s
        return ip_list;
 }
 
-/* 
- * This is the length of the longtest common prefix between the IPs.
- * It is calculated by XOR-ing the 2 IPs together and counting the
- * number of leading zeroes.  The implementation means that all
- * addresses end up being 128 bits long.
- *
- * FIXME? Should we consider IPv4 and IPv6 separately given that the
- * 12 bytes of 0 prefix padding will hurt the algorithm if there are
- * lots of nodes and IP addresses?
- */
-static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
-{
-       uint32_t ip1_k[IP_KEYLEN];
-       uint32_t *t;
-       int i;
-       uint32_t x;
-
-       uint32_t distance = 0;
-
-       memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
-       t = ip_key(ip2);
-       for (i=0; i<IP_KEYLEN; i++) {
-               x = ip1_k[i] ^ t[i];
-               if (x == 0) {
-                       distance += 32;
-               } else {
-                       /* Count number of leading zeroes. 
-                        * FIXME? This could be optimised...
-                        */
-                       while ((x & (1 << 31)) == 0) {
-                               x <<= 1;
-                               distance += 1;
-                       }
-               }
-       }
-
-       return distance;
-}
-
-/* Calculate the IP distance for the given IP relative to IPs on the
-   given node.  The ips argument is generally the all_ips variable
-   used in the main part of the algorithm.
- */
-static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
-                                 struct public_ip_list *ips,
-                                 int pnn)
-{
-       struct public_ip_list *t;
-       uint32_t d;
-
-       uint32_t sum = 0;
-
-       for (t = ips; t != NULL; t = t->next) {
-               if (t->pnn != pnn) {
-                       continue;
-               }
-
-               /* Optimisation: We never calculate the distance
-                * between an address and itself.  This allows us to
-                * calculate the effect of removing an address from a
-                * node by simply calculating the distance between
-                * that address and all of the exitsing addresses.
-                * Moreover, we assume that we're only ever dealing
-                * with addresses from all_ips so we can identify an
-                * address via a pointer rather than doing a more
-                * expensive address comparison. */
-               if (&(t->addr) == ip) {
-                       continue;
-               }
-
-               d = ip_distance(ip, &(t->addr));
-               sum += d * d;  /* Cheaper than pulling in math.h :-) */
-       }
-
-       return sum;
-}
-
-/* Return the LCP2 imbalance metric for addresses currently assigned
-   to the given node.
- */
-static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
-{
-       struct public_ip_list *t;
-
-       uint32_t imbalance = 0;
-
-       for (t = all_ips; t != NULL; t = t->next) {
-               if (t->pnn != pnn) {
-                       continue;
-               }
-               /* Pass the rest of the IPs rather than the whole
-                  all_ips input list.
-               */
-               imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
-       }
-
-       return imbalance;
-}
-
-/* Allocate any unassigned IPs just by looping through the IPs and
- * finding the best node for each.
- */
-static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
-{
-       struct public_ip_list *t;
-
-       /* loop over all ip's and find a physical node to cover for
-          each unassigned ip.
-       */
-       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-               if (t->pnn == -1) {
-                       if (find_takeover_node(ipalloc_state, t)) {
-                               DEBUG(DEBUG_WARNING,
-                                     ("Failed to find node to cover ip %s\n",
-                                      ctdb_addr_to_str(&t->addr)));
-                       }
-               }
-       }
-}
-
-/* Basic non-deterministic rebalancing algorithm.
- */
-static void basic_failback(struct ipalloc_state *ipalloc_state,
-                          int num_ips)
-{
-       int i, numnodes;
-       int maxnode, maxnum, minnode, minnum, num, retries;
-       struct public_ip_list *t;
-
-       numnodes = ipalloc_state->num;
-       retries = 0;
-
-try_again:
-       maxnum=0;
-       minnum=0;
-
-       /* for each ip address, loop over all nodes that can serve
-          this ip and make sure that the difference between the node
-          serving the most and the node serving the least ip's are
-          not greater than 1.
-       */
-       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-               if (t->pnn == -1) {
-                       continue;
-               }
-
-               /* Get the highest and lowest number of ips's served by any 
-                  valid node which can serve this ip.
-               */
-               maxnode = -1;
-               minnode = -1;
-               for (i=0; i<numnodes; i++) {
-                       /* only check nodes that can actually serve this ip */
-                       if (!can_node_takeover_ip(ipalloc_state, i,
-                                                 t)) {
-                               /* no it couldnt   so skip to the next node */
-                               continue;
-                       }
-
-                       num = node_ip_coverage(i, ipalloc_state->all_ips);
-                       if (maxnode == -1) {
-                               maxnode = i;
-                               maxnum  = num;
-                       } else {
-                               if (num > maxnum) {
-                                       maxnode = i;
-                                       maxnum  = num;
-                               }
-                       }
-                       if (minnode == -1) {
-                               minnode = i;
-                               minnum  = num;
-                       } else {
-                               if (num < minnum) {
-                                       minnode = i;
-                                       minnum  = num;
-                               }
-                       }
-               }
-               if (maxnode == -1) {
-                       DEBUG(DEBUG_WARNING,
-                             (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
-                              ctdb_addr_to_str(&t->addr)));
-
-                       continue;
-               }
-
-               /* if the spread between the smallest and largest coverage by
-                  a node is >=2 we steal one of the ips from the node with
-                  most coverage to even things out a bit.
-                  try to do this a limited number of times since we dont
-                  want to spend too much time balancing the ip coverage.
-               */
-               if ((maxnum > minnum+1) &&
-                   (retries < (num_ips + 5))){
-                       struct public_ip_list *tt;
-
-                       /* Reassign one of maxnode's VNNs */
-                       for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) {
-                               if (tt->pnn == maxnode) {
-                                       (void)find_takeover_node(ipalloc_state,
-                                                                tt);
-                                       retries++;
-                                       goto try_again;;
-                               }
-                       }
-               }
-       }
-}
-
-static bool lcp2_init(struct ipalloc_state *ipalloc_state,
-                     uint32_t **lcp2_imbalances,
-                     bool **rebalance_candidates)
-{
-       int i, numnodes;
-       struct public_ip_list *t;
-
-       numnodes = ipalloc_state->num;
-
-       *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
-       if (*rebalance_candidates == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
-               return false;
-       }
-       *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
-       if (*lcp2_imbalances == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
-               return false;
-       }
-
-       for (i=0; i<numnodes; i++) {
-               (*lcp2_imbalances)[i] =
-                       lcp2_imbalance(ipalloc_state->all_ips, i);
-               /* First step: assume all nodes are candidates */
-               (*rebalance_candidates)[i] = true;
-       }
-
-       /* 2nd step: if a node has IPs assigned then it must have been
-        * healthy before, so we remove it from consideration.  This
-        * is overkill but is all we have because we don't maintain
-        * state between takeover runs.  An alternative would be to
-        * keep state and invalidate it every time the recovery master
-        * changes.
-        */
-       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-               if (t->pnn != -1) {
-                       (*rebalance_candidates)[t->pnn] = false;
-               }
-       }
-
-       /* 3rd step: if a node is forced to re-balance then
-          we allow failback onto the node */
-       if (ipalloc_state->force_rebalance_nodes == NULL) {
-               return true;
-       }
-       for (i = 0;
-            i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
-            i++) {
-               uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
-               if (pnn >= numnodes) {
-                       DEBUG(DEBUG_ERR,
-                             (__location__ "unknown node %u\n", pnn));
-                       continue;
-               }
-
-               DEBUG(DEBUG_NOTICE,
-                     ("Forcing rebalancing of IPs to node %u\n", pnn));
-               (*rebalance_candidates)[pnn] = true;
-       }
-
-       return true;
-}
-
-/* Allocate any unassigned addresses using the LCP2 algorithm to find
- * the IP/node combination that will cost the least.
- */
-static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
-                                    uint32_t *lcp2_imbalances)
-{
-       struct public_ip_list *t;
-       int dstnode, numnodes;
-
-       int minnode;
-       uint32_t mindsum, dstdsum, dstimbl, minimbl;
-       struct public_ip_list *minip;
-
-       bool should_loop = true;
-       bool have_unassigned = true;
-
-       numnodes = ipalloc_state->num;
-
-       while (have_unassigned && should_loop) {
-               should_loop = false;
-
-               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-               DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
-
-               minnode = -1;
-               mindsum = 0;
-               minip = NULL;
-
-               /* loop over each unassigned ip. */
-               for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
-                       if (t->pnn != -1) {
-                               continue;
-                       }
-
-                       for (dstnode = 0; dstnode < numnodes; dstnode++) {
-                               /* only check nodes that can actually takeover this ip */
-                               if (!can_node_takeover_ip(ipalloc_state,
-                                                         dstnode,
-                                                         t)) {
-                                       /* no it couldnt   so skip to the next node */
-                                       continue;
-                               }
-
-                               dstdsum = ip_distance_2_sum(&(t->addr),
-                                                           ipalloc_state->all_ips,
-                                                           dstnode);
-                               dstimbl = lcp2_imbalances[dstnode] + dstdsum;
-                               DEBUG(DEBUG_DEBUG,
-                                     (" %s -> %d [+%d]\n",
-                                      ctdb_addr_to_str(&(t->addr)),
-                                      dstnode,
-                                      dstimbl - lcp2_imbalances[dstnode]));
-
-
-                               if ((minnode == -1) || (dstdsum < mindsum)) {
-                                       minnode = dstnode;
-                                       minimbl = dstimbl;
-                                       mindsum = dstdsum;
-                                       minip = t;
-                                       should_loop = true;
-                               }
-                       }
-               }
-
-               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-
-               /* If we found one then assign it to the given node. */
-               if (minnode != -1) {
-                       minip->pnn = minnode;
-                       lcp2_imbalances[minnode] = minimbl;
-                       DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
-                                         ctdb_addr_to_str(&(minip->addr)),
-                                         minnode,
-                                         mindsum));
-               }
-
-               /* There might be a better way but at least this is clear. */
-               have_unassigned = false;
-               for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-                       if (t->pnn == -1) {
-                               have_unassigned = true;
-                       }
-               }
-       }
-
-       /* We know if we have an unassigned addresses so we might as
-        * well optimise.
-        */
-       if (have_unassigned) {
-               for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-                       if (t->pnn == -1) {
-                               DEBUG(DEBUG_WARNING,
-                                     ("Failed to find node to cover ip %s\n",
-                                      ctdb_addr_to_str(&t->addr)));
-                       }
-               }
-       }
-}
-
-/* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
- * to move IPs from, determines the best IP/destination node
- * combination to move from the source node.
- */
-static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
-                                   int srcnode,
-                                   uint32_t *lcp2_imbalances,
-                                   bool *rebalance_candidates)
-{
-       int dstnode, mindstnode, numnodes;
-       uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
-       uint32_t minsrcimbl, mindstimbl;
-       struct public_ip_list *minip;
-       struct public_ip_list *t;
-
-       /* Find an IP and destination node that best reduces imbalance. */
-       srcimbl = 0;
-       minip = NULL;
-       minsrcimbl = 0;
-       mindstnode = -1;
-       mindstimbl = 0;
-
-       numnodes = ipalloc_state->num;
-
-       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-       DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
-                          srcnode, lcp2_imbalances[srcnode]));
-
-       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-               /* Only consider addresses on srcnode. */
-               if (t->pnn != srcnode) {
-                       continue;
-               }
-
-               /* What is this IP address costing the source node? */
-               srcdsum = ip_distance_2_sum(&(t->addr),
-                                           ipalloc_state->all_ips,
-                                           srcnode);
-               srcimbl = lcp2_imbalances[srcnode] - srcdsum;
-
-               /* Consider this IP address would cost each potential
-                * destination node.  Destination nodes are limited to
-                * those that are newly healthy, since we don't want
-                * to do gratuitous failover of IPs just to make minor
-                * balance improvements.
-                */
-               for (dstnode = 0; dstnode < numnodes; dstnode++) {
-                       if (!rebalance_candidates[dstnode]) {
-                               continue;
-                       }
-
-                       /* only check nodes that can actually takeover this ip */
-                       if (!can_node_takeover_ip(ipalloc_state, dstnode,
-                                                 t)) {
-                               /* no it couldnt   so skip to the next node */
-                               continue;
-                       }
-
-                       dstdsum = ip_distance_2_sum(&(t->addr),
-                                                   ipalloc_state->all_ips,
-                                                   dstnode);
-                       dstimbl = lcp2_imbalances[dstnode] + dstdsum;
-                       DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
-                                          srcnode, -srcdsum,
-                                          ctdb_addr_to_str(&(t->addr)),
-                                          dstnode, dstdsum));
-
-                       if ((dstimbl < lcp2_imbalances[srcnode]) &&
-                           (dstdsum < srcdsum) &&                      \
-                           ((mindstnode == -1) ||                              \
-                            ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
-
-                               minip = t;
-                               minsrcimbl = srcimbl;
-                               mindstnode = dstnode;
-                               mindstimbl = dstimbl;
-                       }
-               }
-       }
-       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-
-        if (mindstnode != -1) {
-               /* We found a move that makes things better... */
-               DEBUG(DEBUG_INFO,
-                     ("%d [%d] -> %s -> %d [+%d]\n",
-                      srcnode, minsrcimbl - lcp2_imbalances[srcnode],
-                      ctdb_addr_to_str(&(minip->addr)),
-                      mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
-
-
-               lcp2_imbalances[srcnode] = minsrcimbl;
-               lcp2_imbalances[mindstnode] = mindstimbl;
-               minip->pnn = mindstnode;
-
-               return true;
-       }
-
-        return false;
-       
-}
-
-struct lcp2_imbalance_pnn {
-       uint32_t imbalance;
-       int pnn;
-};
-
-static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
-{
-       const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
-       const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
-
-       if (lipa->imbalance > lipb->imbalance) {
-               return -1;
-       } else if (lipa->imbalance == lipb->imbalance) {
-               return 0;
-       } else {
-               return 1;
-       }
-}
-
-/* LCP2 algorithm for rebalancing the cluster.  This finds the source
- * node with the highest LCP2 imbalance, and then determines the best
- * IP/destination node combination to move from the source node.
- */
-static void lcp2_failback(struct ipalloc_state *ipalloc_state,
-                         uint32_t *lcp2_imbalances,
-                         bool *rebalance_candidates)
-{
-       int i, numnodes;
-       struct lcp2_imbalance_pnn * lips;
-       bool again;
-
-       numnodes = ipalloc_state->num;
-
-try_again:
-       /* Put the imbalances and nodes into an array, sort them and
-        * iterate through candidates.  Usually the 1st one will be
-        * used, so this doesn't cost much...
-        */
-       DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
-       DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
-       lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
-       for (i = 0; i < numnodes; i++) {
-               lips[i].imbalance = lcp2_imbalances[i];
-               lips[i].pnn = i;
-               DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
-       }
-       qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
-             lcp2_cmp_imbalance_pnn);
-
-       again = false;
-       for (i = 0; i < numnodes; i++) {
-               /* This means that all nodes had 0 or 1 addresses, so
-                * can't be imbalanced.
-                */
-               if (lips[i].imbalance == 0) {
-                       break;
-               }
-
-               if (lcp2_failback_candidate(ipalloc_state,
-                                           lips[i].pnn,
-                                           lcp2_imbalances,
-                                           rebalance_candidates)) {
-                       again = true;
-                       break;
-               }
-       }
-
-       talloc_free(lips);
-       if (again) {
-               goto try_again;
-       }
-}
-
-static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
-{
-       struct public_ip_list *t;
-
-       /* verify that the assigned nodes can serve that public ip
-          and set it to -1 if not
-       */
-       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-               if (t->pnn == -1) {
-                       continue;
-               }
-               if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) {
-                       /* this node can not serve this ip. */
-                       DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
-                                          ctdb_addr_to_str(&(t->addr)),
-                                          t->pnn));
-                       t->pnn = -1;
-               }
-       }
-}
-
-static bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
-{
-       struct public_ip_list *t;
-       int i, numnodes;
-
-       numnodes = ipalloc_state->num;
-
-       DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
-       /* Allocate IPs to nodes in a modulo fashion so that IPs will
-        *  always be allocated the same way for a specific set of
-        *  available/unavailable nodes.
-       */
-
-       for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
-               t->pnn = i % numnodes;
-       }
-
-       /* IP failback doesn't make sense with deterministic
-        * IPs, since the modulo step above implicitly fails
-        * back IPs to their "home" node.
-        */
-       if (1 == ipalloc_state->no_ip_failback) {
-               DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
-       }
-
-       unassign_unsuitable_ips(ipalloc_state);
-
-       basic_allocate_unassigned(ipalloc_state);
-
-       /* No failback here! */
-
-       return true;
-}
-
-static bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
-{
-       /* This should be pushed down into basic_failback. */
-       struct public_ip_list *t;
-       int num_ips = 0;
-       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
-               num_ips++;
-       }
-
-       unassign_unsuitable_ips(ipalloc_state);
-
-       basic_allocate_unassigned(ipalloc_state);
-
-       /* If we don't want IPs to fail back then don't rebalance IPs. */
-       if (1 == ipalloc_state->no_ip_failback) {
-               return true;
-       }
-
-       /* Now, try to make sure the ip adresses are evenly distributed
-          across the nodes.
-       */
-       basic_failback(ipalloc_state, num_ips);
-
-       return true;
-}
-
-static bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
-{
-       uint32_t *lcp2_imbalances;
-       bool *rebalance_candidates;
-       int numnodes, num_rebalance_candidates, i;
-       bool ret = true;
-
-       unassign_unsuitable_ips(ipalloc_state);
-
-       if (!lcp2_init(ipalloc_state,
-                      &lcp2_imbalances, &rebalance_candidates)) {
-               ret = false;
-               goto finished;
-       }
-
-       lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
-
-       /* If we don't want IPs to fail back then don't rebalance IPs. */
-       if (1 == ipalloc_state->no_ip_failback) {
-               goto finished;
-       }
-
-       /* It is only worth continuing if we have suitable target
-        * nodes to transfer IPs to.  This check is much cheaper than
-        * continuing on...
-        */
-       numnodes = ipalloc_state->num;
-       num_rebalance_candidates = 0;
-       for (i=0; i<numnodes; i++) {
-               if (rebalance_candidates[i]) {
-                       num_rebalance_candidates++;
-               }
-       }
-       if (num_rebalance_candidates == 0) {
-               goto finished;
-       }
-
-       /* Now, try to make sure the ip adresses are evenly distributed
-          across the nodes.
-       */
-       lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
-
-finished:
-       return ret;
-}
-
 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
 {
        int i;
@@ -2205,30 +1379,6 @@ static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
        return true;
 }
 
-/* The calculation part of the IP allocation algorithm. */
-static bool ipalloc(struct ipalloc_state *ipalloc_state)
-{
-       bool ret;
-
-       switch (ipalloc_state->algorithm) {
-       case IPALLOC_LCP2:
-               ret = ipalloc_lcp2(ipalloc_state);
-               break;
-       case IPALLOC_DETERMINISTIC:
-               ret = ipalloc_deterministic(ipalloc_state);
-               break;
-       case IPALLOC_NONDETERMINISTIC:
-               ret = ipalloc_nondeterministic(ipalloc_state);
-               break;
-       }
-
-       /* at this point ->pnn is the node which will own each IP
-          or -1 if there is no node that can cover this ip
-       */
-
-       return ret;
-}
-
 struct get_tunable_callback_data {
        const char *tunable;
        uint32_t *out;
diff --git a/ctdb/server/ipalloc.c b/ctdb/server/ipalloc.c
new file mode 100644 (file)
index 0000000..9b5579c
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+
+#include "common/logging.h"
+
+#include "server/ipalloc_private.h"
+
+/* The calculation part of the IP allocation algorithm. */
+bool ipalloc(struct ipalloc_state *ipalloc_state)
+{
+       bool ret;
+
+       switch (ipalloc_state->algorithm) {
+       case IPALLOC_LCP2:
+               ret = ipalloc_lcp2(ipalloc_state);
+               break;
+       case IPALLOC_DETERMINISTIC:
+               ret = ipalloc_deterministic(ipalloc_state);
+               break;
+       case IPALLOC_NONDETERMINISTIC:
+               ret = ipalloc_nondeterministic(ipalloc_state);
+               break;
+       }
+
+       /* at this point ->pnn is the node which will own each IP
+          or -1 if there is no node that can cover this ip
+       */
+
+       return ret;
+}
diff --git a/ctdb/server/ipalloc.h b/ctdb/server/ipalloc.h
new file mode 100644 (file)
index 0000000..65c7786
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+   CTDB IP takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2015
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_H__
+#define __CTDB_IPALLOC_H__
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "include/ctdb_protocol.h"
+
+struct public_ip_list {
+       struct public_ip_list *next;
+       uint32_t pnn;
+       ctdb_sock_addr addr;
+};
+
+#define IP_KEYLEN      4
+uint32_t *ip_key(ctdb_sock_addr *ip);
+
+/* Flags used in IP allocation algorithms. */
+enum ipalloc_algorithm {
+       IPALLOC_DETERMINISTIC,
+       IPALLOC_NONDETERMINISTIC,
+       IPALLOC_LCP2,
+};
+
+struct ipalloc_state {
+       uint32_t num;
+
+       /* Arrays with data for each node */
+       struct ctdb_public_ip_list_old **known_public_ips;
+       struct ctdb_public_ip_list_old **available_public_ips;
+       bool *noiptakeover;
+       bool *noiphost;
+
+       struct public_ip_list *all_ips;
+       enum ipalloc_algorithm algorithm;
+       uint32_t no_ip_failback;
+       uint32_t *force_rebalance_nodes;
+};
+
+bool ipalloc(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_H__ */
diff --git a/ctdb/server/ipalloc_common.c b/ctdb/server/ipalloc_common.c
new file mode 100644 (file)
index 0000000..9e4de59
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/time.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "common/common.h"
+#include "common/rb_tree.h"
+
+#include "include/ctdb_protocol.h"
+
+#include "server/ipalloc_private.h"
+
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
+
+/* Given a physical node, return the number of
+   public addresses that is currently assigned to this node.
+*/
+int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
+{
+       int num=0;
+
+       for (;ips;ips=ips->next) {
+               if (ips->pnn == pnn) {
+                       num++;
+               }
+       }
+       return num;
+}
+
+
+/* Can the given node host the given IP: is the public IP known to the
+ * node and is NOIPHOST unset?
+*/
+static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
+                            int32_t pnn,
+                            struct public_ip_list *ip)
+{
+       struct ctdb_public_ip_list_old *public_ips;
+       int i;
+
+       if (ipalloc_state->noiphost[pnn]) {
+               return false;
+       }
+
+       public_ips = ipalloc_state->available_public_ips[pnn];
+
+       if (public_ips == NULL) {
+               return false;
+       }
+
+       for (i=0; i<public_ips->num; i++) {
+               if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
+                       /* yes, this node can serve this public ip */
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+                         int32_t pnn,
+                         struct public_ip_list *ip)
+{
+       if (ipalloc_state->noiptakeover[pnn]) {
+               return false;
+       }
+
+       return can_node_host_ip(ipalloc_state, pnn, ip);
+}
+
+/* search the node lists list for a node to takeover this ip.
+   pick the node that currently are serving the least number of ips
+   so that the ips get spread out evenly.
+*/
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+                      struct public_ip_list *ip)
+{
+       int pnn, min=0, num;
+       int i, numnodes;
+
+       numnodes = ipalloc_state->num;
+       pnn    = -1;
+       for (i=0; i<numnodes; i++) {
+               /* verify that this node can serve this ip */
+               if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
+                       /* no it couldnt   so skip to the next node */
+                       continue;
+               }
+
+               num = node_ip_coverage(i, ipalloc_state->all_ips);
+               /* was this the first node we checked ? */
+               if (pnn == -1) {
+                       pnn = i;
+                       min  = num;
+               } else {
+                       if (num < min) {
+                               pnn = i;
+                               min  = num;
+                       }
+               }
+       }
+       if (pnn == -1) {
+               DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
+                       ctdb_addr_to_str(&ip->addr)));
+
+               return -1;
+       }
+
+       ip->pnn = pnn;
+       return 0;
+}
+
+uint32_t *ip_key(ctdb_sock_addr *ip)
+{
+       static uint32_t key[IP_KEYLEN];
+
+       bzero(key, sizeof(key));
+
+       switch (ip->sa.sa_family) {
+       case AF_INET:
+               key[3]  = htonl(ip->ip.sin_addr.s_addr);
+               break;
+       case AF_INET6: {
+               uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
+               key[0]  = htonl(s6_a32[0]);
+               key[1]  = htonl(s6_a32[1]);
+               key[2]  = htonl(s6_a32[2]);
+               key[3]  = htonl(s6_a32[3]);
+               break;
+       }
+       default:
+               DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
+               return key;
+       }
+
+       return key;
+}
+
+/* Allocate any unassigned IPs just by looping through the IPs and
+ * finding the best node for each.
+ */
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
+{
+       struct public_ip_list *t;
+
+       /* loop over all ip's and find a physical node to cover for
+          each unassigned ip.
+       */
+       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+               if (t->pnn == -1) {
+                       if (find_takeover_node(ipalloc_state, t)) {
+                               DEBUG(DEBUG_WARNING,
+                                     ("Failed to find node to cover ip %s\n",
+                                      ctdb_addr_to_str(&t->addr)));
+                       }
+               }
+       }
+}
+
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
+{
+       struct public_ip_list *t;
+
+       /* verify that the assigned nodes can serve that public ip
+          and set it to -1 if not
+       */
+       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+               if (t->pnn == -1) {
+                       continue;
+               }
+               if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) {
+                       /* this node can not serve this ip. */
+                       DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
+                                          ctdb_addr_to_str(&(t->addr)),
+                                          t->pnn));
+                       t->pnn = -1;
+               }
+       }
+}
diff --git a/ctdb/server/ipalloc_deterministic.c b/ctdb/server/ipalloc_deterministic.c
new file mode 100644 (file)
index 0000000..2801bf6
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "server/ipalloc_private.h"
+
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
+{
+       struct public_ip_list *t;
+       int i, numnodes;
+
+       numnodes = ipalloc_state->num;
+
+       DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+       /* Allocate IPs to nodes in a modulo fashion so that IPs will
+        *  always be allocated the same way for a specific set of
+        *  available/unavailable nodes.
+       */
+
+       for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
+               t->pnn = i % numnodes;
+       }
+
+       /* IP failback doesn't make sense with deterministic
+        * IPs, since the modulo step above implicitly fails
+        * back IPs to their "home" node.
+        */
+       if (1 == ipalloc_state->no_ip_failback) {
+               DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
+       }
+
+       unassign_unsuitable_ips(ipalloc_state);
+
+       basic_allocate_unassigned(ipalloc_state);
+
+       /* No failback here! */
+
+       return true;
+}
diff --git a/ctdb/server/ipalloc_lcp2.c b/ctdb/server/ipalloc_lcp2.c
new file mode 100644 (file)
index 0000000..0dd9364
--- /dev/null
@@ -0,0 +1,515 @@
+/*
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_api.h"
+
+#include "server/ipalloc_private.h"
+
+/*
+ * This is the length of the longtest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes.  The implementation means that all
+ * addresses end up being 128 bits long.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
+ */
+static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
+{
+       uint32_t ip1_k[IP_KEYLEN];
+       uint32_t *t;
+       int i;
+       uint32_t x;
+
+       uint32_t distance = 0;
+
+       memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+       t = ip_key(ip2);
+       for (i=0; i<IP_KEYLEN; i++) {
+               x = ip1_k[i] ^ t[i];
+               if (x == 0) {
+                       distance += 32;
+               } else {
+                       /* Count number of leading zeroes.
+                        * FIXME? This could be optimised...
+                        */
+                       while ((x & (1 << 31)) == 0) {
+                               x <<= 1;
+                               distance += 1;
+                       }
+               }
+       }
+
+       return distance;
+}
+
+/* Calculate the IP distance for the given IP relative to IPs on the
+   given node.  The ips argument is generally the all_ips variable
+   used in the main part of the algorithm.
+ */
+static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+                                 struct public_ip_list *ips,
+                                 int pnn)
+{
+       struct public_ip_list *t;
+       uint32_t d;
+
+       uint32_t sum = 0;
+
+       for (t = ips; t != NULL; t = t->next) {
+               if (t->pnn != pnn) {
+                       continue;
+               }
+
+               /* Optimisation: We never calculate the distance
+                * between an address and itself.  This allows us to
+                * calculate the effect of removing an address from a
+                * node by simply calculating the distance between
+                * that address and all of the exitsing addresses.
+                * Moreover, we assume that we're only ever dealing
+                * with addresses from all_ips so we can identify an
+                * address via a pointer rather than doing a more
+                * expensive address comparison. */
+               if (&(t->addr) == ip) {
+                       continue;
+               }
+
+               d = ip_distance(ip, &(t->addr));
+               sum += d * d;  /* Cheaper than pulling in math.h :-) */
+       }
+
+       return sum;
+}
+
+/* Return the LCP2 imbalance metric for addresses currently assigned
+   to the given node.
+ */
+static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
+{
+       struct public_ip_list *t;
+
+       uint32_t imbalance = 0;
+
+       for (t = all_ips; t != NULL; t = t->next) {
+               if (t->pnn != pnn) {
+                       continue;
+               }
+               /* Pass the rest of the IPs rather than the whole
+                  all_ips input list.
+               */
+               imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
+       }
+
+       return imbalance;
+}
+
+static bool lcp2_init(struct ipalloc_state *ipalloc_state,
+                     uint32_t **lcp2_imbalances,
+                     bool **rebalance_candidates)
+{
+       int i, numnodes;
+       struct public_ip_list *t;
+
+       numnodes = ipalloc_state->num;
+
+       *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
+       if (*rebalance_candidates == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return false;
+       }
+       *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
+       if (*lcp2_imbalances == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return false;
+       }
+
+       for (i=0; i<numnodes; i++) {
+               (*lcp2_imbalances)[i] =
+                       lcp2_imbalance(ipalloc_state->all_ips, i);
+               /* First step: assume all nodes are candidates */
+               (*rebalance_candidates)[i] = true;
+       }
+
+       /* 2nd step: if a node has IPs assigned then it must have been
+        * healthy before, so we remove it from consideration.  This
+        * is overkill but is all we have because we don't maintain
+        * state between takeover runs.  An alternative would be to
+        * keep state and invalidate it every time the recovery master
+        * changes.
+        */
+       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+               if (t->pnn != -1) {
+                       (*rebalance_candidates)[t->pnn] = false;
+               }
+       }
+
+       /* 3rd step: if a node is forced to re-balance then
+          we allow failback onto the node */
+       if (ipalloc_state->force_rebalance_nodes == NULL) {
+               return true;
+       }
+       for (i = 0;
+            i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
+            i++) {
+               uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
+               if (pnn >= numnodes) {
+                       DEBUG(DEBUG_ERR,
+                             (__location__ "unknown node %u\n", pnn));
+                       continue;
+               }
+
+               DEBUG(DEBUG_NOTICE,
+                     ("Forcing rebalancing of IPs to node %u\n", pnn));
+               (*rebalance_candidates)[pnn] = true;
+       }
+
+       return true;
+}
+
+/* Allocate any unassigned addresses using the LCP2 algorithm to find
+ * the IP/node combination that will cost the least.
+ */
+static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
+                                    uint32_t *lcp2_imbalances)
+{
+       struct public_ip_list *t;
+       int dstnode, numnodes;
+
+       int minnode;
+       uint32_t mindsum, dstdsum, dstimbl, minimbl;
+       struct public_ip_list *minip;
+
+       bool should_loop = true;
+       bool have_unassigned = true;
+
+       numnodes = ipalloc_state->num;
+
+       while (have_unassigned && should_loop) {
+               should_loop = false;
+
+               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+               DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+               minnode = -1;
+               mindsum = 0;
+               minip = NULL;
+
+               /* loop over each unassigned ip. */
+               for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
+                       if (t->pnn != -1) {
+                               continue;
+                       }
+
+                       for (dstnode = 0; dstnode < numnodes; dstnode++) {
+                               /* only check nodes that can actually takeover this ip */
+                               if (!can_node_takeover_ip(ipalloc_state,
+                                                         dstnode,
+                                                         t)) {
+                                       /* no it couldnt   so skip to the next node */
+                                       continue;
+                               }
+
+                               dstdsum = ip_distance_2_sum(&(t->addr),
+                                                           ipalloc_state->all_ips,
+                                                           dstnode);
+                               dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+                               DEBUG(DEBUG_DEBUG,
+                                     (" %s -> %d [+%d]\n",
+                                      ctdb_sock_addr_to_string(ipalloc_state,
+                                                               &(t->addr)),
+                                      dstnode,
+                                      dstimbl - lcp2_imbalances[dstnode]));
+
+
+                               if ((minnode == -1) || (dstdsum < mindsum)) {
+                                       minnode = dstnode;
+                                       minimbl = dstimbl;
+                                       mindsum = dstdsum;
+                                       minip = t;
+                                       should_loop = true;
+                               }
+                       }
+               }
+
+               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+               /* If we found one then assign it to the given node. */
+               if (minnode != -1) {
+                       minip->pnn = minnode;
+                       lcp2_imbalances[minnode] = minimbl;
+                       DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+                                         ctdb_sock_addr_to_string(
+                                                 ipalloc_state,
+                                                 &(minip->addr)),
+                                         minnode,
+                                         mindsum));
+               }
+
+               /* There might be a better way but at least this is clear. */
+               have_unassigned = false;
+               for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+                       if (t->pnn == -1) {
+                               have_unassigned = true;
+                       }
+               }
+       }
+
+       /* We know if we have an unassigned addresses so we might as
+        * well optimise.
+        */
+       if (have_unassigned) {
+               for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+                       if (t->pnn == -1) {
+                               DEBUG(DEBUG_WARNING,
+                                     ("Failed to find node to cover ip %s\n",
+                                      ctdb_sock_addr_to_string(ipalloc_state,
+                                                               &t->addr)));
+                       }
+               }
+       }
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.
+ */
+static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
+                                   int srcnode,
+                                   uint32_t *lcp2_imbalances,
+                                   bool *rebalance_candidates)
+{
+       int dstnode, mindstnode, numnodes;
+       uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
+       uint32_t minsrcimbl, mindstimbl;
+       struct public_ip_list *minip;
+       struct public_ip_list *t;
+
+       /* Find an IP and destination node that best reduces imbalance. */
+       srcimbl = 0;
+       minip = NULL;
+       minsrcimbl = 0;
+       mindstnode = -1;
+       mindstimbl = 0;
+
+       numnodes = ipalloc_state->num;
+
+       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+       DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+                          srcnode, lcp2_imbalances[srcnode]));
+
+       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+               /* Only consider addresses on srcnode. */
+               if (t->pnn != srcnode) {
+                       continue;
+               }
+
+               /* What is this IP address costing the source node? */
+               srcdsum = ip_distance_2_sum(&(t->addr),
+                                           ipalloc_state->all_ips,
+                                           srcnode);
+               srcimbl = lcp2_imbalances[srcnode] - srcdsum;
+
+               /* Consider this IP address would cost each potential
+                * destination node.  Destination nodes are limited to
+                * those that are newly healthy, since we don't want
+                * to do gratuitous failover of IPs just to make minor
+                * balance improvements.
+                */
+               for (dstnode = 0; dstnode < numnodes; dstnode++) {
+                       if (!rebalance_candidates[dstnode]) {
+                               continue;
+                       }
+
+                       /* only check nodes that can actually takeover this ip */
+                       if (!can_node_takeover_ip(ipalloc_state, dstnode,
+                                                 t)) {
+                               /* no it couldnt   so skip to the next node */
+                               continue;
+                       }
+
+                       dstdsum = ip_distance_2_sum(&(t->addr),
+                                                   ipalloc_state->all_ips,
+                                                   dstnode);
+                       dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+                       DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+                                          srcnode, -srcdsum,
+                                          ctdb_sock_addr_to_string(
+                                                  ipalloc_state, &(t->addr)),
+                                          dstnode, dstdsum));
+
+                       if ((dstimbl < lcp2_imbalances[srcnode]) &&
+                           (dstdsum < srcdsum) &&                      \
+                           ((mindstnode == -1) ||                              \
+                            ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+                               minip = t;
+                               minsrcimbl = srcimbl;
+                               mindstnode = dstnode;
+                               mindstimbl = dstimbl;
+                       }
+               }
+       }
+       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+        if (mindstnode != -1) {
+               /* We found a move that makes things better... */
+               DEBUG(DEBUG_INFO,
+                     ("%d [%d] -> %s -> %d [+%d]\n",
+                      srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+                      ctdb_sock_addr_to_string(ipalloc_state, &(minip->addr)),
+                      mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+
+               lcp2_imbalances[srcnode] = minsrcimbl;
+               lcp2_imbalances[mindstnode] = mindstimbl;
+               minip->pnn = mindstnode;
+
+               return true;
+       }
+
+        return false;
+}
+
+struct lcp2_imbalance_pnn {
+       uint32_t imbalance;
+       int pnn;
+};
+
+static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+       const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
+       const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
+
+       if (lipa->imbalance > lipb->imbalance) {
+               return -1;
+       } else if (lipa->imbalance == lipb->imbalance) {
+               return 0;
+       } else {
+               return 1;
+       }
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ */
+static void lcp2_failback(struct ipalloc_state *ipalloc_state,
+                         uint32_t *lcp2_imbalances,
+                         bool *rebalance_candidates)
+{
+       int i, numnodes;
+       struct lcp2_imbalance_pnn * lips;
+       bool again;
+
+       numnodes = ipalloc_state->num;
+
+try_again:
+       /* Put the imbalances and nodes into an array, sort them and
+        * iterate through candidates.  Usually the 1st one will be
+        * used, so this doesn't cost much...
+        */
+       DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
+       DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
+       lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
+       for (i = 0; i < numnodes; i++) {
+               lips[i].imbalance = lcp2_imbalances[i];
+               lips[i].pnn = i;
+               DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
+       }
+       qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
+             lcp2_cmp_imbalance_pnn);
+
+       again = false;
+       for (i = 0; i < numnodes; i++) {
+               /* This means that all nodes had 0 or 1 addresses, so
+                * can't be imbalanced.
+                */
+               if (lips[i].imbalance == 0) {
+                       break;
+               }
+
+               if (lcp2_failback_candidate(ipalloc_state,
+                                           lips[i].pnn,
+                                           lcp2_imbalances,
+                                           rebalance_candidates)) {
+                       again = true;
+                       break;
+               }
+       }
+
+       talloc_free(lips);
+       if (again) {
+               goto try_again;
+       }
+}
+
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
+{
+       uint32_t *lcp2_imbalances;
+       bool *rebalance_candidates;
+       int numnodes, num_rebalance_candidates, i;
+       bool ret = true;
+
+       unassign_unsuitable_ips(ipalloc_state);
+
+       if (!lcp2_init(ipalloc_state,
+                      &lcp2_imbalances, &rebalance_candidates)) {
+               ret = false;
+               goto finished;
+       }
+
+       lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
+
+       /* If we don't want IPs to fail back then don't rebalance IPs. */
+       if (1 == ipalloc_state->no_ip_failback) {
+               goto finished;
+       }
+
+       /* It is only worth continuing if we have suitable target
+        * nodes to transfer IPs to.  This check is much cheaper than
+        * continuing on...
+        */
+       numnodes = ipalloc_state->num;
+       num_rebalance_candidates = 0;
+       for (i=0; i<numnodes; i++) {
+               if (rebalance_candidates[i]) {
+                       num_rebalance_candidates++;
+               }
+       }
+       if (num_rebalance_candidates == 0) {
+               goto finished;
+       }
+
+       /* Now, try to make sure the ip adresses are evenly distributed
+          across the nodes.
+       */
+       lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
+
+finished:
+       return ret;
+}
diff --git a/ctdb/server/ipalloc_nondeterministic.c b/ctdb/server/ipalloc_nondeterministic.c
new file mode 100644 (file)
index 0000000..300d8f1
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+#include "common/common.h"
+
+#include "server/ipalloc_private.h"
+
+/* Basic non-deterministic rebalancing algorithm.
+ */
+static void basic_failback(struct ipalloc_state *ipalloc_state,
+                          int num_ips)
+{
+       int i, numnodes;
+       int maxnode, maxnum, minnode, minnum, num, retries;
+       struct public_ip_list *t;
+
+       numnodes = ipalloc_state->num;
+       retries = 0;
+
+try_again:
+       maxnum=0;
+       minnum=0;
+
+       /* for each ip address, loop over all nodes that can serve
+          this ip and make sure that the difference between the node
+          serving the most and the node serving the least ip's are
+          not greater than 1.
+       */
+       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+               if (t->pnn == -1) {
+                       continue;
+               }
+
+               /* Get the highest and lowest number of ips's served by any 
+                  valid node which can serve this ip.
+               */
+               maxnode = -1;
+               minnode = -1;
+               for (i=0; i<numnodes; i++) {
+                       /* only check nodes that can actually serve this ip */
+                       if (!can_node_takeover_ip(ipalloc_state, i,
+                                                 t)) {
+                               /* no it couldnt   so skip to the next node */
+                               continue;
+                       }
+
+                       num = node_ip_coverage(i, ipalloc_state->all_ips);
+                       if (maxnode == -1) {
+                               maxnode = i;
+                               maxnum  = num;
+                       } else {
+                               if (num > maxnum) {
+                                       maxnode = i;
+                                       maxnum  = num;
+                               }
+                       }
+                       if (minnode == -1) {
+                               minnode = i;
+                               minnum  = num;
+                       } else {
+                               if (num < minnum) {
+                                       minnode = i;
+                                       minnum  = num;
+                               }
+                       }
+               }
+               if (maxnode == -1) {
+                       DEBUG(DEBUG_WARNING,
+                             (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
+                              ctdb_addr_to_str(&t->addr)));
+
+                       continue;
+               }
+
+               /* if the spread between the smallest and largest coverage by
+                  a node is >=2 we steal one of the ips from the node with
+                  most coverage to even things out a bit.
+                  try to do this a limited number of times since we dont
+                  want to spend too much time balancing the ip coverage.
+               */
+               if ((maxnum > minnum+1) &&
+                   (retries < (num_ips + 5))){
+                       struct public_ip_list *tt;
+
+                       /* Reassign one of maxnode's VNNs */
+                       for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) {
+                               if (tt->pnn == maxnode) {
+                                       (void)find_takeover_node(ipalloc_state,
+                                                                tt);
+                                       retries++;
+                                       goto try_again;;
+                               }
+                       }
+               }
+       }
+}
+
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
+{
+       /* This should be pushed down into basic_failback. */
+       struct public_ip_list *t;
+       int num_ips = 0;
+       for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+               num_ips++;
+       }
+
+       unassign_unsuitable_ips(ipalloc_state);
+
+       basic_allocate_unassigned(ipalloc_state);
+
+       /* If we don't want IPs to fail back then don't rebalance IPs. */
+       if (1 == ipalloc_state->no_ip_failback) {
+               return true;
+       }
+
+       /* Now, try to make sure the ip adresses are evenly distributed
+          across the nodes.
+       */
+       basic_failback(ipalloc_state, num_ips);
+
+       return true;
+}
diff --git a/ctdb/server/ipalloc_private.h b/ctdb/server/ipalloc_private.h
new file mode 100644 (file)
index 0000000..3ffdeba
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+   CTDB IP takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2015
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_PRIVATE_H__
+#define __CTDB_IPALLOC_PRIVATE_H__
+
+#include "protocol/protocol.h"
+
+#include "server/ipalloc.h"
+
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+                         int32_t pnn,
+                         struct public_ip_list *ip);
+int node_ip_coverage(int32_t pnn, struct public_ip_list *ips);
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+                      struct public_ip_list *ip);
+
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state);
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state);
+
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_PRIVATE_H__ */
index 8298a4e10381342c91584b58ee95e2cb2d13b0df..018aa2a7a88f21c0c4e4a4cae5a43090d5766e33 100644 (file)
@@ -61,6 +61,11 @@ bool fast_start;
 #include "server/ctdb_ltdb_server.c"
 #include "server/ctdb_traverse.c"
 #include "server/eventscript.c"
+#include "server/ipalloc_common.c"
+#include "server/ipalloc_deterministic.c"
+#include "server/ipalloc_nondeterministic.c"
+#include "server/ipalloc_lcp2.c"
+#include "server/ipalloc.c"
 #include "server/ctdb_takeover.c"
 #include "server/ctdb_serverids.c"
 #include "server/ctdb_persistent.c"
index e7d2c49a5b98391e459701ae320916ee4286e1ac..0acb6ff20e091586239b85a4a973c8d5113f25f7 100755 (executable)
@@ -373,6 +373,17 @@ def build(bld):
                         includes='include',
                         deps='replace talloc tevent tdb tdb-wrap')
 
+    bld.SAMBA_SUBSYSTEM('ctdb-ipalloc',
+                        source=bld.SUBDIR('server',
+                                          '''ipalloc_deterministic.c
+                                             ipalloc_nondeterministic.c
+                                             ipalloc_lcp2.c
+                                             ipalloc_common.c
+                                             ipalloc.c
+                                          '''),
+                        includes='include',
+                        deps='ctdb-protocol replace talloc tevent')
+
     bld.SAMBA_SUBSYSTEM('ctdb-server',
                         source='server/ctdbd.c ' +
                                bld.SUBDIR('server',
@@ -393,7 +404,7 @@ def build(bld):
                                              ctdb_update_record.c
                                              ctdb_lock.c ctdb_fork.c'''),
                         includes='include',
-                        deps='replace popt talloc tevent tdb talloc_report')
+                        deps='ctdb-ipalloc replace popt talloc tevent tdb talloc_report')
 
     bld.SAMBA_BINARY('ctdbd',
                      source='',
@@ -664,7 +675,8 @@ def build(bld):
     bld.SAMBA_BINARY('ctdb_takeover_tests',
                      source='tests/src/ctdb_takeover_tests.c',
                      deps='''replace popt tdb tevent talloc ctdb-system
-                             samba-util tdb-wrap talloc_report''' +
+                             samba-util tdb-wrap talloc_report
+                             ctdb-protocol''' +
                           ib_deps,
                      includes='include',
                      install_path='${CTDB_TEST_LIBDIR}')