#include "common/common.h"
#include "common/logging.h"
+#include "server/ipalloc.h"
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT 3
-/* Flags used in IP allocation algorithms. */
-enum ipalloc_algorithm {
- IPALLOC_DETERMINISTIC,
- IPALLOC_NONDETERMINISTIC,
- IPALLOC_LCP2,
-};
-
-struct ipalloc_state {
- uint32_t num;
-
- /* Arrays with data for each node */
- struct ctdb_public_ip_list_old **known_public_ips;
- struct ctdb_public_ip_list_old **available_public_ips;
- bool *noiptakeover;
- bool *noiphost;
-
- struct public_ip_list *all_ips;
- enum ipalloc_algorithm algorithm;
- uint32_t no_ip_failback;
- uint32_t *force_rebalance_nodes;
-};
-
struct ctdb_interface {
struct ctdb_interface *prev, *next;
const char *name;
return 0;
}
-struct public_ip_list {
- struct public_ip_list *next;
- uint32_t pnn;
- ctdb_sock_addr addr;
-};
-
-/* Given a physical node, return the number of
- public addresses that is currently assigned to this node.
-*/
-static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
-{
- int num=0;
-
- for (;ips;ips=ips->next) {
- if (ips->pnn == pnn) {
- num++;
- }
- }
- return num;
-}
-
-
-/* Can the given node host the given IP: is the public IP known to the
- * node and is NOIPHOST unset?
-*/
-static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
- int32_t pnn,
- struct public_ip_list *ip)
-{
- struct ctdb_public_ip_list_old *public_ips;
- int i;
-
- if (ipalloc_state->noiphost[pnn]) {
- return false;
- }
-
- public_ips = ipalloc_state->available_public_ips[pnn];
-
- if (public_ips == NULL) {
- return false;
- }
-
- for (i=0; i<public_ips->num; i++) {
- if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
- /* yes, this node can serve this public ip */
- return true;
- }
- }
-
- return false;
-}
-
-static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
- int32_t pnn,
- struct public_ip_list *ip)
-{
- if (ipalloc_state->noiptakeover[pnn]) {
- return false;
- }
-
- return can_node_host_ip(ipalloc_state, pnn, ip);
-}
-
-/* search the node lists list for a node to takeover this ip.
- pick the node that currently are serving the least number of ips
- so that the ips get spread out evenly.
-*/
-static int find_takeover_node(struct ipalloc_state *ipalloc_state,
- struct public_ip_list *ip)
-{
- int pnn, min=0, num;
- int i, numnodes;
-
- numnodes = ipalloc_state->num;
- pnn = -1;
- for (i=0; i<numnodes; i++) {
- /* verify that this node can serve this ip */
- if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
- /* no it couldnt so skip to the next node */
- continue;
- }
-
- num = node_ip_coverage(i, ipalloc_state->all_ips);
- /* was this the first node we checked ? */
- if (pnn == -1) {
- pnn = i;
- min = num;
- } else {
- if (num < min) {
- pnn = i;
- min = num;
- }
- }
- }
- if (pnn == -1) {
- DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
- ctdb_addr_to_str(&ip->addr)));
-
- return -1;
- }
-
- ip->pnn = pnn;
- return 0;
-}
-
-#define IP_KEYLEN 4
-static uint32_t *ip_key(ctdb_sock_addr *ip)
-{
- static uint32_t key[IP_KEYLEN];
-
- bzero(key, sizeof(key));
-
- switch (ip->sa.sa_family) {
- case AF_INET:
- key[3] = htonl(ip->ip.sin_addr.s_addr);
- break;
- case AF_INET6: {
- uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
- key[0] = htonl(s6_a32[0]);
- key[1] = htonl(s6_a32[1]);
- key[2] = htonl(s6_a32[2]);
- key[3] = htonl(s6_a32[3]);
- break;
- }
- default:
- DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
- return key;
- }
-
- return key;
-}
-
static void *add_ip_callback(void *parm, void *data)
{
struct public_ip_list *this_ip = parm;
return ip_list;
}
-/*
- * This is the length of the longtest common prefix between the IPs.
- * It is calculated by XOR-ing the 2 IPs together and counting the
- * number of leading zeroes. The implementation means that all
- * addresses end up being 128 bits long.
- *
- * FIXME? Should we consider IPv4 and IPv6 separately given that the
- * 12 bytes of 0 prefix padding will hurt the algorithm if there are
- * lots of nodes and IP addresses?
- */
-static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
-{
- uint32_t ip1_k[IP_KEYLEN];
- uint32_t *t;
- int i;
- uint32_t x;
-
- uint32_t distance = 0;
-
- memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
- t = ip_key(ip2);
- for (i=0; i<IP_KEYLEN; i++) {
- x = ip1_k[i] ^ t[i];
- if (x == 0) {
- distance += 32;
- } else {
- /* Count number of leading zeroes.
- * FIXME? This could be optimised...
- */
- while ((x & (1 << 31)) == 0) {
- x <<= 1;
- distance += 1;
- }
- }
- }
-
- return distance;
-}
-
-/* Calculate the IP distance for the given IP relative to IPs on the
- given node. The ips argument is generally the all_ips variable
- used in the main part of the algorithm.
- */
-static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
- struct public_ip_list *ips,
- int pnn)
-{
- struct public_ip_list *t;
- uint32_t d;
-
- uint32_t sum = 0;
-
- for (t = ips; t != NULL; t = t->next) {
- if (t->pnn != pnn) {
- continue;
- }
-
- /* Optimisation: We never calculate the distance
- * between an address and itself. This allows us to
- * calculate the effect of removing an address from a
- * node by simply calculating the distance between
- * that address and all of the exitsing addresses.
- * Moreover, we assume that we're only ever dealing
- * with addresses from all_ips so we can identify an
- * address via a pointer rather than doing a more
- * expensive address comparison. */
- if (&(t->addr) == ip) {
- continue;
- }
-
- d = ip_distance(ip, &(t->addr));
- sum += d * d; /* Cheaper than pulling in math.h :-) */
- }
-
- return sum;
-}
-
-/* Return the LCP2 imbalance metric for addresses currently assigned
- to the given node.
- */
-static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
-{
- struct public_ip_list *t;
-
- uint32_t imbalance = 0;
-
- for (t = all_ips; t != NULL; t = t->next) {
- if (t->pnn != pnn) {
- continue;
- }
- /* Pass the rest of the IPs rather than the whole
- all_ips input list.
- */
- imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
- }
-
- return imbalance;
-}
-
-/* Allocate any unassigned IPs just by looping through the IPs and
- * finding the best node for each.
- */
-static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
-{
- struct public_ip_list *t;
-
- /* loop over all ip's and find a physical node to cover for
- each unassigned ip.
- */
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- if (t->pnn == -1) {
- if (find_takeover_node(ipalloc_state, t)) {
- DEBUG(DEBUG_WARNING,
- ("Failed to find node to cover ip %s\n",
- ctdb_addr_to_str(&t->addr)));
- }
- }
- }
-}
-
-/* Basic non-deterministic rebalancing algorithm.
- */
-static void basic_failback(struct ipalloc_state *ipalloc_state,
- int num_ips)
-{
- int i, numnodes;
- int maxnode, maxnum, minnode, minnum, num, retries;
- struct public_ip_list *t;
-
- numnodes = ipalloc_state->num;
- retries = 0;
-
-try_again:
- maxnum=0;
- minnum=0;
-
- /* for each ip address, loop over all nodes that can serve
- this ip and make sure that the difference between the node
- serving the most and the node serving the least ip's are
- not greater than 1.
- */
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- if (t->pnn == -1) {
- continue;
- }
-
- /* Get the highest and lowest number of ips's served by any
- valid node which can serve this ip.
- */
- maxnode = -1;
- minnode = -1;
- for (i=0; i<numnodes; i++) {
- /* only check nodes that can actually serve this ip */
- if (!can_node_takeover_ip(ipalloc_state, i,
- t)) {
- /* no it couldnt so skip to the next node */
- continue;
- }
-
- num = node_ip_coverage(i, ipalloc_state->all_ips);
- if (maxnode == -1) {
- maxnode = i;
- maxnum = num;
- } else {
- if (num > maxnum) {
- maxnode = i;
- maxnum = num;
- }
- }
- if (minnode == -1) {
- minnode = i;
- minnum = num;
- } else {
- if (num < minnum) {
- minnode = i;
- minnum = num;
- }
- }
- }
- if (maxnode == -1) {
- DEBUG(DEBUG_WARNING,
- (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
- ctdb_addr_to_str(&t->addr)));
-
- continue;
- }
-
- /* if the spread between the smallest and largest coverage by
- a node is >=2 we steal one of the ips from the node with
- most coverage to even things out a bit.
- try to do this a limited number of times since we dont
- want to spend too much time balancing the ip coverage.
- */
- if ((maxnum > minnum+1) &&
- (retries < (num_ips + 5))){
- struct public_ip_list *tt;
-
- /* Reassign one of maxnode's VNNs */
- for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) {
- if (tt->pnn == maxnode) {
- (void)find_takeover_node(ipalloc_state,
- tt);
- retries++;
- goto try_again;;
- }
- }
- }
- }
-}
-
-static bool lcp2_init(struct ipalloc_state *ipalloc_state,
- uint32_t **lcp2_imbalances,
- bool **rebalance_candidates)
-{
- int i, numnodes;
- struct public_ip_list *t;
-
- numnodes = ipalloc_state->num;
-
- *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
- if (*rebalance_candidates == NULL) {
- DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
- return false;
- }
- *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
- if (*lcp2_imbalances == NULL) {
- DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
- return false;
- }
-
- for (i=0; i<numnodes; i++) {
- (*lcp2_imbalances)[i] =
- lcp2_imbalance(ipalloc_state->all_ips, i);
- /* First step: assume all nodes are candidates */
- (*rebalance_candidates)[i] = true;
- }
-
- /* 2nd step: if a node has IPs assigned then it must have been
- * healthy before, so we remove it from consideration. This
- * is overkill but is all we have because we don't maintain
- * state between takeover runs. An alternative would be to
- * keep state and invalidate it every time the recovery master
- * changes.
- */
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- if (t->pnn != -1) {
- (*rebalance_candidates)[t->pnn] = false;
- }
- }
-
- /* 3rd step: if a node is forced to re-balance then
- we allow failback onto the node */
- if (ipalloc_state->force_rebalance_nodes == NULL) {
- return true;
- }
- for (i = 0;
- i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
- i++) {
- uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
- if (pnn >= numnodes) {
- DEBUG(DEBUG_ERR,
- (__location__ "unknown node %u\n", pnn));
- continue;
- }
-
- DEBUG(DEBUG_NOTICE,
- ("Forcing rebalancing of IPs to node %u\n", pnn));
- (*rebalance_candidates)[pnn] = true;
- }
-
- return true;
-}
-
-/* Allocate any unassigned addresses using the LCP2 algorithm to find
- * the IP/node combination that will cost the least.
- */
-static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
- uint32_t *lcp2_imbalances)
-{
- struct public_ip_list *t;
- int dstnode, numnodes;
-
- int minnode;
- uint32_t mindsum, dstdsum, dstimbl, minimbl;
- struct public_ip_list *minip;
-
- bool should_loop = true;
- bool have_unassigned = true;
-
- numnodes = ipalloc_state->num;
-
- while (have_unassigned && should_loop) {
- should_loop = false;
-
- DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
- DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
-
- minnode = -1;
- mindsum = 0;
- minip = NULL;
-
- /* loop over each unassigned ip. */
- for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
- if (t->pnn != -1) {
- continue;
- }
-
- for (dstnode = 0; dstnode < numnodes; dstnode++) {
- /* only check nodes that can actually takeover this ip */
- if (!can_node_takeover_ip(ipalloc_state,
- dstnode,
- t)) {
- /* no it couldnt so skip to the next node */
- continue;
- }
-
- dstdsum = ip_distance_2_sum(&(t->addr),
- ipalloc_state->all_ips,
- dstnode);
- dstimbl = lcp2_imbalances[dstnode] + dstdsum;
- DEBUG(DEBUG_DEBUG,
- (" %s -> %d [+%d]\n",
- ctdb_addr_to_str(&(t->addr)),
- dstnode,
- dstimbl - lcp2_imbalances[dstnode]));
-
-
- if ((minnode == -1) || (dstdsum < mindsum)) {
- minnode = dstnode;
- minimbl = dstimbl;
- mindsum = dstdsum;
- minip = t;
- should_loop = true;
- }
- }
- }
-
- DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-
- /* If we found one then assign it to the given node. */
- if (minnode != -1) {
- minip->pnn = minnode;
- lcp2_imbalances[minnode] = minimbl;
- DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
- ctdb_addr_to_str(&(minip->addr)),
- minnode,
- mindsum));
- }
-
- /* There might be a better way but at least this is clear. */
- have_unassigned = false;
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- if (t->pnn == -1) {
- have_unassigned = true;
- }
- }
- }
-
- /* We know if we have an unassigned addresses so we might as
- * well optimise.
- */
- if (have_unassigned) {
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- if (t->pnn == -1) {
- DEBUG(DEBUG_WARNING,
- ("Failed to find node to cover ip %s\n",
- ctdb_addr_to_str(&t->addr)));
- }
- }
- }
-}
-
-/* LCP2 algorithm for rebalancing the cluster. Given a candidate node
- * to move IPs from, determines the best IP/destination node
- * combination to move from the source node.
- */
-static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
- int srcnode,
- uint32_t *lcp2_imbalances,
- bool *rebalance_candidates)
-{
- int dstnode, mindstnode, numnodes;
- uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
- uint32_t minsrcimbl, mindstimbl;
- struct public_ip_list *minip;
- struct public_ip_list *t;
-
- /* Find an IP and destination node that best reduces imbalance. */
- srcimbl = 0;
- minip = NULL;
- minsrcimbl = 0;
- mindstnode = -1;
- mindstimbl = 0;
-
- numnodes = ipalloc_state->num;
-
- DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
- DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
- srcnode, lcp2_imbalances[srcnode]));
-
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- /* Only consider addresses on srcnode. */
- if (t->pnn != srcnode) {
- continue;
- }
-
- /* What is this IP address costing the source node? */
- srcdsum = ip_distance_2_sum(&(t->addr),
- ipalloc_state->all_ips,
- srcnode);
- srcimbl = lcp2_imbalances[srcnode] - srcdsum;
-
- /* Consider this IP address would cost each potential
- * destination node. Destination nodes are limited to
- * those that are newly healthy, since we don't want
- * to do gratuitous failover of IPs just to make minor
- * balance improvements.
- */
- for (dstnode = 0; dstnode < numnodes; dstnode++) {
- if (!rebalance_candidates[dstnode]) {
- continue;
- }
-
- /* only check nodes that can actually takeover this ip */
- if (!can_node_takeover_ip(ipalloc_state, dstnode,
- t)) {
- /* no it couldnt so skip to the next node */
- continue;
- }
-
- dstdsum = ip_distance_2_sum(&(t->addr),
- ipalloc_state->all_ips,
- dstnode);
- dstimbl = lcp2_imbalances[dstnode] + dstdsum;
- DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
- srcnode, -srcdsum,
- ctdb_addr_to_str(&(t->addr)),
- dstnode, dstdsum));
-
- if ((dstimbl < lcp2_imbalances[srcnode]) &&
- (dstdsum < srcdsum) && \
- ((mindstnode == -1) || \
- ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
-
- minip = t;
- minsrcimbl = srcimbl;
- mindstnode = dstnode;
- mindstimbl = dstimbl;
- }
- }
- }
- DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-
- if (mindstnode != -1) {
- /* We found a move that makes things better... */
- DEBUG(DEBUG_INFO,
- ("%d [%d] -> %s -> %d [+%d]\n",
- srcnode, minsrcimbl - lcp2_imbalances[srcnode],
- ctdb_addr_to_str(&(minip->addr)),
- mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
-
-
- lcp2_imbalances[srcnode] = minsrcimbl;
- lcp2_imbalances[mindstnode] = mindstimbl;
- minip->pnn = mindstnode;
-
- return true;
- }
-
- return false;
-
-}
-
-struct lcp2_imbalance_pnn {
- uint32_t imbalance;
- int pnn;
-};
-
-static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
-{
- const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
- const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
-
- if (lipa->imbalance > lipb->imbalance) {
- return -1;
- } else if (lipa->imbalance == lipb->imbalance) {
- return 0;
- } else {
- return 1;
- }
-}
-
-/* LCP2 algorithm for rebalancing the cluster. This finds the source
- * node with the highest LCP2 imbalance, and then determines the best
- * IP/destination node combination to move from the source node.
- */
-static void lcp2_failback(struct ipalloc_state *ipalloc_state,
- uint32_t *lcp2_imbalances,
- bool *rebalance_candidates)
-{
- int i, numnodes;
- struct lcp2_imbalance_pnn * lips;
- bool again;
-
- numnodes = ipalloc_state->num;
-
-try_again:
- /* Put the imbalances and nodes into an array, sort them and
- * iterate through candidates. Usually the 1st one will be
- * used, so this doesn't cost much...
- */
- DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
- DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
- lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
- for (i = 0; i < numnodes; i++) {
- lips[i].imbalance = lcp2_imbalances[i];
- lips[i].pnn = i;
- DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
- }
- qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
- lcp2_cmp_imbalance_pnn);
-
- again = false;
- for (i = 0; i < numnodes; i++) {
- /* This means that all nodes had 0 or 1 addresses, so
- * can't be imbalanced.
- */
- if (lips[i].imbalance == 0) {
- break;
- }
-
- if (lcp2_failback_candidate(ipalloc_state,
- lips[i].pnn,
- lcp2_imbalances,
- rebalance_candidates)) {
- again = true;
- break;
- }
- }
-
- talloc_free(lips);
- if (again) {
- goto try_again;
- }
-}
-
-static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
-{
- struct public_ip_list *t;
-
- /* verify that the assigned nodes can serve that public ip
- and set it to -1 if not
- */
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- if (t->pnn == -1) {
- continue;
- }
- if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) {
- /* this node can not serve this ip. */
- DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
- ctdb_addr_to_str(&(t->addr)),
- t->pnn));
- t->pnn = -1;
- }
- }
-}
-
-static bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
-{
- struct public_ip_list *t;
- int i, numnodes;
-
- numnodes = ipalloc_state->num;
-
- DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
- /* Allocate IPs to nodes in a modulo fashion so that IPs will
- * always be allocated the same way for a specific set of
- * available/unavailable nodes.
- */
-
- for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
- t->pnn = i % numnodes;
- }
-
- /* IP failback doesn't make sense with deterministic
- * IPs, since the modulo step above implicitly fails
- * back IPs to their "home" node.
- */
- if (1 == ipalloc_state->no_ip_failback) {
- DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
- }
-
- unassign_unsuitable_ips(ipalloc_state);
-
- basic_allocate_unassigned(ipalloc_state);
-
- /* No failback here! */
-
- return true;
-}
-
-static bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
-{
- /* This should be pushed down into basic_failback. */
- struct public_ip_list *t;
- int num_ips = 0;
- for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
- num_ips++;
- }
-
- unassign_unsuitable_ips(ipalloc_state);
-
- basic_allocate_unassigned(ipalloc_state);
-
- /* If we don't want IPs to fail back then don't rebalance IPs. */
- if (1 == ipalloc_state->no_ip_failback) {
- return true;
- }
-
- /* Now, try to make sure the ip adresses are evenly distributed
- across the nodes.
- */
- basic_failback(ipalloc_state, num_ips);
-
- return true;
-}
-
-static bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
-{
- uint32_t *lcp2_imbalances;
- bool *rebalance_candidates;
- int numnodes, num_rebalance_candidates, i;
- bool ret = true;
-
- unassign_unsuitable_ips(ipalloc_state);
-
- if (!lcp2_init(ipalloc_state,
- &lcp2_imbalances, &rebalance_candidates)) {
- ret = false;
- goto finished;
- }
-
- lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
-
- /* If we don't want IPs to fail back then don't rebalance IPs. */
- if (1 == ipalloc_state->no_ip_failback) {
- goto finished;
- }
-
- /* It is only worth continuing if we have suitable target
- * nodes to transfer IPs to. This check is much cheaper than
- * continuing on...
- */
- numnodes = ipalloc_state->num;
- num_rebalance_candidates = 0;
- for (i=0; i<numnodes; i++) {
- if (rebalance_candidates[i]) {
- num_rebalance_candidates++;
- }
- }
- if (num_rebalance_candidates == 0) {
- goto finished;
- }
-
- /* Now, try to make sure the ip adresses are evenly distributed
- across the nodes.
- */
- lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
-
-finished:
- return ret;
-}
-
static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
{
int i;
return true;
}
-/* The calculation part of the IP allocation algorithm. */
-static bool ipalloc(struct ipalloc_state *ipalloc_state)
-{
- bool ret;
-
- switch (ipalloc_state->algorithm) {
- case IPALLOC_LCP2:
- ret = ipalloc_lcp2(ipalloc_state);
- break;
- case IPALLOC_DETERMINISTIC:
- ret = ipalloc_deterministic(ipalloc_state);
- break;
- case IPALLOC_NONDETERMINISTIC:
- ret = ipalloc_nondeterministic(ipalloc_state);
- break;
- }
-
- /* at this point ->pnn is the node which will own each IP
- or -1 if there is no node that can cover this ip
- */
-
- return ret;
-}
-
struct get_tunable_callback_data {
const char *tunable;
uint32_t *out;
--- /dev/null
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+
+#include "common/logging.h"
+
+#include "server/ipalloc_private.h"
+
+/* The calculation part of the IP allocation algorithm. */
+bool ipalloc(struct ipalloc_state *ipalloc_state)
+{
+ bool ret;
+
+ switch (ipalloc_state->algorithm) {
+ case IPALLOC_LCP2:
+ ret = ipalloc_lcp2(ipalloc_state);
+ break;
+ case IPALLOC_DETERMINISTIC:
+ ret = ipalloc_deterministic(ipalloc_state);
+ break;
+ case IPALLOC_NONDETERMINISTIC:
+ ret = ipalloc_nondeterministic(ipalloc_state);
+ break;
+ }
+
+ /* at this point ->pnn is the node which will own each IP
+ or -1 if there is no node that can cover this ip
+ */
+
+ return ret;
+}
--- /dev/null
+/*
+ CTDB IP takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_H__
+#define __CTDB_IPALLOC_H__
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "include/ctdb_protocol.h"
+
+struct public_ip_list {
+ struct public_ip_list *next;
+ uint32_t pnn;
+ ctdb_sock_addr addr;
+};
+
+#define IP_KEYLEN 4
+uint32_t *ip_key(ctdb_sock_addr *ip);
+
+/* Flags used in IP allocation algorithms. */
+enum ipalloc_algorithm {
+ IPALLOC_DETERMINISTIC,
+ IPALLOC_NONDETERMINISTIC,
+ IPALLOC_LCP2,
+};
+
+struct ipalloc_state {
+ uint32_t num;
+
+ /* Arrays with data for each node */
+ struct ctdb_public_ip_list_old **known_public_ips;
+ struct ctdb_public_ip_list_old **available_public_ips;
+ bool *noiptakeover;
+ bool *noiphost;
+
+ struct public_ip_list *all_ips;
+ enum ipalloc_algorithm algorithm;
+ uint32_t no_ip_failback;
+ uint32_t *force_rebalance_nodes;
+};
+
+bool ipalloc(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_H__ */
--- /dev/null
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/time.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "common/common.h"
+#include "common/rb_tree.h"
+
+#include "include/ctdb_protocol.h"
+
+#include "server/ipalloc_private.h"
+
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
+
+/* Given a physical node, return the number of
+ public addresses that is currently assigned to this node.
+*/
+int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
+{
+ int num=0;
+
+ for (;ips;ips=ips->next) {
+ if (ips->pnn == pnn) {
+ num++;
+ }
+ }
+ return num;
+}
+
+
+/* Can the given node host the given IP: is the public IP known to the
+ * node and is NOIPHOST unset?
+*/
+static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
+ int32_t pnn,
+ struct public_ip_list *ip)
+{
+ struct ctdb_public_ip_list_old *public_ips;
+ int i;
+
+ if (ipalloc_state->noiphost[pnn]) {
+ return false;
+ }
+
+ public_ips = ipalloc_state->available_public_ips[pnn];
+
+ if (public_ips == NULL) {
+ return false;
+ }
+
+ for (i=0; i<public_ips->num; i++) {
+ if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
+ /* yes, this node can serve this public ip */
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+ int32_t pnn,
+ struct public_ip_list *ip)
+{
+ if (ipalloc_state->noiptakeover[pnn]) {
+ return false;
+ }
+
+ return can_node_host_ip(ipalloc_state, pnn, ip);
+}
+
+/* search the node lists list for a node to takeover this ip.
+ pick the node that currently are serving the least number of ips
+ so that the ips get spread out evenly.
+*/
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+ struct public_ip_list *ip)
+{
+ int pnn, min=0, num;
+ int i, numnodes;
+
+ numnodes = ipalloc_state->num;
+ pnn = -1;
+ for (i=0; i<numnodes; i++) {
+ /* verify that this node can serve this ip */
+ if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ num = node_ip_coverage(i, ipalloc_state->all_ips);
+ /* was this the first node we checked ? */
+ if (pnn == -1) {
+ pnn = i;
+ min = num;
+ } else {
+ if (num < min) {
+ pnn = i;
+ min = num;
+ }
+ }
+ }
+ if (pnn == -1) {
+ DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
+ ctdb_addr_to_str(&ip->addr)));
+
+ return -1;
+ }
+
+ ip->pnn = pnn;
+ return 0;
+}
+
+uint32_t *ip_key(ctdb_sock_addr *ip)
+{
+ static uint32_t key[IP_KEYLEN];
+
+ bzero(key, sizeof(key));
+
+ switch (ip->sa.sa_family) {
+ case AF_INET:
+ key[3] = htonl(ip->ip.sin_addr.s_addr);
+ break;
+ case AF_INET6: {
+ uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
+ key[0] = htonl(s6_a32[0]);
+ key[1] = htonl(s6_a32[1]);
+ key[2] = htonl(s6_a32[2]);
+ key[3] = htonl(s6_a32[3]);
+ break;
+ }
+ default:
+ DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
+ return key;
+ }
+
+ return key;
+}
+
+/* Allocate any unassigned IPs just by looping through the IPs and
+ * finding the best node for each.
+ */
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
+{
+ struct public_ip_list *t;
+
+ /* loop over all ip's and find a physical node to cover for
+ each unassigned ip.
+ */
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == -1) {
+ if (find_takeover_node(ipalloc_state, t)) {
+ DEBUG(DEBUG_WARNING,
+ ("Failed to find node to cover ip %s\n",
+ ctdb_addr_to_str(&t->addr)));
+ }
+ }
+ }
+}
+
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
+{
+ struct public_ip_list *t;
+
+ /* verify that the assigned nodes can serve that public ip
+ and set it to -1 if not
+ */
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == -1) {
+ continue;
+ }
+ if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) {
+ /* this node can not serve this ip. */
+ DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
+ ctdb_addr_to_str(&(t->addr)),
+ t->pnn));
+ t->pnn = -1;
+ }
+ }
+}
--- /dev/null
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "server/ipalloc_private.h"
+
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
+{
+ struct public_ip_list *t;
+ int i, numnodes;
+
+ numnodes = ipalloc_state->num;
+
+ DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+ /* Allocate IPs to nodes in a modulo fashion so that IPs will
+ * always be allocated the same way for a specific set of
+ * available/unavailable nodes.
+ */
+
+ for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
+ t->pnn = i % numnodes;
+ }
+
+ /* IP failback doesn't make sense with deterministic
+ * IPs, since the modulo step above implicitly fails
+ * back IPs to their "home" node.
+ */
+ if (1 == ipalloc_state->no_ip_failback) {
+ DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
+ }
+
+ unassign_unsuitable_ips(ipalloc_state);
+
+ basic_allocate_unassigned(ipalloc_state);
+
+ /* No failback here! */
+
+ return true;
+}
--- /dev/null
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+
+#include "protocol/protocol_api.h"
+
+#include "server/ipalloc_private.h"
+
+/*
+ * This is the length of the longtest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes. The implementation means that all
+ * addresses end up being 128 bits long.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
+ */
+static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
+{
+ uint32_t ip1_k[IP_KEYLEN];
+ uint32_t *t;
+ int i;
+ uint32_t x;
+
+ uint32_t distance = 0;
+
+ memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+ t = ip_key(ip2);
+ for (i=0; i<IP_KEYLEN; i++) {
+ x = ip1_k[i] ^ t[i];
+ if (x == 0) {
+ distance += 32;
+ } else {
+ /* Count number of leading zeroes.
+ * FIXME? This could be optimised...
+ */
+ while ((x & (1 << 31)) == 0) {
+ x <<= 1;
+ distance += 1;
+ }
+ }
+ }
+
+ return distance;
+}
+
+/* Calculate the IP distance for the given IP relative to IPs on the
+ given node. The ips argument is generally the all_ips variable
+ used in the main part of the algorithm.
+ */
+static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+ struct public_ip_list *ips,
+ int pnn)
+{
+ struct public_ip_list *t;
+ uint32_t d;
+
+ uint32_t sum = 0;
+
+ for (t = ips; t != NULL; t = t->next) {
+ if (t->pnn != pnn) {
+ continue;
+ }
+
+ /* Optimisation: We never calculate the distance
+ * between an address and itself. This allows us to
+ * calculate the effect of removing an address from a
+ * node by simply calculating the distance between
+ * that address and all of the exitsing addresses.
+ * Moreover, we assume that we're only ever dealing
+ * with addresses from all_ips so we can identify an
+ * address via a pointer rather than doing a more
+ * expensive address comparison. */
+ if (&(t->addr) == ip) {
+ continue;
+ }
+
+ d = ip_distance(ip, &(t->addr));
+ sum += d * d; /* Cheaper than pulling in math.h :-) */
+ }
+
+ return sum;
+}
+
+/* Return the LCP2 imbalance metric for addresses currently assigned
+ to the given node.
+ */
+static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
+{
+ struct public_ip_list *t;
+
+ uint32_t imbalance = 0;
+
+ for (t = all_ips; t != NULL; t = t->next) {
+ if (t->pnn != pnn) {
+ continue;
+ }
+ /* Pass the rest of the IPs rather than the whole
+ all_ips input list.
+ */
+ imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
+ }
+
+ return imbalance;
+}
+
+static bool lcp2_init(struct ipalloc_state *ipalloc_state,
+ uint32_t **lcp2_imbalances,
+ bool **rebalance_candidates)
+{
+ int i, numnodes;
+ struct public_ip_list *t;
+
+ numnodes = ipalloc_state->num;
+
+ *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
+ if (*rebalance_candidates == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ return false;
+ }
+ *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
+ if (*lcp2_imbalances == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+ return false;
+ }
+
+ for (i=0; i<numnodes; i++) {
+ (*lcp2_imbalances)[i] =
+ lcp2_imbalance(ipalloc_state->all_ips, i);
+ /* First step: assume all nodes are candidates */
+ (*rebalance_candidates)[i] = true;
+ }
+
+ /* 2nd step: if a node has IPs assigned then it must have been
+ * healthy before, so we remove it from consideration. This
+ * is overkill but is all we have because we don't maintain
+ * state between takeover runs. An alternative would be to
+ * keep state and invalidate it every time the recovery master
+ * changes.
+ */
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn != -1) {
+ (*rebalance_candidates)[t->pnn] = false;
+ }
+ }
+
+ /* 3rd step: if a node is forced to re-balance then
+ we allow failback onto the node */
+ if (ipalloc_state->force_rebalance_nodes == NULL) {
+ return true;
+ }
+ for (i = 0;
+ i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
+ i++) {
+ uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
+ if (pnn >= numnodes) {
+ DEBUG(DEBUG_ERR,
+ (__location__ "unknown node %u\n", pnn));
+ continue;
+ }
+
+ DEBUG(DEBUG_NOTICE,
+ ("Forcing rebalancing of IPs to node %u\n", pnn));
+ (*rebalance_candidates)[pnn] = true;
+ }
+
+ return true;
+}
+
+/* Allocate any unassigned addresses using the LCP2 algorithm to find
+ * the IP/node combination that will cost the least.
+ */
+static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
+ uint32_t *lcp2_imbalances)
+{
+ struct public_ip_list *t;
+ int dstnode, numnodes;
+
+ int minnode;
+ uint32_t mindsum, dstdsum, dstimbl, minimbl;
+ struct public_ip_list *minip;
+
+ bool should_loop = true;
+ bool have_unassigned = true;
+
+ numnodes = ipalloc_state->num;
+
+ while (have_unassigned && should_loop) {
+ should_loop = false;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+ minnode = -1;
+ mindsum = 0;
+ minip = NULL;
+
+ /* loop over each unassigned ip. */
+ for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
+ if (t->pnn != -1) {
+ continue;
+ }
+
+ for (dstnode = 0; dstnode < numnodes; dstnode++) {
+ /* only check nodes that can actually takeover this ip */
+ if (!can_node_takeover_ip(ipalloc_state,
+ dstnode,
+ t)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(t->addr),
+ ipalloc_state->all_ips,
+ dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,
+ (" %s -> %d [+%d]\n",
+ ctdb_sock_addr_to_string(ipalloc_state,
+ &(t->addr)),
+ dstnode,
+ dstimbl - lcp2_imbalances[dstnode]));
+
+
+ if ((minnode == -1) || (dstdsum < mindsum)) {
+ minnode = dstnode;
+ minimbl = dstimbl;
+ mindsum = dstdsum;
+ minip = t;
+ should_loop = true;
+ }
+ }
+ }
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ /* If we found one then assign it to the given node. */
+ if (minnode != -1) {
+ minip->pnn = minnode;
+ lcp2_imbalances[minnode] = minimbl;
+ DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+ ctdb_sock_addr_to_string(
+ ipalloc_state,
+ &(minip->addr)),
+ minnode,
+ mindsum));
+ }
+
+ /* There might be a better way but at least this is clear. */
+ have_unassigned = false;
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == -1) {
+ have_unassigned = true;
+ }
+ }
+ }
+
+ /* We know if we have an unassigned addresses so we might as
+ * well optimise.
+ */
+ if (have_unassigned) {
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == -1) {
+ DEBUG(DEBUG_WARNING,
+ ("Failed to find node to cover ip %s\n",
+ ctdb_sock_addr_to_string(ipalloc_state,
+ &t->addr)));
+ }
+ }
+ }
+}
+
+/* LCP2 algorithm for rebalancing the cluster. Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.
+ */
+static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
+ int srcnode,
+ uint32_t *lcp2_imbalances,
+ bool *rebalance_candidates)
+{
+ int dstnode, mindstnode, numnodes;
+ uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
+ uint32_t minsrcimbl, mindstimbl;
+ struct public_ip_list *minip;
+ struct public_ip_list *t;
+
+ /* Find an IP and destination node that best reduces imbalance. */
+ srcimbl = 0;
+ minip = NULL;
+ minsrcimbl = 0;
+ mindstnode = -1;
+ mindstimbl = 0;
+
+ numnodes = ipalloc_state->num;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+ srcnode, lcp2_imbalances[srcnode]));
+
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ /* Only consider addresses on srcnode. */
+ if (t->pnn != srcnode) {
+ continue;
+ }
+
+ /* What is this IP address costing the source node? */
+ srcdsum = ip_distance_2_sum(&(t->addr),
+ ipalloc_state->all_ips,
+ srcnode);
+ srcimbl = lcp2_imbalances[srcnode] - srcdsum;
+
+ /* Consider this IP address would cost each potential
+ * destination node. Destination nodes are limited to
+ * those that are newly healthy, since we don't want
+ * to do gratuitous failover of IPs just to make minor
+ * balance improvements.
+ */
+ for (dstnode = 0; dstnode < numnodes; dstnode++) {
+ if (!rebalance_candidates[dstnode]) {
+ continue;
+ }
+
+ /* only check nodes that can actually takeover this ip */
+ if (!can_node_takeover_ip(ipalloc_state, dstnode,
+ t)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(t->addr),
+ ipalloc_state->all_ips,
+ dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+ srcnode, -srcdsum,
+ ctdb_sock_addr_to_string(
+ ipalloc_state, &(t->addr)),
+ dstnode, dstdsum));
+
+ if ((dstimbl < lcp2_imbalances[srcnode]) &&
+ (dstdsum < srcdsum) && \
+ ((mindstnode == -1) || \
+ ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+ minip = t;
+ minsrcimbl = srcimbl;
+ mindstnode = dstnode;
+ mindstimbl = dstimbl;
+ }
+ }
+ }
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ if (mindstnode != -1) {
+ /* We found a move that makes things better... */
+ DEBUG(DEBUG_INFO,
+ ("%d [%d] -> %s -> %d [+%d]\n",
+ srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+ ctdb_sock_addr_to_string(ipalloc_state, &(minip->addr)),
+ mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+
+ lcp2_imbalances[srcnode] = minsrcimbl;
+ lcp2_imbalances[mindstnode] = mindstimbl;
+ minip->pnn = mindstnode;
+
+ return true;
+ }
+
+ return false;
+}
+
+struct lcp2_imbalance_pnn {
+ uint32_t imbalance;
+ int pnn;
+};
+
+static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+ const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
+ const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
+
+ if (lipa->imbalance > lipb->imbalance) {
+ return -1;
+ } else if (lipa->imbalance == lipb->imbalance) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+/* LCP2 algorithm for rebalancing the cluster. This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ */
+static void lcp2_failback(struct ipalloc_state *ipalloc_state,
+ uint32_t *lcp2_imbalances,
+ bool *rebalance_candidates)
+{
+ int i, numnodes;
+ struct lcp2_imbalance_pnn * lips;
+ bool again;
+
+ numnodes = ipalloc_state->num;
+
+try_again:
+ /* Put the imbalances and nodes into an array, sort them and
+ * iterate through candidates. Usually the 1st one will be
+ * used, so this doesn't cost much...
+ */
+ DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
+ DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
+ lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
+ for (i = 0; i < numnodes; i++) {
+ lips[i].imbalance = lcp2_imbalances[i];
+ lips[i].pnn = i;
+ DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
+ }
+ qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
+ lcp2_cmp_imbalance_pnn);
+
+ again = false;
+ for (i = 0; i < numnodes; i++) {
+ /* This means that all nodes had 0 or 1 addresses, so
+ * can't be imbalanced.
+ */
+ if (lips[i].imbalance == 0) {
+ break;
+ }
+
+ if (lcp2_failback_candidate(ipalloc_state,
+ lips[i].pnn,
+ lcp2_imbalances,
+ rebalance_candidates)) {
+ again = true;
+ break;
+ }
+ }
+
+ talloc_free(lips);
+ if (again) {
+ goto try_again;
+ }
+}
+
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
+{
+ uint32_t *lcp2_imbalances;
+ bool *rebalance_candidates;
+ int numnodes, num_rebalance_candidates, i;
+ bool ret = true;
+
+ unassign_unsuitable_ips(ipalloc_state);
+
+ if (!lcp2_init(ipalloc_state,
+ &lcp2_imbalances, &rebalance_candidates)) {
+ ret = false;
+ goto finished;
+ }
+
+ lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
+
+ /* If we don't want IPs to fail back then don't rebalance IPs. */
+ if (1 == ipalloc_state->no_ip_failback) {
+ goto finished;
+ }
+
+ /* It is only worth continuing if we have suitable target
+ * nodes to transfer IPs to. This check is much cheaper than
+ * continuing on...
+ */
+ numnodes = ipalloc_state->num;
+ num_rebalance_candidates = 0;
+ for (i=0; i<numnodes; i++) {
+ if (rebalance_candidates[i]) {
+ num_rebalance_candidates++;
+ }
+ }
+ if (num_rebalance_candidates == 0) {
+ goto finished;
+ }
+
+ /* Now, try to make sure the ip adresses are evenly distributed
+ across the nodes.
+ */
+ lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
+
+finished:
+ return ret;
+}
--- /dev/null
+/*
+ ctdb ip takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/network.h"
+
+#include "ctdb_private.h"
+
+#include "lib/util/debug.h"
+#include "common/logging.h"
+#include "common/common.h"
+
+#include "server/ipalloc_private.h"
+
+/* Basic non-deterministic rebalancing algorithm.
+ */
+static void basic_failback(struct ipalloc_state *ipalloc_state,
+ int num_ips)
+{
+ int i, numnodes;
+ int maxnode, maxnum, minnode, minnum, num, retries;
+ struct public_ip_list *t;
+
+ numnodes = ipalloc_state->num;
+ retries = 0;
+
+try_again:
+ maxnum=0;
+ minnum=0;
+
+ /* for each ip address, loop over all nodes that can serve
+ this ip and make sure that the difference between the node
+ serving the most and the node serving the least ip's are
+ not greater than 1.
+ */
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ if (t->pnn == -1) {
+ continue;
+ }
+
+ /* Get the highest and lowest number of ips's served by any
+ valid node which can serve this ip.
+ */
+ maxnode = -1;
+ minnode = -1;
+ for (i=0; i<numnodes; i++) {
+ /* only check nodes that can actually serve this ip */
+ if (!can_node_takeover_ip(ipalloc_state, i,
+ t)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ num = node_ip_coverage(i, ipalloc_state->all_ips);
+ if (maxnode == -1) {
+ maxnode = i;
+ maxnum = num;
+ } else {
+ if (num > maxnum) {
+ maxnode = i;
+ maxnum = num;
+ }
+ }
+ if (minnode == -1) {
+ minnode = i;
+ minnum = num;
+ } else {
+ if (num < minnum) {
+ minnode = i;
+ minnum = num;
+ }
+ }
+ }
+ if (maxnode == -1) {
+ DEBUG(DEBUG_WARNING,
+ (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
+ ctdb_addr_to_str(&t->addr)));
+
+ continue;
+ }
+
+ /* if the spread between the smallest and largest coverage by
+ a node is >=2 we steal one of the ips from the node with
+ most coverage to even things out a bit.
+ try to do this a limited number of times since we dont
+ want to spend too much time balancing the ip coverage.
+ */
+ if ((maxnum > minnum+1) &&
+ (retries < (num_ips + 5))){
+ struct public_ip_list *tt;
+
+ /* Reassign one of maxnode's VNNs */
+ for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) {
+ if (tt->pnn == maxnode) {
+ (void)find_takeover_node(ipalloc_state,
+ tt);
+ retries++;
+ goto try_again;;
+ }
+ }
+ }
+ }
+}
+
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
+{
+ /* This should be pushed down into basic_failback. */
+ struct public_ip_list *t;
+ int num_ips = 0;
+ for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
+ num_ips++;
+ }
+
+ unassign_unsuitable_ips(ipalloc_state);
+
+ basic_allocate_unassigned(ipalloc_state);
+
+ /* If we don't want IPs to fail back then don't rebalance IPs. */
+ if (1 == ipalloc_state->no_ip_failback) {
+ return true;
+ }
+
+ /* Now, try to make sure the ip adresses are evenly distributed
+ across the nodes.
+ */
+ basic_failback(ipalloc_state, num_ips);
+
+ return true;
+}
--- /dev/null
+/*
+ CTDB IP takeover code
+
+ Copyright (C) Ronnie Sahlberg 2007
+ Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2015
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __CTDB_IPALLOC_PRIVATE_H__
+#define __CTDB_IPALLOC_PRIVATE_H__
+
+#include "protocol/protocol.h"
+
+#include "server/ipalloc.h"
+
+bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
+ int32_t pnn,
+ struct public_ip_list *ip);
+int node_ip_coverage(int32_t pnn, struct public_ip_list *ips);
+int find_takeover_node(struct ipalloc_state *ipalloc_state,
+ struct public_ip_list *ip);
+
+void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state);
+void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state);
+
+bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state);
+bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state);
+
+#endif /* __CTDB_IPALLOC_PRIVATE_H__ */
#include "server/ctdb_ltdb_server.c"
#include "server/ctdb_traverse.c"
#include "server/eventscript.c"
+#include "server/ipalloc_common.c"
+#include "server/ipalloc_deterministic.c"
+#include "server/ipalloc_nondeterministic.c"
+#include "server/ipalloc_lcp2.c"
+#include "server/ipalloc.c"
#include "server/ctdb_takeover.c"
#include "server/ctdb_serverids.c"
#include "server/ctdb_persistent.c"
includes='include',
deps='replace talloc tevent tdb tdb-wrap')
+ bld.SAMBA_SUBSYSTEM('ctdb-ipalloc',
+ source=bld.SUBDIR('server',
+ '''ipalloc_deterministic.c
+ ipalloc_nondeterministic.c
+ ipalloc_lcp2.c
+ ipalloc_common.c
+ ipalloc.c
+ '''),
+ includes='include',
+ deps='ctdb-protocol replace talloc tevent')
+
bld.SAMBA_SUBSYSTEM('ctdb-server',
source='server/ctdbd.c ' +
bld.SUBDIR('server',
ctdb_update_record.c
ctdb_lock.c ctdb_fork.c'''),
includes='include',
- deps='replace popt talloc tevent tdb talloc_report')
+ deps='ctdb-ipalloc replace popt talloc tevent tdb talloc_report')
bld.SAMBA_BINARY('ctdbd',
source='',
bld.SAMBA_BINARY('ctdb_takeover_tests',
source='tests/src/ctdb_takeover_tests.c',
deps='''replace popt tdb tevent talloc ctdb-system
- samba-util tdb-wrap talloc_report''' +
+ samba-util tdb-wrap talloc_report
+ ctdb-protocol''' +
ib_deps,
includes='include',
install_path='${CTDB_TEST_LIBDIR}')