From e320725f0206c56f5fe8b8b580d677c2aa56ca47 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 23 Nov 2015 16:18:16 +1100 Subject: [PATCH] ctdb-ipalloc: Split IP allocation into its own build subsystem Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/server/ctdb_takeover.c | 852 +------------------------ ctdb/server/ipalloc.c | 53 ++ ctdb/server/ipalloc.h | 63 ++ ctdb/server/ipalloc_common.c | 206 ++++++ ctdb/server/ipalloc_deterministic.c | 62 ++ ctdb/server/ipalloc_lcp2.c | 515 +++++++++++++++ ctdb/server/ipalloc_nondeterministic.c | 147 +++++ ctdb/server/ipalloc_private.h | 43 ++ ctdb/tests/src/ctdbd_test.c | 5 + ctdb/wscript | 16 +- 10 files changed, 1109 insertions(+), 853 deletions(-) create mode 100644 ctdb/server/ipalloc.c create mode 100644 ctdb/server/ipalloc.h create mode 100644 ctdb/server/ipalloc_common.c create mode 100644 ctdb/server/ipalloc_deterministic.c create mode 100644 ctdb/server/ipalloc_lcp2.c create mode 100644 ctdb/server/ipalloc_nondeterministic.c create mode 100644 ctdb/server/ipalloc_private.h diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c index 227bd162d44..c49009cf22d 100644 --- a/ctdb/server/ctdb_takeover.c +++ b/ctdb/server/ctdb_takeover.c @@ -41,34 +41,13 @@ #include "common/common.h" #include "common/logging.h" +#include "server/ipalloc.h" #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0) #define CTDB_ARP_INTERVAL 1 #define CTDB_ARP_REPEAT 3 -/* Flags used in IP allocation algorithms. */ -enum ipalloc_algorithm { - IPALLOC_DETERMINISTIC, - IPALLOC_NONDETERMINISTIC, - IPALLOC_LCP2, -}; - -struct ipalloc_state { - uint32_t num; - - /* Arrays with data for each node */ - struct ctdb_public_ip_list_old **known_public_ips; - struct ctdb_public_ip_list_old **available_public_ips; - bool *noiptakeover; - bool *noiphost; - - struct public_ip_list *all_ips; - enum ipalloc_algorithm algorithm; - uint32_t no_ip_failback; - uint32_t *force_rebalance_nodes; -}; - struct ctdb_interface { struct ctdb_interface *prev, *next; const char *name; @@ -1249,138 +1228,6 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb, return 0; } -struct public_ip_list { - struct public_ip_list *next; - uint32_t pnn; - ctdb_sock_addr addr; -}; - -/* Given a physical node, return the number of - public addresses that is currently assigned to this node. -*/ -static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips) -{ - int num=0; - - for (;ips;ips=ips->next) { - if (ips->pnn == pnn) { - num++; - } - } - return num; -} - - -/* Can the given node host the given IP: is the public IP known to the - * node and is NOIPHOST unset? -*/ -static bool can_node_host_ip(struct ipalloc_state *ipalloc_state, - int32_t pnn, - struct public_ip_list *ip) -{ - struct ctdb_public_ip_list_old *public_ips; - int i; - - if (ipalloc_state->noiphost[pnn]) { - return false; - } - - public_ips = ipalloc_state->available_public_ips[pnn]; - - if (public_ips == NULL) { - return false; - } - - for (i=0; inum; i++) { - if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) { - /* yes, this node can serve this public ip */ - return true; - } - } - - return false; -} - -static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state, - int32_t pnn, - struct public_ip_list *ip) -{ - if (ipalloc_state->noiptakeover[pnn]) { - return false; - } - - return can_node_host_ip(ipalloc_state, pnn, ip); -} - -/* search the node lists list for a node to takeover this ip. - pick the node that currently are serving the least number of ips - so that the ips get spread out evenly. -*/ -static int find_takeover_node(struct ipalloc_state *ipalloc_state, - struct public_ip_list *ip) -{ - int pnn, min=0, num; - int i, numnodes; - - numnodes = ipalloc_state->num; - pnn = -1; - for (i=0; iall_ips); - /* was this the first node we checked ? */ - if (pnn == -1) { - pnn = i; - min = num; - } else { - if (num < min) { - pnn = i; - min = num; - } - } - } - if (pnn == -1) { - DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n", - ctdb_addr_to_str(&ip->addr))); - - return -1; - } - - ip->pnn = pnn; - return 0; -} - -#define IP_KEYLEN 4 -static uint32_t *ip_key(ctdb_sock_addr *ip) -{ - static uint32_t key[IP_KEYLEN]; - - bzero(key, sizeof(key)); - - switch (ip->sa.sa_family) { - case AF_INET: - key[3] = htonl(ip->ip.sin_addr.s_addr); - break; - case AF_INET6: { - uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr); - key[0] = htonl(s6_a32[0]); - key[1] = htonl(s6_a32[1]); - key[2] = htonl(s6_a32[2]); - key[3] = htonl(s6_a32[3]); - break; - } - default: - DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family)); - return key; - } - - return key; -} - static void *add_ip_callback(void *parm, void *data) { struct public_ip_list *this_ip = parm; @@ -1518,679 +1365,6 @@ create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_s return ip_list; } -/* - * This is the length of the longtest common prefix between the IPs. - * It is calculated by XOR-ing the 2 IPs together and counting the - * number of leading zeroes. The implementation means that all - * addresses end up being 128 bits long. - * - * FIXME? Should we consider IPv4 and IPv6 separately given that the - * 12 bytes of 0 prefix padding will hurt the algorithm if there are - * lots of nodes and IP addresses? - */ -static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2) -{ - uint32_t ip1_k[IP_KEYLEN]; - uint32_t *t; - int i; - uint32_t x; - - uint32_t distance = 0; - - memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k)); - t = ip_key(ip2); - for (i=0; inext) { - if (t->pnn != pnn) { - continue; - } - - /* Optimisation: We never calculate the distance - * between an address and itself. This allows us to - * calculate the effect of removing an address from a - * node by simply calculating the distance between - * that address and all of the exitsing addresses. - * Moreover, we assume that we're only ever dealing - * with addresses from all_ips so we can identify an - * address via a pointer rather than doing a more - * expensive address comparison. */ - if (&(t->addr) == ip) { - continue; - } - - d = ip_distance(ip, &(t->addr)); - sum += d * d; /* Cheaper than pulling in math.h :-) */ - } - - return sum; -} - -/* Return the LCP2 imbalance metric for addresses currently assigned - to the given node. - */ -static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn) -{ - struct public_ip_list *t; - - uint32_t imbalance = 0; - - for (t = all_ips; t != NULL; t = t->next) { - if (t->pnn != pnn) { - continue; - } - /* Pass the rest of the IPs rather than the whole - all_ips input list. - */ - imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn); - } - - return imbalance; -} - -/* Allocate any unassigned IPs just by looping through the IPs and - * finding the best node for each. - */ -static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state) -{ - struct public_ip_list *t; - - /* loop over all ip's and find a physical node to cover for - each unassigned ip. - */ - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - if (t->pnn == -1) { - if (find_takeover_node(ipalloc_state, t)) { - DEBUG(DEBUG_WARNING, - ("Failed to find node to cover ip %s\n", - ctdb_addr_to_str(&t->addr))); - } - } - } -} - -/* Basic non-deterministic rebalancing algorithm. - */ -static void basic_failback(struct ipalloc_state *ipalloc_state, - int num_ips) -{ - int i, numnodes; - int maxnode, maxnum, minnode, minnum, num, retries; - struct public_ip_list *t; - - numnodes = ipalloc_state->num; - retries = 0; - -try_again: - maxnum=0; - minnum=0; - - /* for each ip address, loop over all nodes that can serve - this ip and make sure that the difference between the node - serving the most and the node serving the least ip's are - not greater than 1. - */ - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - if (t->pnn == -1) { - continue; - } - - /* Get the highest and lowest number of ips's served by any - valid node which can serve this ip. - */ - maxnode = -1; - minnode = -1; - for (i=0; iall_ips); - if (maxnode == -1) { - maxnode = i; - maxnum = num; - } else { - if (num > maxnum) { - maxnode = i; - maxnum = num; - } - } - if (minnode == -1) { - minnode = i; - minnum = num; - } else { - if (num < minnum) { - minnode = i; - minnum = num; - } - } - } - if (maxnode == -1) { - DEBUG(DEBUG_WARNING, - (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n", - ctdb_addr_to_str(&t->addr))); - - continue; - } - - /* if the spread between the smallest and largest coverage by - a node is >=2 we steal one of the ips from the node with - most coverage to even things out a bit. - try to do this a limited number of times since we dont - want to spend too much time balancing the ip coverage. - */ - if ((maxnum > minnum+1) && - (retries < (num_ips + 5))){ - struct public_ip_list *tt; - - /* Reassign one of maxnode's VNNs */ - for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) { - if (tt->pnn == maxnode) { - (void)find_takeover_node(ipalloc_state, - tt); - retries++; - goto try_again;; - } - } - } - } -} - -static bool lcp2_init(struct ipalloc_state *ipalloc_state, - uint32_t **lcp2_imbalances, - bool **rebalance_candidates) -{ - int i, numnodes; - struct public_ip_list *t; - - numnodes = ipalloc_state->num; - - *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes); - if (*rebalance_candidates == NULL) { - DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); - return false; - } - *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes); - if (*lcp2_imbalances == NULL) { - DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); - return false; - } - - for (i=0; iall_ips, i); - /* First step: assume all nodes are candidates */ - (*rebalance_candidates)[i] = true; - } - - /* 2nd step: if a node has IPs assigned then it must have been - * healthy before, so we remove it from consideration. This - * is overkill but is all we have because we don't maintain - * state between takeover runs. An alternative would be to - * keep state and invalidate it every time the recovery master - * changes. - */ - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - if (t->pnn != -1) { - (*rebalance_candidates)[t->pnn] = false; - } - } - - /* 3rd step: if a node is forced to re-balance then - we allow failback onto the node */ - if (ipalloc_state->force_rebalance_nodes == NULL) { - return true; - } - for (i = 0; - i < talloc_array_length(ipalloc_state->force_rebalance_nodes); - i++) { - uint32_t pnn = ipalloc_state->force_rebalance_nodes[i]; - if (pnn >= numnodes) { - DEBUG(DEBUG_ERR, - (__location__ "unknown node %u\n", pnn)); - continue; - } - - DEBUG(DEBUG_NOTICE, - ("Forcing rebalancing of IPs to node %u\n", pnn)); - (*rebalance_candidates)[pnn] = true; - } - - return true; -} - -/* Allocate any unassigned addresses using the LCP2 algorithm to find - * the IP/node combination that will cost the least. - */ -static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state, - uint32_t *lcp2_imbalances) -{ - struct public_ip_list *t; - int dstnode, numnodes; - - int minnode; - uint32_t mindsum, dstdsum, dstimbl, minimbl; - struct public_ip_list *minip; - - bool should_loop = true; - bool have_unassigned = true; - - numnodes = ipalloc_state->num; - - while (have_unassigned && should_loop) { - should_loop = false; - - DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); - DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n")); - - minnode = -1; - mindsum = 0; - minip = NULL; - - /* loop over each unassigned ip. */ - for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) { - if (t->pnn != -1) { - continue; - } - - for (dstnode = 0; dstnode < numnodes; dstnode++) { - /* only check nodes that can actually takeover this ip */ - if (!can_node_takeover_ip(ipalloc_state, - dstnode, - t)) { - /* no it couldnt so skip to the next node */ - continue; - } - - dstdsum = ip_distance_2_sum(&(t->addr), - ipalloc_state->all_ips, - dstnode); - dstimbl = lcp2_imbalances[dstnode] + dstdsum; - DEBUG(DEBUG_DEBUG, - (" %s -> %d [+%d]\n", - ctdb_addr_to_str(&(t->addr)), - dstnode, - dstimbl - lcp2_imbalances[dstnode])); - - - if ((minnode == -1) || (dstdsum < mindsum)) { - minnode = dstnode; - minimbl = dstimbl; - mindsum = dstdsum; - minip = t; - should_loop = true; - } - } - } - - DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); - - /* If we found one then assign it to the given node. */ - if (minnode != -1) { - minip->pnn = minnode; - lcp2_imbalances[minnode] = minimbl; - DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n", - ctdb_addr_to_str(&(minip->addr)), - minnode, - mindsum)); - } - - /* There might be a better way but at least this is clear. */ - have_unassigned = false; - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - if (t->pnn == -1) { - have_unassigned = true; - } - } - } - - /* We know if we have an unassigned addresses so we might as - * well optimise. - */ - if (have_unassigned) { - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - if (t->pnn == -1) { - DEBUG(DEBUG_WARNING, - ("Failed to find node to cover ip %s\n", - ctdb_addr_to_str(&t->addr))); - } - } - } -} - -/* LCP2 algorithm for rebalancing the cluster. Given a candidate node - * to move IPs from, determines the best IP/destination node - * combination to move from the source node. - */ -static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state, - int srcnode, - uint32_t *lcp2_imbalances, - bool *rebalance_candidates) -{ - int dstnode, mindstnode, numnodes; - uint32_t srcimbl, srcdsum, dstimbl, dstdsum; - uint32_t minsrcimbl, mindstimbl; - struct public_ip_list *minip; - struct public_ip_list *t; - - /* Find an IP and destination node that best reduces imbalance. */ - srcimbl = 0; - minip = NULL; - minsrcimbl = 0; - mindstnode = -1; - mindstimbl = 0; - - numnodes = ipalloc_state->num; - - DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); - DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", - srcnode, lcp2_imbalances[srcnode])); - - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - /* Only consider addresses on srcnode. */ - if (t->pnn != srcnode) { - continue; - } - - /* What is this IP address costing the source node? */ - srcdsum = ip_distance_2_sum(&(t->addr), - ipalloc_state->all_ips, - srcnode); - srcimbl = lcp2_imbalances[srcnode] - srcdsum; - - /* Consider this IP address would cost each potential - * destination node. Destination nodes are limited to - * those that are newly healthy, since we don't want - * to do gratuitous failover of IPs just to make minor - * balance improvements. - */ - for (dstnode = 0; dstnode < numnodes; dstnode++) { - if (!rebalance_candidates[dstnode]) { - continue; - } - - /* only check nodes that can actually takeover this ip */ - if (!can_node_takeover_ip(ipalloc_state, dstnode, - t)) { - /* no it couldnt so skip to the next node */ - continue; - } - - dstdsum = ip_distance_2_sum(&(t->addr), - ipalloc_state->all_ips, - dstnode); - dstimbl = lcp2_imbalances[dstnode] + dstdsum; - DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n", - srcnode, -srcdsum, - ctdb_addr_to_str(&(t->addr)), - dstnode, dstdsum)); - - if ((dstimbl < lcp2_imbalances[srcnode]) && - (dstdsum < srcdsum) && \ - ((mindstnode == -1) || \ - ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) { - - minip = t; - minsrcimbl = srcimbl; - mindstnode = dstnode; - mindstimbl = dstimbl; - } - } - } - DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); - - if (mindstnode != -1) { - /* We found a move that makes things better... */ - DEBUG(DEBUG_INFO, - ("%d [%d] -> %s -> %d [+%d]\n", - srcnode, minsrcimbl - lcp2_imbalances[srcnode], - ctdb_addr_to_str(&(minip->addr)), - mindstnode, mindstimbl - lcp2_imbalances[mindstnode])); - - - lcp2_imbalances[srcnode] = minsrcimbl; - lcp2_imbalances[mindstnode] = mindstimbl; - minip->pnn = mindstnode; - - return true; - } - - return false; - -} - -struct lcp2_imbalance_pnn { - uint32_t imbalance; - int pnn; -}; - -static int lcp2_cmp_imbalance_pnn(const void * a, const void * b) -{ - const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a; - const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b; - - if (lipa->imbalance > lipb->imbalance) { - return -1; - } else if (lipa->imbalance == lipb->imbalance) { - return 0; - } else { - return 1; - } -} - -/* LCP2 algorithm for rebalancing the cluster. This finds the source - * node with the highest LCP2 imbalance, and then determines the best - * IP/destination node combination to move from the source node. - */ -static void lcp2_failback(struct ipalloc_state *ipalloc_state, - uint32_t *lcp2_imbalances, - bool *rebalance_candidates) -{ - int i, numnodes; - struct lcp2_imbalance_pnn * lips; - bool again; - - numnodes = ipalloc_state->num; - -try_again: - /* Put the imbalances and nodes into an array, sort them and - * iterate through candidates. Usually the 1st one will be - * used, so this doesn't cost much... - */ - DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n")); - DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n")); - lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes); - for (i = 0; i < numnodes; i++) { - lips[i].imbalance = lcp2_imbalances[i]; - lips[i].pnn = i; - DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i])); - } - qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn), - lcp2_cmp_imbalance_pnn); - - again = false; - for (i = 0; i < numnodes; i++) { - /* This means that all nodes had 0 or 1 addresses, so - * can't be imbalanced. - */ - if (lips[i].imbalance == 0) { - break; - } - - if (lcp2_failback_candidate(ipalloc_state, - lips[i].pnn, - lcp2_imbalances, - rebalance_candidates)) { - again = true; - break; - } - } - - talloc_free(lips); - if (again) { - goto try_again; - } -} - -static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state) -{ - struct public_ip_list *t; - - /* verify that the assigned nodes can serve that public ip - and set it to -1 if not - */ - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - if (t->pnn == -1) { - continue; - } - if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) { - /* this node can not serve this ip. */ - DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n", - ctdb_addr_to_str(&(t->addr)), - t->pnn)); - t->pnn = -1; - } - } -} - -static bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state) -{ - struct public_ip_list *t; - int i, numnodes; - - numnodes = ipalloc_state->num; - - DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n")); - /* Allocate IPs to nodes in a modulo fashion so that IPs will - * always be allocated the same way for a specific set of - * available/unavailable nodes. - */ - - for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) { - t->pnn = i % numnodes; - } - - /* IP failback doesn't make sense with deterministic - * IPs, since the modulo step above implicitly fails - * back IPs to their "home" node. - */ - if (1 == ipalloc_state->no_ip_failback) { - DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n")); - } - - unassign_unsuitable_ips(ipalloc_state); - - basic_allocate_unassigned(ipalloc_state); - - /* No failback here! */ - - return true; -} - -static bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state) -{ - /* This should be pushed down into basic_failback. */ - struct public_ip_list *t; - int num_ips = 0; - for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { - num_ips++; - } - - unassign_unsuitable_ips(ipalloc_state); - - basic_allocate_unassigned(ipalloc_state); - - /* If we don't want IPs to fail back then don't rebalance IPs. */ - if (1 == ipalloc_state->no_ip_failback) { - return true; - } - - /* Now, try to make sure the ip adresses are evenly distributed - across the nodes. - */ - basic_failback(ipalloc_state, num_ips); - - return true; -} - -static bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state) -{ - uint32_t *lcp2_imbalances; - bool *rebalance_candidates; - int numnodes, num_rebalance_candidates, i; - bool ret = true; - - unassign_unsuitable_ips(ipalloc_state); - - if (!lcp2_init(ipalloc_state, - &lcp2_imbalances, &rebalance_candidates)) { - ret = false; - goto finished; - } - - lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances); - - /* If we don't want IPs to fail back then don't rebalance IPs. */ - if (1 == ipalloc_state->no_ip_failback) { - goto finished; - } - - /* It is only worth continuing if we have suitable target - * nodes to transfer IPs to. This check is much cheaper than - * continuing on... - */ - numnodes = ipalloc_state->num; - num_rebalance_candidates = 0; - for (i=0; ialgorithm) { - case IPALLOC_LCP2: - ret = ipalloc_lcp2(ipalloc_state); - break; - case IPALLOC_DETERMINISTIC: - ret = ipalloc_deterministic(ipalloc_state); - break; - case IPALLOC_NONDETERMINISTIC: - ret = ipalloc_nondeterministic(ipalloc_state); - break; - } - - /* at this point ->pnn is the node which will own each IP - or -1 if there is no node that can cover this ip - */ - - return ret; -} - struct get_tunable_callback_data { const char *tunable; uint32_t *out; diff --git a/ctdb/server/ipalloc.c b/ctdb/server/ipalloc.c new file mode 100644 index 00000000000..9b5579c7c69 --- /dev/null +++ b/ctdb/server/ipalloc.c @@ -0,0 +1,53 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . +*/ + +#include "replace.h" +#include "system/network.h" + +#include "lib/util/debug.h" + +#include "common/logging.h" + +#include "server/ipalloc_private.h" + +/* The calculation part of the IP allocation algorithm. */ +bool ipalloc(struct ipalloc_state *ipalloc_state) +{ + bool ret; + + switch (ipalloc_state->algorithm) { + case IPALLOC_LCP2: + ret = ipalloc_lcp2(ipalloc_state); + break; + case IPALLOC_DETERMINISTIC: + ret = ipalloc_deterministic(ipalloc_state); + break; + case IPALLOC_NONDETERMINISTIC: + ret = ipalloc_nondeterministic(ipalloc_state); + break; + } + + /* at this point ->pnn is the node which will own each IP + or -1 if there is no node that can cover this ip + */ + + return ret; +} diff --git a/ctdb/server/ipalloc.h b/ctdb/server/ipalloc.h new file mode 100644 index 00000000000..65c778656f0 --- /dev/null +++ b/ctdb/server/ipalloc.h @@ -0,0 +1,63 @@ +/* + CTDB IP takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2015 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . +*/ + +#ifndef __CTDB_IPALLOC_H__ +#define __CTDB_IPALLOC_H__ + +#include "replace.h" +#include "system/network.h" + +#include "include/ctdb_protocol.h" + +struct public_ip_list { + struct public_ip_list *next; + uint32_t pnn; + ctdb_sock_addr addr; +}; + +#define IP_KEYLEN 4 +uint32_t *ip_key(ctdb_sock_addr *ip); + +/* Flags used in IP allocation algorithms. */ +enum ipalloc_algorithm { + IPALLOC_DETERMINISTIC, + IPALLOC_NONDETERMINISTIC, + IPALLOC_LCP2, +}; + +struct ipalloc_state { + uint32_t num; + + /* Arrays with data for each node */ + struct ctdb_public_ip_list_old **known_public_ips; + struct ctdb_public_ip_list_old **available_public_ips; + bool *noiptakeover; + bool *noiphost; + + struct public_ip_list *all_ips; + enum ipalloc_algorithm algorithm; + uint32_t no_ip_failback; + uint32_t *force_rebalance_nodes; +}; + +bool ipalloc(struct ipalloc_state *ipalloc_state); + +#endif /* __CTDB_IPALLOC_H__ */ diff --git a/ctdb/server/ipalloc_common.c b/ctdb/server/ipalloc_common.c new file mode 100644 index 00000000000..9e4de59834a --- /dev/null +++ b/ctdb/server/ipalloc_common.c @@ -0,0 +1,206 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . +*/ + +#include "replace.h" +#include "system/network.h" + +#include "ctdb_private.h" + +#include "lib/util/time.h" + +#include "lib/util/debug.h" +#include "common/logging.h" + +#include "common/common.h" +#include "common/rb_tree.h" + +#include "include/ctdb_protocol.h" + +#include "server/ipalloc_private.h" + +#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0) + +/* Given a physical node, return the number of + public addresses that is currently assigned to this node. +*/ +int node_ip_coverage(int32_t pnn, struct public_ip_list *ips) +{ + int num=0; + + for (;ips;ips=ips->next) { + if (ips->pnn == pnn) { + num++; + } + } + return num; +} + + +/* Can the given node host the given IP: is the public IP known to the + * node and is NOIPHOST unset? +*/ +static bool can_node_host_ip(struct ipalloc_state *ipalloc_state, + int32_t pnn, + struct public_ip_list *ip) +{ + struct ctdb_public_ip_list_old *public_ips; + int i; + + if (ipalloc_state->noiphost[pnn]) { + return false; + } + + public_ips = ipalloc_state->available_public_ips[pnn]; + + if (public_ips == NULL) { + return false; + } + + for (i=0; inum; i++) { + if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) { + /* yes, this node can serve this public ip */ + return true; + } + } + + return false; +} + +bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state, + int32_t pnn, + struct public_ip_list *ip) +{ + if (ipalloc_state->noiptakeover[pnn]) { + return false; + } + + return can_node_host_ip(ipalloc_state, pnn, ip); +} + +/* search the node lists list for a node to takeover this ip. + pick the node that currently are serving the least number of ips + so that the ips get spread out evenly. +*/ +int find_takeover_node(struct ipalloc_state *ipalloc_state, + struct public_ip_list *ip) +{ + int pnn, min=0, num; + int i, numnodes; + + numnodes = ipalloc_state->num; + pnn = -1; + for (i=0; iall_ips); + /* was this the first node we checked ? */ + if (pnn == -1) { + pnn = i; + min = num; + } else { + if (num < min) { + pnn = i; + min = num; + } + } + } + if (pnn == -1) { + DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n", + ctdb_addr_to_str(&ip->addr))); + + return -1; + } + + ip->pnn = pnn; + return 0; +} + +uint32_t *ip_key(ctdb_sock_addr *ip) +{ + static uint32_t key[IP_KEYLEN]; + + bzero(key, sizeof(key)); + + switch (ip->sa.sa_family) { + case AF_INET: + key[3] = htonl(ip->ip.sin_addr.s_addr); + break; + case AF_INET6: { + uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr); + key[0] = htonl(s6_a32[0]); + key[1] = htonl(s6_a32[1]); + key[2] = htonl(s6_a32[2]); + key[3] = htonl(s6_a32[3]); + break; + } + default: + DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family)); + return key; + } + + return key; +} + +/* Allocate any unassigned IPs just by looping through the IPs and + * finding the best node for each. + */ +void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state) +{ + struct public_ip_list *t; + + /* loop over all ip's and find a physical node to cover for + each unassigned ip. + */ + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn == -1) { + if (find_takeover_node(ipalloc_state, t)) { + DEBUG(DEBUG_WARNING, + ("Failed to find node to cover ip %s\n", + ctdb_addr_to_str(&t->addr))); + } + } + } +} + +void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state) +{ + struct public_ip_list *t; + + /* verify that the assigned nodes can serve that public ip + and set it to -1 if not + */ + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn == -1) { + continue; + } + if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) { + /* this node can not serve this ip. */ + DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n", + ctdb_addr_to_str(&(t->addr)), + t->pnn)); + t->pnn = -1; + } + } +} diff --git a/ctdb/server/ipalloc_deterministic.c b/ctdb/server/ipalloc_deterministic.c new file mode 100644 index 00000000000..2801bf6c25e --- /dev/null +++ b/ctdb/server/ipalloc_deterministic.c @@ -0,0 +1,62 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . +*/ + +#include "replace.h" +#include "system/network.h" + +#include "lib/util/debug.h" +#include "common/logging.h" + +#include "server/ipalloc_private.h" + +bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state) +{ + struct public_ip_list *t; + int i, numnodes; + + numnodes = ipalloc_state->num; + + DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n")); + /* Allocate IPs to nodes in a modulo fashion so that IPs will + * always be allocated the same way for a specific set of + * available/unavailable nodes. + */ + + for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) { + t->pnn = i % numnodes; + } + + /* IP failback doesn't make sense with deterministic + * IPs, since the modulo step above implicitly fails + * back IPs to their "home" node. + */ + if (1 == ipalloc_state->no_ip_failback) { + DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n")); + } + + unassign_unsuitable_ips(ipalloc_state); + + basic_allocate_unassigned(ipalloc_state); + + /* No failback here! */ + + return true; +} diff --git a/ctdb/server/ipalloc_lcp2.c b/ctdb/server/ipalloc_lcp2.c new file mode 100644 index 00000000000..0dd9364880d --- /dev/null +++ b/ctdb/server/ipalloc_lcp2.c @@ -0,0 +1,515 @@ +/* + ctdb ip takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2011 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . +*/ + +#include "replace.h" +#include "system/network.h" + +#include "lib/util/debug.h" +#include "common/logging.h" + +#include "protocol/protocol_api.h" + +#include "server/ipalloc_private.h" + +/* + * This is the length of the longtest common prefix between the IPs. + * It is calculated by XOR-ing the 2 IPs together and counting the + * number of leading zeroes. The implementation means that all + * addresses end up being 128 bits long. + * + * FIXME? Should we consider IPv4 and IPv6 separately given that the + * 12 bytes of 0 prefix padding will hurt the algorithm if there are + * lots of nodes and IP addresses? + */ +static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2) +{ + uint32_t ip1_k[IP_KEYLEN]; + uint32_t *t; + int i; + uint32_t x; + + uint32_t distance = 0; + + memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k)); + t = ip_key(ip2); + for (i=0; inext) { + if (t->pnn != pnn) { + continue; + } + + /* Optimisation: We never calculate the distance + * between an address and itself. This allows us to + * calculate the effect of removing an address from a + * node by simply calculating the distance between + * that address and all of the exitsing addresses. + * Moreover, we assume that we're only ever dealing + * with addresses from all_ips so we can identify an + * address via a pointer rather than doing a more + * expensive address comparison. */ + if (&(t->addr) == ip) { + continue; + } + + d = ip_distance(ip, &(t->addr)); + sum += d * d; /* Cheaper than pulling in math.h :-) */ + } + + return sum; +} + +/* Return the LCP2 imbalance metric for addresses currently assigned + to the given node. + */ +static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn) +{ + struct public_ip_list *t; + + uint32_t imbalance = 0; + + for (t = all_ips; t != NULL; t = t->next) { + if (t->pnn != pnn) { + continue; + } + /* Pass the rest of the IPs rather than the whole + all_ips input list. + */ + imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn); + } + + return imbalance; +} + +static bool lcp2_init(struct ipalloc_state *ipalloc_state, + uint32_t **lcp2_imbalances, + bool **rebalance_candidates) +{ + int i, numnodes; + struct public_ip_list *t; + + numnodes = ipalloc_state->num; + + *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes); + if (*rebalance_candidates == NULL) { + DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); + return false; + } + *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes); + if (*lcp2_imbalances == NULL) { + DEBUG(DEBUG_ERR, (__location__ " out of memory\n")); + return false; + } + + for (i=0; iall_ips, i); + /* First step: assume all nodes are candidates */ + (*rebalance_candidates)[i] = true; + } + + /* 2nd step: if a node has IPs assigned then it must have been + * healthy before, so we remove it from consideration. This + * is overkill but is all we have because we don't maintain + * state between takeover runs. An alternative would be to + * keep state and invalidate it every time the recovery master + * changes. + */ + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn != -1) { + (*rebalance_candidates)[t->pnn] = false; + } + } + + /* 3rd step: if a node is forced to re-balance then + we allow failback onto the node */ + if (ipalloc_state->force_rebalance_nodes == NULL) { + return true; + } + for (i = 0; + i < talloc_array_length(ipalloc_state->force_rebalance_nodes); + i++) { + uint32_t pnn = ipalloc_state->force_rebalance_nodes[i]; + if (pnn >= numnodes) { + DEBUG(DEBUG_ERR, + (__location__ "unknown node %u\n", pnn)); + continue; + } + + DEBUG(DEBUG_NOTICE, + ("Forcing rebalancing of IPs to node %u\n", pnn)); + (*rebalance_candidates)[pnn] = true; + } + + return true; +} + +/* Allocate any unassigned addresses using the LCP2 algorithm to find + * the IP/node combination that will cost the least. + */ +static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state, + uint32_t *lcp2_imbalances) +{ + struct public_ip_list *t; + int dstnode, numnodes; + + int minnode; + uint32_t mindsum, dstdsum, dstimbl, minimbl; + struct public_ip_list *minip; + + bool should_loop = true; + bool have_unassigned = true; + + numnodes = ipalloc_state->num; + + while (have_unassigned && should_loop) { + should_loop = false; + + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n")); + + minnode = -1; + mindsum = 0; + minip = NULL; + + /* loop over each unassigned ip. */ + for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) { + if (t->pnn != -1) { + continue; + } + + for (dstnode = 0; dstnode < numnodes; dstnode++) { + /* only check nodes that can actually takeover this ip */ + if (!can_node_takeover_ip(ipalloc_state, + dstnode, + t)) { + /* no it couldnt so skip to the next node */ + continue; + } + + dstdsum = ip_distance_2_sum(&(t->addr), + ipalloc_state->all_ips, + dstnode); + dstimbl = lcp2_imbalances[dstnode] + dstdsum; + DEBUG(DEBUG_DEBUG, + (" %s -> %d [+%d]\n", + ctdb_sock_addr_to_string(ipalloc_state, + &(t->addr)), + dstnode, + dstimbl - lcp2_imbalances[dstnode])); + + + if ((minnode == -1) || (dstdsum < mindsum)) { + minnode = dstnode; + minimbl = dstimbl; + mindsum = dstdsum; + minip = t; + should_loop = true; + } + } + } + + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + + /* If we found one then assign it to the given node. */ + if (minnode != -1) { + minip->pnn = minnode; + lcp2_imbalances[minnode] = minimbl; + DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n", + ctdb_sock_addr_to_string( + ipalloc_state, + &(minip->addr)), + minnode, + mindsum)); + } + + /* There might be a better way but at least this is clear. */ + have_unassigned = false; + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn == -1) { + have_unassigned = true; + } + } + } + + /* We know if we have an unassigned addresses so we might as + * well optimise. + */ + if (have_unassigned) { + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn == -1) { + DEBUG(DEBUG_WARNING, + ("Failed to find node to cover ip %s\n", + ctdb_sock_addr_to_string(ipalloc_state, + &t->addr))); + } + } + } +} + +/* LCP2 algorithm for rebalancing the cluster. Given a candidate node + * to move IPs from, determines the best IP/destination node + * combination to move from the source node. + */ +static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state, + int srcnode, + uint32_t *lcp2_imbalances, + bool *rebalance_candidates) +{ + int dstnode, mindstnode, numnodes; + uint32_t srcimbl, srcdsum, dstimbl, dstdsum; + uint32_t minsrcimbl, mindstimbl; + struct public_ip_list *minip; + struct public_ip_list *t; + + /* Find an IP and destination node that best reduces imbalance. */ + srcimbl = 0; + minip = NULL; + minsrcimbl = 0; + mindstnode = -1; + mindstimbl = 0; + + numnodes = ipalloc_state->num; + + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", + srcnode, lcp2_imbalances[srcnode])); + + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + /* Only consider addresses on srcnode. */ + if (t->pnn != srcnode) { + continue; + } + + /* What is this IP address costing the source node? */ + srcdsum = ip_distance_2_sum(&(t->addr), + ipalloc_state->all_ips, + srcnode); + srcimbl = lcp2_imbalances[srcnode] - srcdsum; + + /* Consider this IP address would cost each potential + * destination node. Destination nodes are limited to + * those that are newly healthy, since we don't want + * to do gratuitous failover of IPs just to make minor + * balance improvements. + */ + for (dstnode = 0; dstnode < numnodes; dstnode++) { + if (!rebalance_candidates[dstnode]) { + continue; + } + + /* only check nodes that can actually takeover this ip */ + if (!can_node_takeover_ip(ipalloc_state, dstnode, + t)) { + /* no it couldnt so skip to the next node */ + continue; + } + + dstdsum = ip_distance_2_sum(&(t->addr), + ipalloc_state->all_ips, + dstnode); + dstimbl = lcp2_imbalances[dstnode] + dstdsum; + DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n", + srcnode, -srcdsum, + ctdb_sock_addr_to_string( + ipalloc_state, &(t->addr)), + dstnode, dstdsum)); + + if ((dstimbl < lcp2_imbalances[srcnode]) && + (dstdsum < srcdsum) && \ + ((mindstnode == -1) || \ + ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) { + + minip = t; + minsrcimbl = srcimbl; + mindstnode = dstnode; + mindstimbl = dstimbl; + } + } + } + DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n")); + + if (mindstnode != -1) { + /* We found a move that makes things better... */ + DEBUG(DEBUG_INFO, + ("%d [%d] -> %s -> %d [+%d]\n", + srcnode, minsrcimbl - lcp2_imbalances[srcnode], + ctdb_sock_addr_to_string(ipalloc_state, &(minip->addr)), + mindstnode, mindstimbl - lcp2_imbalances[mindstnode])); + + + lcp2_imbalances[srcnode] = minsrcimbl; + lcp2_imbalances[mindstnode] = mindstimbl; + minip->pnn = mindstnode; + + return true; + } + + return false; +} + +struct lcp2_imbalance_pnn { + uint32_t imbalance; + int pnn; +}; + +static int lcp2_cmp_imbalance_pnn(const void * a, const void * b) +{ + const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a; + const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b; + + if (lipa->imbalance > lipb->imbalance) { + return -1; + } else if (lipa->imbalance == lipb->imbalance) { + return 0; + } else { + return 1; + } +} + +/* LCP2 algorithm for rebalancing the cluster. This finds the source + * node with the highest LCP2 imbalance, and then determines the best + * IP/destination node combination to move from the source node. + */ +static void lcp2_failback(struct ipalloc_state *ipalloc_state, + uint32_t *lcp2_imbalances, + bool *rebalance_candidates) +{ + int i, numnodes; + struct lcp2_imbalance_pnn * lips; + bool again; + + numnodes = ipalloc_state->num; + +try_again: + /* Put the imbalances and nodes into an array, sort them and + * iterate through candidates. Usually the 1st one will be + * used, so this doesn't cost much... + */ + DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n")); + DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n")); + lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes); + for (i = 0; i < numnodes; i++) { + lips[i].imbalance = lcp2_imbalances[i]; + lips[i].pnn = i; + DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i])); + } + qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn), + lcp2_cmp_imbalance_pnn); + + again = false; + for (i = 0; i < numnodes; i++) { + /* This means that all nodes had 0 or 1 addresses, so + * can't be imbalanced. + */ + if (lips[i].imbalance == 0) { + break; + } + + if (lcp2_failback_candidate(ipalloc_state, + lips[i].pnn, + lcp2_imbalances, + rebalance_candidates)) { + again = true; + break; + } + } + + talloc_free(lips); + if (again) { + goto try_again; + } +} + +bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state) +{ + uint32_t *lcp2_imbalances; + bool *rebalance_candidates; + int numnodes, num_rebalance_candidates, i; + bool ret = true; + + unassign_unsuitable_ips(ipalloc_state); + + if (!lcp2_init(ipalloc_state, + &lcp2_imbalances, &rebalance_candidates)) { + ret = false; + goto finished; + } + + lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances); + + /* If we don't want IPs to fail back then don't rebalance IPs. */ + if (1 == ipalloc_state->no_ip_failback) { + goto finished; + } + + /* It is only worth continuing if we have suitable target + * nodes to transfer IPs to. This check is much cheaper than + * continuing on... + */ + numnodes = ipalloc_state->num; + num_rebalance_candidates = 0; + for (i=0; i. +*/ + +#include "replace.h" +#include "system/network.h" + +#include "ctdb_private.h" + +#include "lib/util/debug.h" +#include "common/logging.h" +#include "common/common.h" + +#include "server/ipalloc_private.h" + +/* Basic non-deterministic rebalancing algorithm. + */ +static void basic_failback(struct ipalloc_state *ipalloc_state, + int num_ips) +{ + int i, numnodes; + int maxnode, maxnum, minnode, minnum, num, retries; + struct public_ip_list *t; + + numnodes = ipalloc_state->num; + retries = 0; + +try_again: + maxnum=0; + minnum=0; + + /* for each ip address, loop over all nodes that can serve + this ip and make sure that the difference between the node + serving the most and the node serving the least ip's are + not greater than 1. + */ + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + if (t->pnn == -1) { + continue; + } + + /* Get the highest and lowest number of ips's served by any + valid node which can serve this ip. + */ + maxnode = -1; + minnode = -1; + for (i=0; iall_ips); + if (maxnode == -1) { + maxnode = i; + maxnum = num; + } else { + if (num > maxnum) { + maxnode = i; + maxnum = num; + } + } + if (minnode == -1) { + minnode = i; + minnum = num; + } else { + if (num < minnum) { + minnode = i; + minnum = num; + } + } + } + if (maxnode == -1) { + DEBUG(DEBUG_WARNING, + (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n", + ctdb_addr_to_str(&t->addr))); + + continue; + } + + /* if the spread between the smallest and largest coverage by + a node is >=2 we steal one of the ips from the node with + most coverage to even things out a bit. + try to do this a limited number of times since we dont + want to spend too much time balancing the ip coverage. + */ + if ((maxnum > minnum+1) && + (retries < (num_ips + 5))){ + struct public_ip_list *tt; + + /* Reassign one of maxnode's VNNs */ + for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) { + if (tt->pnn == maxnode) { + (void)find_takeover_node(ipalloc_state, + tt); + retries++; + goto try_again;; + } + } + } + } +} + +bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state) +{ + /* This should be pushed down into basic_failback. */ + struct public_ip_list *t; + int num_ips = 0; + for (t = ipalloc_state->all_ips; t != NULL; t = t->next) { + num_ips++; + } + + unassign_unsuitable_ips(ipalloc_state); + + basic_allocate_unassigned(ipalloc_state); + + /* If we don't want IPs to fail back then don't rebalance IPs. */ + if (1 == ipalloc_state->no_ip_failback) { + return true; + } + + /* Now, try to make sure the ip adresses are evenly distributed + across the nodes. + */ + basic_failback(ipalloc_state, num_ips); + + return true; +} diff --git a/ctdb/server/ipalloc_private.h b/ctdb/server/ipalloc_private.h new file mode 100644 index 00000000000..3ffdeba2409 --- /dev/null +++ b/ctdb/server/ipalloc_private.h @@ -0,0 +1,43 @@ +/* + CTDB IP takeover code + + Copyright (C) Ronnie Sahlberg 2007 + Copyright (C) Andrew Tridgell 2007 + Copyright (C) Martin Schwenke 2015 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . +*/ + +#ifndef __CTDB_IPALLOC_PRIVATE_H__ +#define __CTDB_IPALLOC_PRIVATE_H__ + +#include "protocol/protocol.h" + +#include "server/ipalloc.h" + +bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state, + int32_t pnn, + struct public_ip_list *ip); +int node_ip_coverage(int32_t pnn, struct public_ip_list *ips); +int find_takeover_node(struct ipalloc_state *ipalloc_state, + struct public_ip_list *ip); + +void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state); +void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state); + +bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state); +bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state); +bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state); + +#endif /* __CTDB_IPALLOC_PRIVATE_H__ */ diff --git a/ctdb/tests/src/ctdbd_test.c b/ctdb/tests/src/ctdbd_test.c index 8298a4e1038..018aa2a7a88 100644 --- a/ctdb/tests/src/ctdbd_test.c +++ b/ctdb/tests/src/ctdbd_test.c @@ -61,6 +61,11 @@ bool fast_start; #include "server/ctdb_ltdb_server.c" #include "server/ctdb_traverse.c" #include "server/eventscript.c" +#include "server/ipalloc_common.c" +#include "server/ipalloc_deterministic.c" +#include "server/ipalloc_nondeterministic.c" +#include "server/ipalloc_lcp2.c" +#include "server/ipalloc.c" #include "server/ctdb_takeover.c" #include "server/ctdb_serverids.c" #include "server/ctdb_persistent.c" diff --git a/ctdb/wscript b/ctdb/wscript index e7d2c49a5b9..0acb6ff20e0 100755 --- a/ctdb/wscript +++ b/ctdb/wscript @@ -373,6 +373,17 @@ def build(bld): includes='include', deps='replace talloc tevent tdb tdb-wrap') + bld.SAMBA_SUBSYSTEM('ctdb-ipalloc', + source=bld.SUBDIR('server', + '''ipalloc_deterministic.c + ipalloc_nondeterministic.c + ipalloc_lcp2.c + ipalloc_common.c + ipalloc.c + '''), + includes='include', + deps='ctdb-protocol replace talloc tevent') + bld.SAMBA_SUBSYSTEM('ctdb-server', source='server/ctdbd.c ' + bld.SUBDIR('server', @@ -393,7 +404,7 @@ def build(bld): ctdb_update_record.c ctdb_lock.c ctdb_fork.c'''), includes='include', - deps='replace popt talloc tevent tdb talloc_report') + deps='ctdb-ipalloc replace popt talloc tevent tdb talloc_report') bld.SAMBA_BINARY('ctdbd', source='', @@ -664,7 +675,8 @@ def build(bld): bld.SAMBA_BINARY('ctdb_takeover_tests', source='tests/src/ctdb_takeover_tests.c', deps='''replace popt tdb tevent talloc ctdb-system - samba-util tdb-wrap talloc_report''' + + samba-util tdb-wrap talloc_report + ctdb-protocol''' + ib_deps, includes='include', install_path='${CTDB_TEST_LIBDIR}') -- 2.34.1