merge from ronnie
author Andrew Tridgell <tridge@samba.org>
Mon, 10 Sep 2007 03:21:11 +0000 (13:21 +1000)
committer Andrew Tridgell <tridge@samba.org>
Mon, 10 Sep 2007 03:21:11 +0000 (13:21 +1000)
1  2 
server/ctdb_takeover.c

index 89787caf1be080a827acd30e6d0bb1f4f052daa1,b252b07737d0c1e1362a74c40ac4a009b60083da..71c294da44caeacfd7c67eca3deea8d17b04170c
@@@ -514,50 -657,146 +657,146 @@@ int ctdb_takeover_run(struct ctdb_conte
  
        ZERO_STRUCT(ip);
  
-       /* Work out which node will look after each public IP.
-        * takeover_node cycles over the nodes and is incremented each time a 
-        * node has been assigned to take over for another node.
-        * This spreads the failed nodes out across the remaining
-        * nodes more evenly
-        */
+       /* Count how many completely healthy nodes we have */
+       num_healthy = 0;
        for (i=0;i<nodemap->num;i++) {
                if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
-                       ctdb->nodes[i]->takeover_vnn = nodemap->nodes[i].vnn;
-               } else {
-                       uint32_t takeover_vnn;
+                       num_healthy++;
+               }
+       }
  
-                       /* If this public address has already been taken over
-                          by a node and that node is still healthy, then
-                          leave the public address at that node.
-                       */
-                       takeover_vnn = ctdb->nodes[i]->takeover_vnn;
-                       if ( ctdb_validate_vnn(ctdb, takeover_vnn)
-                         && (!(nodemap->nodes[takeover_vnn].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) ) {
-                               continue;
+       if (num_healthy > 0) {
+               /* We have healthy nodes, so only consider them for 
+                  serving public addresses
+               */
+               mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
+       } else {
+               /* We didn't have any completely healthy nodes so
+                  use "disabled" nodes as a fallback
+               */
+               mask = NODE_FLAGS_INACTIVE;
+       }
+       /* since nodes only know about those public addresses that
+          can be served by that particular node, no single node has
+          a full list of all public addresses that exist in the cluster.
+          Walk over all node structures and create a merged list of
+          all public addresses that exist in the cluster.
+       */
+       all_ips = create_merged_ip_list(ctdb, tmp_ctx);
+       /* mark all public addresses with a masked node as being served by
+          node -1
+       */
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               if (tmp_ip->pnn == -1) {
+                       continue;
+               }
+               if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
+                       tmp_ip->pnn = -1;
+               }
+       }
+       /* now we must redistribute all public addresses with takeover node
+          -1 among the nodes available
+       */
+       retries = 0;
+ try_again:
+       /* loop over all ip's and find a physical node to cover for 
+          each unassigned ip.
+       */
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               if (tmp_ip->pnn == -1) {
+                       if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
+                               DEBUG(0,("Failed to find node to cover ip %s\n", inet_ntoa(tmp_ip->sin.sin_addr)));
                        }
+               }
+       }
+       /* now, try to make sure the ip addresses are evenly distributed
+          across the nodes.
+          for each ip address, loop over all nodes that can serve this
+          ip and make sure that the difference between the node
+          serving the most and the node serving the least ip's are not greater
+          than 1.
+       */
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               if (tmp_ip->pnn == -1) {
+                       continue;
+               }
  
+               /* Get the highest and lowest number of ip's served by any 
+                  valid node which can serve this ip.
+               */
+               maxnode = -1;
+               minnode = -1;
+               for (i=0;i<nodemap->num;i++) {
+                       if (nodemap->nodes[i].flags & mask) {
+                               continue;
+                       }
  
-                       ctdb->nodes[i]->takeover_vnn = (uint32_t)-1;    
+                       /* only check nodes that can actually serve this ip */
+                       if (can_node_serve_ip(ctdb, i, tmp_ip)) {
+                               /* no it couldn't, so skip to the next node */
+                               continue;
+                       }
  
-                       ctdb_takeover_find_node(ctdb, nodemap, i, NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED);
-                       
-                       /* if no enabled node can take it, then we
-                          might as well use any active node. It
-                          probably means that some subsystem (such as
-                          NFS) is sick on all nodes. Best we can do
-                          is to keep the other services up. */
-                       if (ctdb->nodes[i]->takeover_vnn == (uint32_t)-1) {
-                               ctdb_takeover_find_node(ctdb, nodemap, i, NODE_FLAGS_INACTIVE);
+                       num = node_ip_coverage(ctdb, i, all_ips);
+                       if (maxnode == -1) {
+                               maxnode = i;
+                               maxnum  = num;
+                       } else {
+                               if (num > maxnum) {
+                                       maxnode = i;
+                                       maxnum  = num;
+                               }
                        }
 -                      DEBUG(0,(__location__ " Could not find maxnode. May not be able to server ip '%s'\n", inet_ntoa(tmp_ip->sin.sin_addr)));
+                       if (minnode == -1) {
+                               minnode = i;
+                               minnum  = num;
+                       } else {
+                               if (num < minnum) {
+                                       minnode = i;
+                                       minnum  = num;
+                               }
+                       }
+               }
+               if (maxnode == -1) {
++                      DEBUG(0,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n", inet_ntoa(tmp_ip->sin.sin_addr)));
+                       continue;
+               }
+               /* if the spread between the smallest and largest coverage by
+                  a node is >=2 we steal one of the ips from the node with
+                  most coverage to even things out a bit.
+                  try to do this at most 5 times since we don't want to spend
+                  too much time balancing the ip coverage.
+               */
+               if ( (maxnum > minnum+1)
+                 && (retries < 5) ){
+                       struct ctdb_public_ip_list *tmp;
  
-                       if (ctdb->nodes[i]->takeover_vnn == (uint32_t)-1) {
-                               DEBUG(0,(__location__ " No node available on same network to take %s\n",
-                                        ctdb->nodes[i]->public_address));
+                       /* mark one of maxnode's vnn's as unassigned and try
+                          again
+                       */
+                       for (tmp=all_ips;tmp;tmp=tmp->next) {
+                               if (tmp->pnn == maxnode) {
+                                       tmp->pnn = -1;
+                                       retries++;
+                                       goto try_again;
+                               }
                        }
                }
-       }       
+       }
  
-       /* at this point ctdb->nodes[i]->takeover_vnn is the vnn which will own each IP */
+       /* at this point ->pnn is the node which will own each IP
+          or -1 if there is no node that can cover this ip
+       */
  
        /* now tell all nodes to delete any alias that they should not
           have.  This will be a NOOP on nodes that don't currently