3 # ctdb ip takeover code
5 # Copyright (C) Martin Schwenke 2010
7 # Based on original CTDB C code:
9 # Copyright (C) Ronnie Sahlberg 2007
10 # Copyright (C) Andrew Tridgell 2007
12 # This program is free software; you can redistribute it and/or modify
13 # it under the terms of the GNU General Public License as published by
14 # the Free Software Foundation; either version 3 of the License, or
15 # (at your option) any later version.
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # GNU General Public License for more details.
22 # You should have received a copy of the GNU General Public License
23 # along with this program; if not, see <http://www.gnu.org/licenses/>.
28 # Use optparse since newer argparse not available in RHEL5/EPEL.
29 from optparse import OptionParser
35 def process_args(extra_options=[]):
38 parser = OptionParser(option_list=extra_options)
40 parser.add_option("--nd",
41 action="store_false", dest="deterministic_public_ips",
43 help="turn off deterministic_public_ips")
44 parser.add_option("--ni",
45 action="store_true", dest="no_ip_failback", default=False,
46 help="turn on no_ip_failback")
47 parser.add_option("-b", "--balance",
48 action="store_true", dest="balance", default=False,
49 help="show (im)balance information after each event")
50 parser.add_option("-d", "--diff",
51 action="store_true", dest="diff", default=False,
52 help="show IP address movements for each event")
53 parser.add_option("-n", "--no-print",
54 action="store_false", dest="show", default=True,
55 help="don't show IP address layout after each event")
56 parser.add_option("-v", "--verbose",
57 action="store_true", dest="verbose", default=False,
58 help="print information and actions taken to stdout")
59 parser.add_option("--hack",
60 action="store", type="int", dest="hack", default=0,
61 help="apply a hack (see the code!!!)")
62 parser.add_option("-r", "--retries",
63 action="store", type="int", dest="retries", default=5,
64 help="number of retry loops for rebalancing [default: %default]")
65 parser.add_option("-i", "--iterations",
66 action="store", type="int", dest="iterations",
68 help="number of iterations to run in test [default: %default]")
70 def seed_callback(option, opt, value, parser):
72 parser.add_option("-s", "--seed",
73 action="callback", type="int", callback=seed_callback,
74 help="initial random number seed for random events")
76 parser.add_option("-x", "--exit",
77 action="store_true", dest="exit", default=False,
78 help="exit on the 1st gratuitous IP move")
80 (options, args) = parser.parse_args()
83 parser.error("too many argumentss")
100 def verbose_print(t):
102 if not type(t) == list:
105 print "\n".join([str(i) for i in t])
109 def __init__(self, public_addresses):
110 self.public_addresses = set(public_addresses)
111 self.current_addresses = set()
def can_node_serve_ip(self, ip):
    """Return True if ip is one of this node's public addresses."""
    servable = self.public_addresses
    return ip in servable
def node_ip_coverage(self):
    """Return the number of addresses currently assigned to this node."""
    hosted = self.current_addresses
    return len(hosted)
120 class Cluster(object):
123 self.deterministic_public_ips = options.deterministic_public_ips
124 self.no_ip_failback = options.no_ip_failback
125 self.all_public_ips = set()
129 self.grat_ip_moves = []
132 self.num_unhealthy = []
137 return "\n".join(["%2d %s %s" %
139 "*" if len(n.public_addresses) == 0 else \
140 (" " if n.healthy else "#"),
141 sorted(list(n.current_addresses)))
142 for (i, n) in enumerate(self.nodes)])
144 def print_statistics(self):
145 print_begin("STATISTICS")
146 print "Events: %6d" % self.events
147 print "Total IP moves: %6d" % sum(self.ip_moves)
148 print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
149 print "Max imbalance: %6d" % max(self.imbalance)
150 print "Final imbalance: %6d" % self.imbalance[-1]
151 print "Maximum unhealthy: %6d" % max(self.num_unhealthy)
154 def find_pnn_with_ip(self, ip):
155 for (i, n) in enumerate(self.nodes):
156 if ip in n.current_addresses:
160 def quietly_remove_ip(self, ip):
161 # Remove address from old node.
162 old = self.find_pnn_with_ip(ip)
164 self.nodes[old].current_addresses.remove(ip)
def add_node(self, node):
    """Append node to the cluster and merge its public addresses.

    The node's public_addresses are folded in place into the
    cluster-wide all_public_ips set.
    """
    members = self.nodes
    members.append(node)
    self.all_public_ips.update(node.public_addresses)
170 def healthy(self, *pnns):
171 verbose_begin("HEALTHY")
174 self.nodes[pnn].healthy = True
179 def unhealthy(self, *pnns):
181 verbose_begin("UNHEALTHY")
184 self.nodes[pnn].healthy = False
189 def do_something_random(self):
192 """Make a random node healthy or unhealthy.
194 If all nodes are healthy or unhealthy, then invert one of
195 them. Otherwise, there's a 1/4 chance of making another node
198 num_nodes = len(self.nodes)
199 healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
200 num_healthy = len(healthy_pnns)
202 if num_nodes == num_healthy:
203 self.unhealthy(random.randint(0, num_nodes-1))
204 elif num_healthy == 0:
205 self.healthy(random.randint(0, num_nodes-1))
206 elif random.randint(1, 4) == 1:
207 self.unhealthy(random.choice(healthy_pnns))
209 all_pnns = range(num_nodes)
210 unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
211 self.healthy(random.choice(unhealthy_pnns))
213 def random_iterations(self):
215 while i <= options.iterations:
216 verbose_begin("EVENT %d" % i)
218 self.do_something_random()
219 if self.recover() and options.exit > 0:
223 self.print_statistics()
225 def calculate_imbalance(self):
229 assigned = sorted([ip
231 for ip in n.current_addresses])
238 for (i, n) in enumerate(self.nodes):
242 if not n.can_node_serve_ip(ip):
247 num = n.node_ip_coverage()
249 if maxnode == -1 or num > maxnum:
253 if minnode == -1 or num < minnum:
261 if maxnum - minnum < 2:
263 imbalance = max([imbalance, i])
268 """Calculate differences in IP assignments between self and prev.
270 Gratuitous IP moves (from a healthy node to a healthy node)
are prefixed by !!. Any gratuitous IP moves cause this function
272 to return False. If there are no gratuitous moves then it
279 for (new, n) in enumerate(self.nodes):
280 for ip in n.current_addresses:
281 old = self.prev.find_pnn_with_ip(ip)
285 self.prev.nodes[new].healthy and \
286 self.nodes[new].healthy and \
287 self.nodes[old].healthy and \
288 self.prev.nodes[old].healthy:
293 details.append("%s %s: %d -> %d" %
294 (prefix, ip, old, new))
296 return (ip_moves, grat_ip_moves, details)
298 def find_least_loaded_node(self, ip):
299 """Just like find_takeover_node but doesn't care about health."""
302 for (i, n) in enumerate(self.nodes):
303 if not n.can_node_serve_ip(ip):
306 num = n.node_ip_coverage()
317 verbose_print("Could not find node to take over public address %s" % ip)
320 self.nodes[pnn].current_addresses.add(ip)
322 verbose_print("%s -> %d" % (ip, pnn))
325 def find_takeover_node(self, ip):
329 for (i, n) in enumerate(self.nodes):
333 if not n.can_node_serve_ip(ip):
336 num = n.node_ip_coverage()
347 verbose_print("Could not find node to take over public address %s" % ip)
350 self.nodes[pnn].current_addresses.add(ip)
352 verbose_print("%s -> %d" % (ip, pnn))
355 def ctdb_takeover_run(self):
359 # Don't bother with the num_healthy stuff. It is an
# We just keep the allocated IPs in the current_addresses field
# of the node. This needs to be readable, not efficient!
365 if self.deterministic_public_ips:
367 addr_list = sorted(list(self.all_public_ips))
368 for (i, ip) in enumerate(addr_list):
369 if options.hack == 1:
370 self.quietly_remove_ip(ip)
371 self.find_least_loaded_node(ip)
372 elif options.hack == 2:
373 pnn = i % len(self.nodes)
374 if ip in self.nodes[pnn].public_addresses:
375 self.quietly_remove_ip(ip)
376 # Add addresses to new node.
377 self.nodes[pnn].current_addresses.add(ip)
378 verbose_print("%s -> %d" % (ip, pnn))
380 self.quietly_remove_ip(ip)
381 # Add addresses to new node.
382 pnn = i % len(self.nodes)
383 self.nodes[pnn].current_addresses.add(ip)
384 verbose_print("%s -> %d" % (ip, pnn))
386 # Remove public addresses from unhealthy nodes.
387 for (pnn, n) in enumerate(self.nodes):
389 verbose_print(["%s <- %d" % (ip, pnn)
390 for ip in n.current_addresses])
391 n.current_addresses = set()
393 # If a node can't serve an assigned address then remove it.
395 verbose_print(["%s <- %d" % (ip, pnn)
396 for ip in n.current_addresses - n.public_addresses])
397 n.current_addresses &= n.public_addresses
# We'll only retry the balancing act up to options.retries times (default 5).
405 assigned = set([ip for n in self.nodes for ip in n.current_addresses])
406 unassigned = sorted(list(self.all_public_ips - assigned))
408 for ip in unassigned:
409 self.find_takeover_node(ip)
411 if self.no_ip_failback:
414 assigned = sorted([ip
416 for ip in n.current_addresses])
421 for (i, n) in enumerate(self.nodes):
425 if not n.can_node_serve_ip(ip):
428 num = n.node_ip_coverage()
446 print "Could not maxnode. May not be able to serve ip", ip
449 if self.deterministic_public_ips:
452 if maxnum > minnum + 1 and retries < options.retries:
453 # Remove the 1st ip from maxnode
454 t = sorted(list(self.nodes[maxnode].current_addresses))
456 verbose_print("%s <- %d" % (realloc, maxnode))
457 self.nodes[maxnode].current_addresses.remove(realloc)
459 # Redo the outer loop.
464 verbose_begin("TAKEOVER")
466 self.ctdb_takeover_run()
472 if self.prev is not None:
473 (ip_moves, grat_ip_moves, details) = self.diff()
474 self.ip_moves.append(ip_moves)
475 self.grat_ip_moves.append(grat_ip_moves)
479 print "\n".join(details)
482 imbalance = self.calculate_imbalance()
483 self.imbalance.append(imbalance)
485 print_begin("IMBALANCE")
489 num_unhealthy = len(self.nodes) - \
490 len([n for n in self.nodes if n.healthy])
491 self.num_unhealthy.append(num_unhealthy)
499 self.prev = copy.deepcopy(self)