Testing: IP allocation simulation - clean up usage message.
[sahlberg/ctdb.git] / tests / takeover / ctdb_takeover.py
1 #!/usr/bin/env python
2
3 # ctdb ip takeover code
4
5 # Copyright (C) Martin Schwenke 2010
6
7 # Based on original CTDB C code:
8 #
9 # Copyright (C) Ronnie Sahlberg  2007
10 # Copyright (C) Andrew Tridgell  2007
11
12 # This program is free software; you can redistribute it and/or modify
13 # it under the terms of the GNU General Public License as published by
14 # the Free Software Foundation; either version 3 of the License, or
15 # (at your option) any later version.
16
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 # GNU General Public License for more details.
21
22 # You should have received a copy of the GNU General Public License
23 # along with this program; if not, see <http://www.gnu.org/licenses/>.
24
25
26 import os
27 import sys
28 # Use optparse since newer argparse not available in RHEL5/EPEL.
29 from optparse import OptionParser
30 import copy
31 import random
32
# Parsed command-line options.  Stays None until process_args() has
# stored the optparse result here; the verbose_* helpers and Cluster
# read their settings from it.
options = None
34
def process_args(extra_options=None):
    """Parse the command line and store the result in the global options.

    extra_options is an optional list of optparse Option objects to
    accept in addition to the standard ones defined here.

    Nothing is returned: the parsed values end up in the module-level
    ``options``.  Positional arguments are not accepted; if any are
    given, parser.error() reports the problem and exits.  The -s/--seed
    value is not stored in ``options`` - it seeds the random module as
    soon as it is parsed.
    """
    global options

    # Avoid a mutable default argument; None means "no extra options".
    if extra_options is None:
        extra_options = []

    parser = OptionParser(option_list=extra_options)

    parser.add_option("--nd",
                      action="store_false", dest="deterministic_public_ips",
                      default=True,
                      help="turn off deterministic_public_ips")
    parser.add_option("--ni",
                      action="store_true", dest="no_ip_failback", default=False,
                      help="turn on no_ip_failback")
    parser.add_option("-b", "--balance",
                      action="store_true", dest="balance", default=False,
                      help="show (im)balance information after each event")
    parser.add_option("-d", "--diff",
                      action="store_true", dest="diff", default=False,
                      help="show IP address movements for each event")
    parser.add_option("-n", "--no-print",
                      action="store_false", dest="show", default=True,
                      help="don't show IP address layout after each event")
    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=False,
                      help="print information and actions taken to stdout")
    parser.add_option("--hack",
                      action="store", type="int", dest="hack", default=0,
                      help="apply a hack (see the code!!!)")
    parser.add_option("-r", "--retries",
                      action="store", type="int", dest="retries", default=5,
                      help="number of retry loops for rebalancing [default: %default]")
    parser.add_option("-i", "--iterations",
                      action="store", type="int", dest="iterations",
                      default=1000,
                      help="number of iterations to run in test [default: %default]")

    def seed_callback(option, opt, value, parser):
        # Seed immediately so every later random event is reproducible.
        random.seed(value)
    parser.add_option("-s", "--seed",
                      action="callback", type="int", callback=seed_callback,
                      help="initial random number seed for random events")

    parser.add_option("-x", "--exit",
                      action="store_true", dest="exit", default=False,
                      help="exit on the 1st gratuitous IP move")

    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error("too many arguments")
84
def print_begin(t):
    """Print the opening banner of an output section titled t.

    Writes a line of 40 "=" characters followed by "<t>:" to stdout.
    """
    # A parenthesised single-argument print produces the same output on
    # Python 2 and, unlike the print statement, also compiles on Python 3.
    print("=" * 40)
    print("%s:" % t)
88
def print_end():
    """Print the closing rule of an output section: 40 "-" characters."""
    # Parenthesised single-argument print: identical output on Python 2,
    # and valid syntax on Python 3 as well.
    print("-" * 40)
91
def verbose_begin(t):
    """Open an output section titled t, but only in verbose mode."""
    if not options.verbose:
        return
    print_begin(t)
95
def verbose_end():
    """Close the current output section, but only in verbose mode."""
    if not options.verbose:
        return
    print_end()
99
def verbose_print(t):
    """Print t to stdout when verbose mode is on.

    t is either a single value or a list of values.  A list is printed
    one item per line, each converted with str(); an empty list prints
    nothing at all.  Any other value is printed on a line of its own.
    """
    if not options.verbose:
        return

    # isinstance() rather than an exact type comparison, so that list
    # subclasses are treated as lists too.
    items = t if isinstance(t, list) else [t]
    if items:
        # Parenthesised single-argument print works on Python 2 and 3.
        print("\n".join([str(i) for i in items]))
106
107
class Node(object):
    """One cluster node: the IPs it may host and the IPs it hosts now."""

    def __init__(self, public_addresses):
        """Create a healthy node that currently hosts no addresses.

        public_addresses is any iterable of the IPs this node is
        configured to serve; duplicates collapse because it is kept
        as a set.
        """
        self.healthy = True
        self.current_addresses = set()
        self.public_addresses = set(public_addresses)

    def can_node_serve_ip(self, ip):
        """Return True if ip is one of this node's public addresses."""
        servable = self.public_addresses
        return ip in servable

    def node_ip_coverage(self):
        """Return the number of addresses this node currently hosts."""
        hosted = self.current_addresses
        return len(hosted)
119
120 class Cluster(object):
121     def __init__(self):
122         self.nodes = []
123         self.deterministic_public_ips = options.deterministic_public_ips
124         self.no_ip_failback = options.no_ip_failback
125         self.all_public_ips = set()
126
127         # Statistics
128         self.ip_moves = []
129         self.grat_ip_moves = []
130         self.imbalance = []
131         self.events = -1
132         self.num_unhealthy = []
133
134         self.prev = None
135
136     def __str__(self):
137         return "\n".join(["%2d %s %s" %
138                           (i,
139                            "*" if len(n.public_addresses) == 0 else \
140                                (" " if n.healthy else "#"),
141                            sorted(list(n.current_addresses)))
142                           for (i, n) in enumerate(self.nodes)])
143
144     def print_statistics(self):
145         print_begin("STATISTICS")
146         print "Events:              %6d" % self.events
147         print "Total IP moves:      %6d" % sum(self.ip_moves)
148         print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
149         print "Max imbalance:       %6d" % max(self.imbalance)
150         print "Final imbalance:     %6d" % self.imbalance[-1]
151         print "Maximum unhealthy:   %6d" % max(self.num_unhealthy)
152         print_end()
153
154     def find_pnn_with_ip(self, ip):
155         for (i, n) in enumerate(self.nodes):
156             if ip in n.current_addresses:
157                 return i
158         return -1
159
160     def quietly_remove_ip(self, ip):
161         # Remove address from old node.
162         old = self.find_pnn_with_ip(ip)
163         if old != -1:
164             self.nodes[old].current_addresses.remove(ip)
165
166     def add_node(self, node):
167         self.nodes.append(node)
168         self.all_public_ips |= node.public_addresses
169
170     def healthy(self, *pnns):
171         verbose_begin("HEALTHY")
172
173         for pnn in pnns:
174             self.nodes[pnn].healthy = True
175             verbose_print(pnn)
176
177         verbose_end()
178         
179     def unhealthy(self, *pnns):
180
181         verbose_begin("UNHEALTHY")
182
183         for pnn in pnns:
184             self.nodes[pnn].healthy = False
185             verbose_print(pnn)
186
187         verbose_end()
188
189     def do_something_random(self):
190
191
192         """Make a random node healthy or unhealthy.
193
194         If all nodes are healthy or unhealthy, then invert one of
195         them.  Otherwise, there's a 1/4 chance of making another node
196         unhealthy."""
197
198         num_nodes = len(self.nodes)
199         healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
200         num_healthy = len(healthy_pnns)
201
202         if num_nodes == num_healthy:
203             self.unhealthy(random.randint(0, num_nodes-1))
204         elif num_healthy == 0:
205             self.healthy(random.randint(0, num_nodes-1))
206         elif random.randint(1, 4) == 1:
207             self.unhealthy(random.choice(healthy_pnns))
208         else:
209             all_pnns = range(num_nodes)
210             unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
211             self.healthy(random.choice(unhealthy_pnns))
212
213     def random_iterations(self):
214         i = 1
215         while i <= options.iterations:
216             verbose_begin("EVENT %d" % i)
217             verbose_end()
218             self.do_something_random()
219             if self.recover() and options.exit > 0:
220                 break
221             i += 1
222
223         self.print_statistics()
224
225     def calculate_imbalance(self):
226
227         imbalance = 0
228
229         assigned = sorted([ip
230                            for n in self.nodes
231                            for ip in n.current_addresses])
232
233         for ip in assigned:
234
235             num_capable = 0
236             maxnode = -1
237             minnode = -1
238             for (i, n) in enumerate(self.nodes):
239                 if not n.healthy:
240                     continue
241
242                 if not n.can_node_serve_ip(ip):
243                     continue
244
245                 num_capable += 1
246
247                 num = n.node_ip_coverage()
248
249                 if maxnode == -1 or num > maxnum:
250                     maxnode = i
251                     maxnum = num
252
253                 if minnode == -1 or num < minnum:
254                     minnode = i
255                     minnum = num
256             
257             if maxnode == -1:
258                 continue
259
260             i = maxnum - minnum
261             if maxnum - minnum < 2:
262                 i = 0
263             imbalance = max([imbalance, i])
264
265         return imbalance
266
267     def diff(self):
268         """Calculate differences in IP assignments between self and prev.
269
270         Gratuitous IP moves (from a healthy node to a healthy node)
271         are prefix by !!.  Any gratuitous IP moves cause this function
272         to return False.  If there are no gratuitous moves then it
273         will return True."""
274
275         ip_moves = 0
276         grat_ip_moves = 0
277         details = []
278
279         for (new, n) in enumerate(self.nodes):
280             for ip in n.current_addresses:
281                 old = self.prev.find_pnn_with_ip(ip)
282                 if old != new:
283                     ip_moves += 1
284                     if old != -1 and \
285                             self.prev.nodes[new].healthy and \
286                             self.nodes[new].healthy and \
287                             self.nodes[old].healthy and \
288                             self.prev.nodes[old].healthy:
289                         prefix = "!!"
290                         grat_ip_moves += 1
291                     else:
292                         prefix = "  "
293                     details.append("%s %s: %d -> %d" %
294                                    (prefix, ip, old, new))
295
296         return (ip_moves, grat_ip_moves, details)
297                     
298     def find_least_loaded_node(self, ip):
299         """Just like find_takeover_node but doesn't care about health."""
300         pnn = -1
301         min = 0
302         for (i, n) in enumerate(self.nodes):
303             if not n.can_node_serve_ip(ip):
304                 continue
305
306             num = n.node_ip_coverage()
307
308             if (pnn == -1):
309                 pnn = i
310                 min = num
311             else:
312                 if num < min:
313                     pnn = i
314                     min = num
315
316         if pnn == -1:
317             verbose_print("Could not find node to take over public address %s" % ip)
318             return False
319
320         self.nodes[pnn].current_addresses.add(ip)
321
322         verbose_print("%s -> %d" % (ip, pnn))
323         return True
324
325     def find_takeover_node(self, ip):
326
327         pnn = -1
328         min = 0
329         for (i, n) in enumerate(self.nodes):
330             if not n.healthy:
331                 continue
332
333             if not n.can_node_serve_ip(ip):
334                 continue
335
336             num = n.node_ip_coverage()
337
338             if (pnn == -1):
339                 pnn = i
340                 min = num
341             else:
342                 if num < min:
343                     pnn = i
344                     min = num
345
346         if pnn == -1:
347             verbose_print("Could not find node to take over public address %s" % ip)
348             return False
349
350         self.nodes[pnn].current_addresses.add(ip)
351
352         verbose_print("%s -> %d" % (ip, pnn))
353         return True
354
355     def ctdb_takeover_run(self):
356
357         self.events += 1
358
359         # Don't bother with the num_healthy stuff.  It is an
360         # irrelevant detail.
361
362         # We just keep the allocate IPs in the current_addresses field
363         # of the node.  This needs to readable, not efficient!
364
365         if self.deterministic_public_ips:
366             # Remap everything.
367             addr_list = sorted(list(self.all_public_ips))
368             for (i, ip) in enumerate(addr_list):
369                 if options.hack == 1:
370                     self.quietly_remove_ip(ip)
371                     self.find_least_loaded_node(ip)
372                 elif options.hack == 2:
373                     pnn = i % len(self.nodes)
374                     if ip in self.nodes[pnn].public_addresses:
375                         self.quietly_remove_ip(ip)
376                         # Add addresses to new node.
377                         self.nodes[pnn].current_addresses.add(ip)
378                         verbose_print("%s -> %d" % (ip, pnn))
379                 else:
380                     self.quietly_remove_ip(ip)
381                     # Add addresses to new node.
382                     pnn = i % len(self.nodes)
383                     self.nodes[pnn].current_addresses.add(ip)
384                     verbose_print("%s -> %d" % (ip, pnn))
385
386         # Remove public addresses from unhealthy nodes.
387         for (pnn, n) in enumerate(self.nodes):
388             if not n.healthy:
389                 verbose_print(["%s <- %d" % (ip, pnn)
390                                for ip in n.current_addresses])
391                 n.current_addresses = set()
392
393         # If a node can't serve an assigned address then remove it.
394         for n in self.nodes:
395             verbose_print(["%s <- %d" % (ip, pnn)
396                            for ip in n.current_addresses - n.public_addresses])
397             n.current_addresses &= n.public_addresses
398
399         # We'll only retry the balancing act up to 5 times.
400         retries = 0
401         should_loop = True
402         while should_loop:
403             should_loop = False
404
405             assigned = set([ip for n in self.nodes for ip in n.current_addresses])
406             unassigned = sorted(list(self.all_public_ips - assigned))
407
408             for ip in unassigned:
409                 self.find_takeover_node(ip)
410
411             if self.no_ip_failback:
412                 break
413
414             assigned = sorted([ip
415                                for n in self.nodes
416                                for ip in n.current_addresses])
417             for ip in assigned:
418
419                 maxnode = -1
420                 minnode = -1
421                 for (i, n) in enumerate(self.nodes):
422                     if not n.healthy:
423                         continue
424
425                     if not n.can_node_serve_ip(ip):
426                         continue
427
428                     num = n.node_ip_coverage()
429
430                     if maxnode == -1:
431                         maxnode = i
432                         maxnum = num
433                     else:
434                         if num > maxnum:
435                             maxnode = i
436                             maxnum = num
437                     if minnode == -1:
438                         minnode = i
439                         minnum = num
440                     else:
441                         if num < minnum:
442                             minnode = i
443                             minnum = num
444
445                 if maxnode == -1:
446                     print "Could not maxnode. May not be able to serve ip", ip
447                     continue
448
449                 if self.deterministic_public_ips:
450                     continue
451
452                 if maxnum > minnum + 1 and retries < options.retries:
453                     # Remove the 1st ip from maxnode
454                     t = sorted(list(self.nodes[maxnode].current_addresses))
455                     realloc = t[0]
456                     verbose_print("%s <- %d" % (realloc, maxnode))
457                     self.nodes[maxnode].current_addresses.remove(realloc)
458                     retries += 1
459                     # Redo the outer loop.
460                     should_loop = True
461                     break
462
463     def recover(self):
464         verbose_begin("TAKEOVER")
465
466         self.ctdb_takeover_run()
467
468         verbose_end()
469
470         grat_ip_moves = 0
471
472         if self.prev is not None:
473             (ip_moves, grat_ip_moves, details) = self.diff()
474             self.ip_moves.append(ip_moves)
475             self.grat_ip_moves.append(grat_ip_moves)
476
477             if options.diff:
478                 print_begin("DIFF")
479                 print "\n".join(details)
480                 print_end()
481
482         imbalance = self.calculate_imbalance()
483         self.imbalance.append(imbalance)
484         if options.balance:
485             print_begin("IMBALANCE")
486             print imbalance
487             print_end()
488
489         num_unhealthy = len(self.nodes) - \
490             len([n for n in self.nodes if n.healthy])
491         self.num_unhealthy.append(num_unhealthy)
492
493         if options.show:
494             print_begin("STATE")
495             print self
496             print_end()
497
498         self.prev = None
499         self.prev = copy.deepcopy(self)
500
501         return grat_ip_moves