recoverd: Move the test for both 'DeterministicIPs' and 'NoIPFailback' set
[obnox/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
  timeval built by timeval_current_ofs() from the takeover_timeout
  tunable; expands to a reference to a variable named 'ctdb', which
  must be in scope wherever the macro is used
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
  gratuitous ARP pacing: ctdb_control_send_arp() runs CTDB_ARP_REPEAT
  times per announcement and re-arms its timer with CTDB_ARP_INTERVAL
  as the first argument of timeval_current_ofs()
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
  one entry of the ctdb->ifaces list
 */
struct ctdb_iface {
	/* list linkage */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, matched with strcmp() by the lookups below */
	const char *name;
	/* only interfaces with link_up set are chosen for addresses */
	bool link_up;
	/* incremented on assignment of a public address, decremented on
	   unassignment */
	uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
278 struct ctdb_takeover_arp {
279         struct ctdb_context *ctdb;
280         uint32_t count;
281         ctdb_sock_addr addr;
282         struct ctdb_tcp_array *tcparray;
283         struct ctdb_vnn *vnn;
284 };
285
286
287 /*
288   lists of tcp endpoints
289  */
290 struct ctdb_tcp_list {
291         struct ctdb_tcp_list *prev, *next;
292         struct ctdb_tcp_connection connection;
293 };
294
295 /*
296   list of clients to kill on IP release
297  */
298 struct ctdb_client_ip {
299         struct ctdb_client_ip *prev, *next;
300         struct ctdb_context *ctdb;
301         ctdb_sock_addr addr;
302         uint32_t client_id;
303 };
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/*
  private data for ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* control request that is answered when the event finishes */
	struct ctdb_req_control *c;
	/* public address being taken over */
	struct ctdb_vnn *vnn;
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/*
  private data for ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* control request that is answered when the event finishes */
	struct ctdb_req_control *c;
	/* interface the address was on before the move; restored if the
	   event fails */
	struct ctdb_iface *old;
	/* public address being moved */
	struct ctdb_vnn *vnn;
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Check if this is a public ip known to the node, i.e. can that
1303    node takeover this ip ?
1304 */
1305 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                 struct ctdb_public_ip_list *ip)
1307 {
1308         struct ctdb_all_public_ips *public_ips;
1309         int i;
1310
1311         public_ips = ctdb->nodes[pnn]->available_public_ips;
1312
1313         if (public_ips == NULL) {
1314                 return -1;
1315         }
1316
1317         for (i=0;i<public_ips->num;i++) {
1318                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1319                         /* yes, this node can serve this public ip */
1320                         return 0;
1321                 }
1322         }
1323
1324         return -1;
1325 }
1326
1327
1328 /* search the node lists list for a node to takeover this ip.
1329    pick the node that currently are serving the least number of ips
1330    so that the ips get spread out evenly.
1331 */
1332 static int find_takeover_node(struct ctdb_context *ctdb, 
1333                 struct ctdb_node_map *nodemap, uint32_t mask, 
1334                 struct ctdb_public_ip_list *ip,
1335                 struct ctdb_public_ip_list *all_ips)
1336 {
1337         int pnn, min=0, num;
1338         int i;
1339
1340         pnn    = -1;
1341         for (i=0;i<nodemap->num;i++) {
1342                 if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1343                         /* This node is not allowed to takeover any addresses
1344                         */
1345                         continue;
1346                 }
1347
1348                 if (nodemap->nodes[i].flags & mask) {
1349                         /* This node is not healty and can not be used to serve
1350                            a public address 
1351                         */
1352                         continue;
1353                 }
1354
1355                 /* verify that this node can serve this ip */
1356                 if (can_node_serve_ip(ctdb, i, ip)) {
1357                         /* no it couldnt   so skip to the next node */
1358                         continue;
1359                 }
1360
1361                 num = node_ip_coverage(ctdb, i, all_ips);
1362                 /* was this the first node we checked ? */
1363                 if (pnn == -1) {
1364                         pnn = i;
1365                         min  = num;
1366                 } else {
1367                         if (num < min) {
1368                                 pnn = i;
1369                                 min  = num;
1370                         }
1371                 }
1372         }       
1373         if (pnn == -1) {
1374                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1375                         ctdb_addr_to_str(&ip->addr)));
1376
1377                 return -1;
1378         }
1379
1380         ip->pnn = pnn;
1381         return 0;
1382 }
1383
1384 #define IP_KEYLEN       4
1385 static uint32_t *ip_key(ctdb_sock_addr *ip)
1386 {
1387         static uint32_t key[IP_KEYLEN];
1388
1389         bzero(key, sizeof(key));
1390
1391         switch (ip->sa.sa_family) {
1392         case AF_INET:
1393                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1394                 break;
1395         case AF_INET6: {
1396                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1397                 key[0]  = htonl(s6_a32[0]);
1398                 key[1]  = htonl(s6_a32[1]);
1399                 key[2]  = htonl(s6_a32[2]);
1400                 key[3]  = htonl(s6_a32[3]);
1401                 break;
1402         }
1403         default:
1404                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1405                 return key;
1406         }
1407
1408         return key;
1409 }
1410
1411 static void *add_ip_callback(void *parm, void *data)
1412 {
1413         struct ctdb_public_ip_list *this_ip = parm; 
1414         struct ctdb_public_ip_list *prev_ip = data; 
1415
1416         if (prev_ip == NULL) {
1417                 return parm;
1418         }
1419         if (this_ip->pnn == -1) {
1420                 this_ip->pnn = prev_ip->pnn;
1421         }
1422
1423         return parm;
1424 }
1425
1426 static int getips_count_callback(void *param, void *data)
1427 {
1428         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1429         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1430
1431         new_ip->next = *ip_list;
1432         *ip_list     = new_ip;
1433         return 0;
1434 }
1435
1436 static struct ctdb_public_ip_list *
1437 create_merged_ip_list(struct ctdb_context *ctdb)
1438 {
1439         int i, j;
1440         struct ctdb_public_ip_list *ip_list;
1441         struct ctdb_all_public_ips *public_ips;
1442
1443         if (ctdb->ip_tree != NULL) {
1444                 talloc_free(ctdb->ip_tree);
1445                 ctdb->ip_tree = NULL;
1446         }
1447         ctdb->ip_tree = trbt_create(ctdb, 0);
1448
1449         for (i=0;i<ctdb->num_nodes;i++) {
1450                 public_ips = ctdb->nodes[i]->known_public_ips;
1451
1452                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1453                         continue;
1454                 }
1455
1456                 /* there were no public ips for this node */
1457                 if (public_ips == NULL) {
1458                         continue;
1459                 }               
1460
1461                 for (j=0;j<public_ips->num;j++) {
1462                         struct ctdb_public_ip_list *tmp_ip; 
1463
1464                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1465                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1466                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1467                         tmp_ip->addr = public_ips->ips[j].addr;
1468                         tmp_ip->next = NULL;
1469
1470                         trbt_insertarray32_callback(ctdb->ip_tree,
1471                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1472                                 add_ip_callback,
1473                                 tmp_ip);
1474                 }
1475         }
1476
1477         ip_list = NULL;
1478         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1479
1480         return ip_list;
1481 }
1482
1483 /* 
1484  * This is the length of the longtest common prefix between the IPs.
1485  * It is calculated by XOR-ing the 2 IPs together and counting the
1486  * number of leading zeroes.  The implementation means that all
1487  * addresses end up being 128 bits long.
1488  *
1489  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1490  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1491  * lots of nodes and IP addresses?
1492  */
1493 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1494 {
1495         uint32_t ip1_k[IP_KEYLEN];
1496         uint32_t *t;
1497         int i;
1498         uint32_t x;
1499
1500         uint32_t distance = 0;
1501
1502         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1503         t = ip_key(ip2);
1504         for (i=0; i<IP_KEYLEN; i++) {
1505                 x = ip1_k[i] ^ t[i];
1506                 if (x == 0) {
1507                         distance += 32;
1508                 } else {
1509                         /* Count number of leading zeroes. 
1510                          * FIXME? This could be optimised...
1511                          */
1512                         while ((x & (1 << 31)) == 0) {
1513                                 x <<= 1;
1514                                 distance += 1;
1515                         }
1516                 }
1517         }
1518
1519         return distance;
1520 }
1521
1522 /* Calculate the IP distance for the given IP relative to IPs on the
1523    given node.  The ips argument is generally the all_ips variable
1524    used in the main part of the algorithm.
1525  */
1526 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1527                                   struct ctdb_public_ip_list *ips,
1528                                   int pnn)
1529 {
1530         struct ctdb_public_ip_list *t;
1531         uint32_t d;
1532
1533         uint32_t sum = 0;
1534
1535         for (t=ips; t != NULL; t=t->next) {
1536                 if (t->pnn != pnn) {
1537                         continue;
1538                 }
1539
1540                 /* Optimisation: We never calculate the distance
1541                  * between an address and itself.  This allows us to
1542                  * calculate the effect of removing an address from a
1543                  * node by simply calculating the distance between
1544                  * that address and all of the exitsing addresses.
1545                  * Moreover, we assume that we're only ever dealing
1546                  * with addresses from all_ips so we can identify an
1547                  * address via a pointer rather than doing a more
1548                  * expensive address comparison. */
1549                 if (&(t->addr) == ip) {
1550                         continue;
1551                 }
1552
1553                 d = ip_distance(ip, &(t->addr));
1554                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1555         }
1556
1557         return sum;
1558 }
1559
1560 /* Return the LCP2 imbalance metric for addresses currently assigned
1561    to the given node.
1562  */
1563 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1564 {
1565         struct ctdb_public_ip_list *t;
1566
1567         uint32_t imbalance = 0;
1568
1569         for (t=all_ips; t!=NULL; t=t->next) {
1570                 if (t->pnn != pnn) {
1571                         continue;
1572                 }
1573                 /* Pass the rest of the IPs rather than the whole
1574                    all_ips input list.
1575                 */
1576                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1577         }
1578
1579         return imbalance;
1580 }
1581
1582 /* Allocate any unassigned IPs just by looping through the IPs and
1583  * finding the best node for each.
1584  */
1585 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1586                                       struct ctdb_node_map *nodemap,
1587                                       uint32_t mask,
1588                                       struct ctdb_public_ip_list *all_ips)
1589 {
1590         struct ctdb_public_ip_list *tmp_ip;
1591
1592         /* loop over all ip's and find a physical node to cover for 
1593            each unassigned ip.
1594         */
1595         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1596                 if (tmp_ip->pnn == -1) {
1597                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1598                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1599                                         ctdb_addr_to_str(&tmp_ip->addr)));
1600                         }
1601                 }
1602         }
1603 }
1604
1605 /* Basic non-deterministic rebalancing algorithm.
1606  */
1607 static bool basic_failback(struct ctdb_context *ctdb,
1608                            struct ctdb_node_map *nodemap,
1609                            uint32_t mask,
1610                            struct ctdb_public_ip_list *all_ips,
1611                            int num_ips,
1612                            int *retries)
1613 {
1614         int i;
1615         int maxnode, maxnum=0, minnode, minnum=0, num;
1616         struct ctdb_public_ip_list *tmp_ip;
1617
1618         /* for each ip address, loop over all nodes that can serve
1619            this ip and make sure that the difference between the node
1620            serving the most and the node serving the least ip's are
1621            not greater than 1.
1622         */
1623         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1624                 if (tmp_ip->pnn == -1) {
1625                         continue;
1626                 }
1627
1628                 /* Get the highest and lowest number of ips's served by any 
1629                    valid node which can serve this ip.
1630                 */
1631                 maxnode = -1;
1632                 minnode = -1;
1633                 for (i=0;i<nodemap->num;i++) {
1634                         if (nodemap->nodes[i].flags & mask) {
1635                                 continue;
1636                         }
1637
1638                         /* Only check nodes that are allowed to takeover an ip */
1639                         if (nodemap->nodes[i].flags & NODE_FLAGS_NOIPTAKEOVER) {
1640                                 continue;
1641                         }
1642
1643                         /* only check nodes that can actually serve this ip */
1644                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1645                                 /* no it couldnt   so skip to the next node */
1646                                 continue;
1647                         }
1648
1649                         num = node_ip_coverage(ctdb, i, all_ips);
1650                         if (maxnode == -1) {
1651                                 maxnode = i;
1652                                 maxnum  = num;
1653                         } else {
1654                                 if (num > maxnum) {
1655                                         maxnode = i;
1656                                         maxnum  = num;
1657                                 }
1658                         }
1659                         if (minnode == -1) {
1660                                 minnode = i;
1661                                 minnum  = num;
1662                         } else {
1663                                 if (num < minnum) {
1664                                         minnode = i;
1665                                         minnum  = num;
1666                                 }
1667                         }
1668                 }
1669                 if (maxnode == -1) {
1670                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1671                                 ctdb_addr_to_str(&tmp_ip->addr)));
1672
1673                         continue;
1674                 }
1675
1676                 /* If we want deterministic IPs then dont try to reallocate 
1677                    them to spread out the load.
1678                 */
1679                 if (1 == ctdb->tunable.deterministic_public_ips) {
1680                         continue;
1681                 }
1682
1683                 /* if the spread between the smallest and largest coverage by
1684                    a node is >=2 we steal one of the ips from the node with
1685                    most coverage to even things out a bit.
1686                    try to do this a limited number of times since we dont
1687                    want to spend too much time balancing the ip coverage.
1688                 */
1689                 if ( (maxnum > minnum+1)
1690                      && (*retries < (num_ips + 5)) ){
1691                         struct ctdb_public_ip_list *tmp;
1692
1693                         /* mark one of maxnode's vnn's as unassigned and try
1694                            again
1695                         */
1696                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1697                                 if (tmp->pnn == maxnode) {
1698                                         tmp->pnn = -1;
1699                                         (*retries)++;
1700                                         return true;
1701                                 }
1702                         }
1703                 }
1704         }
1705
1706         return false;
1707 }
1708
1709 struct ctdb_rebalancenodes {
1710         struct ctdb_rebalancenodes *next;
1711         uint32_t pnn;
1712 };
1713 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1714
1715
1716 /* set this flag to force the node to be rebalanced even if it just didnt
1717    become healthy again.
1718 */
1719 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1720 {
1721         struct ctdb_rebalancenodes *rebalance;
1722
1723         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1724                 if (rebalance->pnn == pnn) {
1725                         return;
1726                 }
1727         }
1728
1729         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1730         rebalance->pnn = pnn;
1731         rebalance->next = force_rebalance_list;
1732         force_rebalance_list = rebalance;
1733 }
1734
1735 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1736  * that we can unit test it.
1737  */
1738 static void lcp2_init(struct ctdb_context * tmp_ctx,
1739                struct ctdb_node_map * nodemap,
1740                uint32_t mask,
1741                struct ctdb_public_ip_list *all_ips,
1742                uint32_t **lcp2_imbalances,
1743                bool **newly_healthy)
1744 {
1745         int i;
1746         struct ctdb_public_ip_list *tmp_ip;
1747
1748         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1750         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1751         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752
1753         for (i=0;i<nodemap->num;i++) {
1754                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1755                 /* First step: is the node "healthy"? */
1756                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1757         }
1758
1759         /* 2nd step: if a ndoe has IPs assigned then it must have been
1760          * healthy before, so we remove it from consideration... */
1761         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1762                 if (tmp_ip->pnn != -1) {
1763                         (*newly_healthy)[tmp_ip->pnn] = false;
1764                 }
1765         }
1766
1767         /* 3rd step: if a node is forced to re-balance then
1768            we allow failback onto the node */
1769         while (force_rebalance_list != NULL) {
1770                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1771
1772                 if (force_rebalance_list->pnn <= nodemap->num) {
1773                         (*newly_healthy)[force_rebalance_list->pnn] = true;
1774                 }
1775
1776                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1777                 talloc_free(force_rebalance_list);
1778                 force_rebalance_list = next;
1779         }
1780 }
1781
1782 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1783  * the IP/node combination that will cost the least.
1784  */
1785 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1786                               struct ctdb_node_map *nodemap,
1787                               uint32_t mask,
1788                               struct ctdb_public_ip_list *all_ips,
1789                               uint32_t *lcp2_imbalances)
1790 {
1791         struct ctdb_public_ip_list *tmp_ip;
1792         int dstnode;
1793
1794         int minnode;
1795         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1796         struct ctdb_public_ip_list *minip;
1797
1798         bool should_loop = true;
1799         bool have_unassigned = true;
1800
1801         while (have_unassigned && should_loop) {
1802                 should_loop = false;
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1806
1807                 minnode = -1;
1808                 mindsum = 0;
1809                 minip = NULL;
1810
1811                 /* loop over each unassigned ip. */
1812                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1813                         if (tmp_ip->pnn != -1) {
1814                                 continue;
1815                         }
1816
1817                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1818                                 /* Only check nodes that are allowed to takeover an ip */
1819                                 if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1820                                         continue;
1821                                 }
1822
1823                                 /* only check nodes that can actually serve this ip */
1824                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1825                                         /* no it couldnt   so skip to the next node */
1826                                         continue;
1827                                 }
1828                                 if (nodemap->nodes[dstnode].flags & mask) {
1829                                         continue;
1830                                 }
1831
1832                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1833                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1834                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1835                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1836                                                    dstnode,
1837                                                    dstimbl - lcp2_imbalances[dstnode]));
1838
1839
1840                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1841                                         minnode = dstnode;
1842                                         minimbl = dstimbl;
1843                                         mindsum = dstdsum;
1844                                         minip = tmp_ip;
1845                                         should_loop = true;
1846                                 }
1847                         }
1848                 }
1849
1850                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1851
1852                 /* If we found one then assign it to the given node. */
1853                 if (minnode != -1) {
1854                         minip->pnn = minnode;
1855                         lcp2_imbalances[minnode] = minimbl;
1856                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1857                                           ctdb_addr_to_str(&(minip->addr)),
1858                                           minnode,
1859                                           mindsum));
1860                 }
1861
1862                 /* There might be a better way but at least this is clear. */
1863                 have_unassigned = false;
1864                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1865                         if (tmp_ip->pnn == -1) {
1866                                 have_unassigned = true;
1867                         }
1868                 }
1869         }
1870
1871         /* We know if we have an unassigned addresses so we might as
1872          * well optimise.
1873          */
1874         if (have_unassigned) {
1875                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876                         if (tmp_ip->pnn == -1) {
1877                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1878                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1879                         }
1880                 }
1881         }
1882 }
1883
1884 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1885  * to move IPs from, determines the best IP/destination node
1886  * combination to move from the source node.
1887  */
1888 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1889                                     struct ctdb_node_map *nodemap,
1890                                     struct ctdb_public_ip_list *all_ips,
1891                                     int srcnode,
1892                                     uint32_t candimbl,
1893                                     uint32_t *lcp2_imbalances,
1894                                     bool *newly_healthy)
1895 {
1896         int dstnode, mindstnode;
1897         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1898         uint32_t minsrcimbl, mindstimbl;
1899         struct ctdb_public_ip_list *minip;
1900         struct ctdb_public_ip_list *tmp_ip;
1901
1902         /* Find an IP and destination node that best reduces imbalance. */
1903         minip = NULL;
1904         minsrcimbl = 0;
1905         mindstnode = -1;
1906         mindstimbl = 0;
1907
1908         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1909         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1910
1911         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1912                 /* Only consider addresses on srcnode. */
1913                 if (tmp_ip->pnn != srcnode) {
1914                         continue;
1915                 }
1916
1917                 /* What is this IP address costing the source node? */
1918                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1919                 srcimbl = candimbl - srcdsum;
1920
1921                 /* Consider this IP address would cost each potential
1922                  * destination node.  Destination nodes are limited to
1923                  * those that are newly healthy, since we don't want
1924                  * to do gratuitous failover of IPs just to make minor
1925                  * balance improvements.
1926                  */
1927                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1928                         if (! newly_healthy[dstnode]) {
1929                                 continue;
1930                         }
1931
1932                         /* Only check nodes that are allowed to takeover an ip */
1933                         if (nodemap->nodes[dstnode].flags & NODE_FLAGS_NOIPTAKEOVER) {
1934                                 continue;
1935                         }
1936
1937                         /* only check nodes that can actually serve this ip */
1938                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1939                                 /* no it couldnt   so skip to the next node */
1940                                 continue;
1941                         }
1942
1943                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1944                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1945                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1946                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1947                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1948                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1949
1950                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1951                             ((mindstnode == -1) ||                              \
1952                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1953
1954                                 minip = tmp_ip;
1955                                 minsrcimbl = srcimbl;
1956                                 mindstnode = dstnode;
1957                                 mindstimbl = dstimbl;
1958                         }
1959                 }
1960         }
1961         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1962
1963         if (mindstnode != -1) {
1964                 /* We found a move that makes things better... */
1965                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1966                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1967                                   ctdb_addr_to_str(&(minip->addr)),
1968                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1969
1970
1971                 lcp2_imbalances[srcnode] = srcimbl;
1972                 lcp2_imbalances[mindstnode] = mindstimbl;
1973                 minip->pnn = mindstnode;
1974
1975                 return true;
1976         }
1977
1978         return false;
1979         
1980 }
1981
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders entries
 * by descending imbalance: returns -1 when a has the larger imbalance,
 * 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2000
2001 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2002  * node with the highest LCP2 imbalance, and then determines the best
2003  * IP/destination node combination to move from the source node.
2004  */
2005 static bool lcp2_failback(struct ctdb_context *ctdb,
2006                           struct ctdb_node_map *nodemap,
2007                           uint32_t mask,
2008                           struct ctdb_public_ip_list *all_ips,
2009                           uint32_t *lcp2_imbalances,
2010                           bool *newly_healthy)
2011 {
2012         int i, num_newly_healthy;
2013         struct lcp2_imbalance_pnn * lips;
2014         bool ret;
2015
2016         /* It is only worth continuing if we have suitable target
2017          * nodes to transfer IPs to.  This check is much cheaper than
2018          * continuing on...
2019          */
2020         num_newly_healthy = 0;
2021         for (i = 0; i < nodemap->num; i++) {
2022                 if (newly_healthy[i]) {
2023                         num_newly_healthy++;
2024                 }
2025         }
2026         if (num_newly_healthy == 0) {
2027                 return false;
2028         }
2029
2030         /* Put the imbalances and nodes into an array, sort them and
2031          * iterate through candidates.  Usually the 1st one will be
2032          * used, so this doesn't cost much...
2033          */
2034         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2035         for (i = 0; i < nodemap->num; i++) {
2036                 lips[i].imbalance = lcp2_imbalances[i];
2037                 lips[i].pnn = i;
2038         }
2039         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2040               lcp2_cmp_imbalance_pnn);
2041
2042         ret = false;
2043         for (i = 0; i < nodemap->num; i++) {
2044                 /* This means that all nodes had 0 or 1 addresses, so
2045                  * can't be imbalanced.
2046                  */
2047                 if (lips[i].imbalance == 0) {
2048                         break;
2049                 }
2050
2051                 if (lcp2_failback_candidate(ctdb,
2052                                             nodemap,
2053                                             all_ips,
2054                                             lips[i].pnn,
2055                                             lips[i].imbalance,
2056                                             lcp2_imbalances,
2057                                             newly_healthy)) {
2058                         ret = true;
2059                         break;
2060                 }
2061         }
2062
2063         talloc_free(lips);
2064         return ret;
2065 }
2066
2067 /* The calculation part of the IP allocation algorithm. */
2068 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2069                                    struct ctdb_node_map *nodemap,
2070                                    struct ctdb_public_ip_list **all_ips_p)
2071 {
2072         int i, num_healthy, retries, num_ips;
2073         uint32_t mask;
2074         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2075         uint32_t *lcp2_imbalances;
2076         bool *newly_healthy;
2077
2078         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2079
2080         /* Count how many completely healthy nodes we have */
2081         num_healthy = 0;
2082         for (i=0;i<nodemap->num;i++) {
2083                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2084                         num_healthy++;
2085                 }
2086         }
2087
2088         /* If we have healthy nodes then we will only consider them
2089            for serving public addresses
2090         */
2091         mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
2092         if ((num_healthy == 0) &&
2093             (ctdb->tunable.no_ip_takeover_on_disabled == 0)) {
2094                 /* We didnt have any completely healthy nodes so
2095                    use "disabled" nodes as a fallback
2096                 */
2097                 mask = NODE_FLAGS_INACTIVE;
2098         }
2099
2100         /* since nodes only know about those public addresses that
2101            can be served by that particular node, no single node has
2102            a full list of all public addresses that exist in the cluster.
2103            Walk over all node structures and create a merged list of
2104            all public addresses that exist in the cluster.
2105
2106            keep the tree of ips around as ctdb->ip_tree
2107         */
2108         all_ips = create_merged_ip_list(ctdb);
2109         *all_ips_p = all_ips; /* minimal code changes */
2110
2111         /* Count how many ips we have */
2112         num_ips = 0;
2113         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2114                 num_ips++;
2115         }
2116
2117         /* If we want deterministic ip allocations, i.e. that the ip addresses
2118            will always be allocated the same way for a specific set of
2119            available/unavailable nodes.
2120         */
2121         if (1 == ctdb->tunable.deterministic_public_ips) {              
2122                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2123                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2124                         tmp_ip->pnn = i%nodemap->num;
2125                 }
2126
2127                 /* IP failback doesn't make sense with deterministic
2128                  * IPs, since the modulo step above implicitly fails
2129                  * back IPs to their "home" node.
2130                  */
2131                 if (1 == ctdb->tunable.no_ip_failback) {
2132                         DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2133                 }
2134         }
2135
2136
2137         /* mark all public addresses with a masked node as being served by
2138            node -1
2139         */
2140         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2141                 if (tmp_ip->pnn == -1) {
2142                         continue;
2143                 }
2144                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
2145                         tmp_ip->pnn = -1;
2146                 }
2147         }
2148
2149         /* verify that the assigned nodes can serve that public ip
2150            and set it to -1 if not
2151         */
2152         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2153                 if (tmp_ip->pnn == -1) {
2154                         continue;
2155                 }
2156                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
2157                         /* this node can not serve this ip. */
2158                         tmp_ip->pnn = -1;
2159                 }
2160         }
2161
2162         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2163                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
2164         }
2165
2166         /* now we must redistribute all public addresses with takeover node
2167            -1 among the nodes available
2168         */
2169         retries = 0;
2170 try_again:
2171         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2172                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
2173         } else {
2174                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
2175         }
2176
2177         /* If we dont want ips to fail back after a node becomes healthy
2178            again, we wont even try to reallocat the ip addresses so that
2179            they are evenly spread out.
2180            This can NOT be used at the same time as DeterministicIPs !
2181         */
2182         if (1 == ctdb->tunable.no_ip_failback) {
2183                 goto finished;
2184         }
2185
2186
2187         /* now, try to make sure the ip adresses are evenly distributed
2188            across the node.
2189         */
2190         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2191                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
2192                         goto try_again;
2193                 }
2194         } else {
2195                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
2196                         goto try_again;
2197                 }
2198         }
2199
2200         /* finished distributing the public addresses, now just send the 
2201            info out to the nodes */
2202 finished:
2203         /* at this point ->pnn is the node which will own each IP
2204            or -1 if there is no node that can cover this ip
2205         */
2206
2207         talloc_free(tmp_ctx);
2208
2209         return;
2210 }
2211
2212 static void noiptakeover_cb(struct ctdb_context *ctdb, uint32_t pnn, int32_t res, TDB_DATA outdata, void *callback)
2213 {
2214         struct ctdb_node_map *nodemap = (struct ctdb_node_map *)callback;
2215
2216         if (res != 0) {
2217                 DEBUG(DEBUG_ERR,("Failure to read NoIPTakeover tunable from remote node %d\n", pnn));
2218                 return;
2219         }
2220
2221         if (outdata.dsize != sizeof(uint32_t)) {
2222                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading NoIPTakeover tunable from node %d. Expected %d bytes but received %d bytes\n", pnn, (int)sizeof(uint32_t), (int)outdata.dsize));
2223                 return;
2224         }
2225
2226         if (pnn >= nodemap->num) {
2227                 DEBUG(DEBUG_ERR,("Got NoIPTakeover reply from node %d but nodemap only has %d entries\n", pnn, nodemap->num));
2228                 return;
2229         }
2230
2231         if (*(uint32_t *)outdata.dptr != 0) {
2232                 nodemap->nodes[pnn].flags |= NODE_FLAGS_NOIPTAKEOVER;
2233         }
2234 }
2235
2236 /*
2237   make any IP alias changes for public addresses that are necessary 
2238  */
2239 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2240                       client_async_callback fail_callback, void *callback_data)
2241 {
2242         int i;
2243         struct ctdb_public_ip ip;
2244         struct ctdb_public_ipv4 ipv4;
2245         struct ctdb_control_get_tunable *t;
2246         uint32_t *nodes;
2247         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2248         TDB_DATA data;
2249         struct timeval timeout;
2250         struct client_async_data *async_data;
2251         struct ctdb_client_control_state *state;
2252         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2253         uint32_t disable_timeout;
2254
2255         /*
2256          * ip failover is completely disabled, just send out the 
2257          * ipreallocated event.
2258          */
2259         if (ctdb->tunable.disable_ip_failover != 0) {
2260                 goto ipreallocated;
2261         }
2262
2263
2264         /* assume all nodes do support failback */
2265         for (i=0;i<nodemap->num;i++) {
2266                 nodemap->nodes[i].flags &= ~NODE_FLAGS_NOIPTAKEOVER;
2267         }
2268         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen("NoIPTakeover") + 1;
2269         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2270         t = (struct ctdb_control_get_tunable *)data.dptr;
2271         t->length = strlen("NoIPTakeover")+1;
2272         memcpy(t->name, "NoIPTakeover", t->length);
2273         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2274         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2275                                       nodes, 0, TAKEOVER_TIMEOUT(),
2276                                       false, data,
2277                                       noiptakeover_cb, NULL,
2278                                       nodemap) != 0) {
2279                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get noiptakeover tunable failed\n"));
2280         }
2281         talloc_free(nodes);
2282         talloc_free(data.dptr);
2283
2284
2285         ZERO_STRUCT(ip);
2286
2287         /* Do the IP reassignment calculations */
2288         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2289
2290         /* The recovery daemon does regular sanity checks of the IPs.
2291          * However, sometimes it is overzealous and thinks changes are
2292          * required when they're already underway.  This stops the
2293          * checks for a while before we start moving IPs.
2294          */
2295         disable_timeout = ctdb->tunable.takeover_timeout;
2296         data.dptr  = (uint8_t*)&disable_timeout;
2297         data.dsize = sizeof(disable_timeout);
2298         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2299                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2300                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2301         }
2302
2303         /* now tell all nodes to delete any alias that they should not
2304            have.  This will be a NOOP on nodes that don't currently
2305            hold the given alias */
2306         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2307         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2308
2309         async_data->fail_callback = fail_callback;
2310         async_data->callback_data = callback_data;
2311
2312         for (i=0;i<nodemap->num;i++) {
2313                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2314                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2315                         continue;
2316                 }
2317
2318                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2319                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2320                                 /* This node should be serving this
2321                                    vnn so dont tell it to release the ip
2322                                 */
2323                                 continue;
2324                         }
2325                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2326                                 ipv4.pnn = tmp_ip->pnn;
2327                                 ipv4.sin = tmp_ip->addr.ip;
2328
2329                                 timeout = TAKEOVER_TIMEOUT();
2330                                 data.dsize = sizeof(ipv4);
2331                                 data.dptr  = (uint8_t *)&ipv4;
2332                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2333                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2334                                                 data, async_data,
2335                                                 &timeout, NULL);
2336                         } else {
2337                                 ip.pnn  = tmp_ip->pnn;
2338                                 ip.addr = tmp_ip->addr;
2339
2340                                 timeout = TAKEOVER_TIMEOUT();
2341                                 data.dsize = sizeof(ip);
2342                                 data.dptr  = (uint8_t *)&ip;
2343                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2344                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2345                                                 data, async_data,
2346                                                 &timeout, NULL);
2347                         }
2348
2349                         if (state == NULL) {
2350                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2351                                 talloc_free(tmp_ctx);
2352                                 return -1;
2353                         }
2354                 
2355                         ctdb_client_async_add(async_data, state);
2356                 }
2357         }
2358         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2359                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2360                 talloc_free(tmp_ctx);
2361                 return -1;
2362         }
2363         talloc_free(async_data);
2364
2365
2366         /* tell all nodes to get their own IPs */
2367         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2368         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2369
2370         async_data->fail_callback = fail_callback;
2371         async_data->callback_data = callback_data;
2372
2373         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2374                 if (tmp_ip->pnn == -1) {
2375                         /* this IP won't be taken over */
2376                         continue;
2377                 }
2378
2379                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2380                         ipv4.pnn = tmp_ip->pnn;
2381                         ipv4.sin = tmp_ip->addr.ip;
2382
2383                         timeout = TAKEOVER_TIMEOUT();
2384                         data.dsize = sizeof(ipv4);
2385                         data.dptr  = (uint8_t *)&ipv4;
2386                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2387                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2388                                         data, async_data,
2389                                         &timeout, NULL);
2390                 } else {
2391                         ip.pnn  = tmp_ip->pnn;
2392                         ip.addr = tmp_ip->addr;
2393
2394                         timeout = TAKEOVER_TIMEOUT();
2395                         data.dsize = sizeof(ip);
2396                         data.dptr  = (uint8_t *)&ip;
2397                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2398                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2399                                         data, async_data,
2400                                         &timeout, NULL);
2401                 }
2402                 if (state == NULL) {
2403                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2404                         talloc_free(tmp_ctx);
2405                         return -1;
2406                 }
2407                 
2408                 ctdb_client_async_add(async_data, state);
2409         }
2410         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2411                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2412                 talloc_free(tmp_ctx);
2413                 return -1;
2414         }
2415
2416 ipreallocated:
2417         /* 
2418          * Tell all nodes to run eventscripts to process the
2419          * "ipreallocated" event.  This can do a lot of things,
2420          * including restarting services to reconfigure them if public
2421          * IPs have moved.  Once upon a time this event only used to
2422          * update natwg.
2423          */
2424         data.dptr  = discard_const("ipreallocated");
2425         data.dsize = strlen((char *)data.dptr) + 1; 
2426         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2427         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2428                                       nodes, 0, TAKEOVER_TIMEOUT(),
2429                                       false, data,
2430                                       NULL, fail_callback,
2431                                       callback_data) != 0) {
2432                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2433         }
2434
2435         talloc_free(tmp_ctx);
2436         return 0;
2437 }
2438
2439
2440 /*
2441   destroy a ctdb_client_ip structure
2442  */
2443 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2444 {
2445         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2446                 ctdb_addr_to_str(&ip->addr),
2447                 ntohs(ip->addr.ip.sin_port),
2448                 ip->client_id));
2449
2450         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2451         return 0;
2452 }
2453
2454 /*
2455   called by a client to inform us of a TCP connection that it is managing
2456   that should tickled with an ACK when IP takeover is done
2457   we handle both the old ipv4 style of packets as well as the new ipv4/6
2458   pdus.
2459  */
2460 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2461                                 TDB_DATA indata)
2462 {
2463         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2464         struct ctdb_control_tcp *old_addr = NULL;
2465         struct ctdb_control_tcp_addr new_addr;
2466         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2467         struct ctdb_tcp_list *tcp;
2468         struct ctdb_tcp_connection t;
2469         int ret;
2470         TDB_DATA data;
2471         struct ctdb_client_ip *ip;
2472         struct ctdb_vnn *vnn;
2473         ctdb_sock_addr addr;
2474
2475         switch (indata.dsize) {
2476         case sizeof(struct ctdb_control_tcp):
2477                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2478                 ZERO_STRUCT(new_addr);
2479                 tcp_sock = &new_addr;
2480                 tcp_sock->src.ip  = old_addr->src;
2481                 tcp_sock->dest.ip = old_addr->dest;
2482                 break;
2483         case sizeof(struct ctdb_control_tcp_addr):
2484                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2485                 break;
2486         default:
2487                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2488                                  "to ctdb_control_tcp_client. size was %d but "
2489                                  "only allowed sizes are %lu and %lu\n",
2490                                  (int)indata.dsize,
2491                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2492                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2493                 return -1;
2494         }
2495
2496         addr = tcp_sock->src;
2497         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2498         addr = tcp_sock->dest;
2499         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2500
2501         ZERO_STRUCT(addr);
2502         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2503         vnn = find_public_ip_vnn(ctdb, &addr);
2504         if (vnn == NULL) {
2505                 switch (addr.sa.sa_family) {
2506                 case AF_INET:
2507                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2508                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2509                                         ctdb_addr_to_str(&addr)));
2510                         }
2511                         break;
2512                 case AF_INET6:
2513                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2514                                 ctdb_addr_to_str(&addr)));
2515                         break;
2516                 default:
2517                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2518                 }
2519
2520                 return 0;
2521         }
2522
2523         if (vnn->pnn != ctdb->pnn) {
2524                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2525                         ctdb_addr_to_str(&addr),
2526                         client_id, client->pid));
2527                 /* failing this call will tell smbd to die */
2528                 return -1;
2529         }
2530
2531         ip = talloc(client, struct ctdb_client_ip);
2532         CTDB_NO_MEMORY(ctdb, ip);
2533
2534         ip->ctdb      = ctdb;
2535         ip->addr      = addr;
2536         ip->client_id = client_id;
2537         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2538         DLIST_ADD(ctdb->client_ip_list, ip);
2539
2540         tcp = talloc(client, struct ctdb_tcp_list);
2541         CTDB_NO_MEMORY(ctdb, tcp);
2542
2543         tcp->connection.src_addr = tcp_sock->src;
2544         tcp->connection.dst_addr = tcp_sock->dest;
2545
2546         DLIST_ADD(client->tcp_list, tcp);
2547
2548         t.src_addr = tcp_sock->src;
2549         t.dst_addr = tcp_sock->dest;
2550
2551         data.dptr = (uint8_t *)&t;
2552         data.dsize = sizeof(t);
2553
2554         switch (addr.sa.sa_family) {
2555         case AF_INET:
2556                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2557                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2558                         ctdb_addr_to_str(&tcp_sock->src),
2559                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2560                 break;
2561         case AF_INET6:
2562                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2563                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2564                         ctdb_addr_to_str(&tcp_sock->src),
2565                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2566                 break;
2567         default:
2568                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2569         }
2570
2571
2572         /* tell all nodes about this tcp connection */
2573         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2574                                        CTDB_CONTROL_TCP_ADD,
2575                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2576         if (ret != 0) {
2577                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2578                 return -1;
2579         }
2580
2581         return 0;
2582 }
2583
2584 /*
2585   find a tcp address on a list
2586  */
2587 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2588                                            struct ctdb_tcp_connection *tcp)
2589 {
2590         int i;
2591
2592         if (array == NULL) {
2593                 return NULL;
2594         }
2595
2596         for (i=0;i<array->num;i++) {
2597                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2598                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2599                         return &array->connections[i];
2600                 }
2601         }
2602         return NULL;
2603 }
2604
2605
2606
2607 /*
2608   called by a daemon to inform us of a TCP connection that one of its
2609   clients managing that should tickled with an ACK when IP takeover is
2610   done
2611  */
2612 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2613 {
2614         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2615         struct ctdb_tcp_array *tcparray;
2616         struct ctdb_tcp_connection tcp;
2617         struct ctdb_vnn *vnn;
2618
2619         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2620         if (vnn == NULL) {
2621                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2622                         ctdb_addr_to_str(&p->dst_addr)));
2623
2624                 return -1;
2625         }
2626
2627
2628         tcparray = vnn->tcp_array;
2629
2630         /* If this is the first tickle */
2631         if (tcparray == NULL) {
2632                 tcparray = talloc_size(ctdb->nodes, 
2633                         offsetof(struct ctdb_tcp_array, connections) +
2634                         sizeof(struct ctdb_tcp_connection) * 1);
2635                 CTDB_NO_MEMORY(ctdb, tcparray);
2636                 vnn->tcp_array = tcparray;
2637
2638                 tcparray->num = 0;
2639                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2640                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2641
2642                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2643                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2644                 tcparray->num++;
2645
2646                 if (tcp_update_needed) {
2647                         vnn->tcp_update_needed = true;
2648                 }
2649                 return 0;
2650         }
2651
2652
2653         /* Do we already have this tickle ?*/
2654         tcp.src_addr = p->src_addr;
2655         tcp.dst_addr = p->dst_addr;
2656         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2657                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2658                         ctdb_addr_to_str(&tcp.dst_addr),
2659                         ntohs(tcp.dst_addr.ip.sin_port),
2660                         vnn->pnn));
2661                 return 0;
2662         }
2663
2664         /* A new tickle, we must add it to the array */
2665         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2666                                         struct ctdb_tcp_connection,
2667                                         tcparray->num+1);
2668         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2669
2670         vnn->tcp_array = tcparray;
2671         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2672         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2673         tcparray->num++;
2674                                 
2675         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2676                 ctdb_addr_to_str(&tcp.dst_addr),
2677                 ntohs(tcp.dst_addr.ip.sin_port),
2678                 vnn->pnn));
2679
2680         if (tcp_update_needed) {
2681                 vnn->tcp_update_needed = true;
2682         }
2683
2684         return 0;
2685 }
2686
2687
2688 /*
2689   called by a daemon to inform us of a TCP connection that one of its
2690   clients managing that should tickled with an ACK when IP takeover is
2691   done
2692  */
2693 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2694 {
2695         struct ctdb_tcp_connection *tcpp;
2696         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2697
2698         if (vnn == NULL) {
2699                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2700                         ctdb_addr_to_str(&conn->dst_addr)));
2701                 return;
2702         }
2703
2704         /* if the array is empty we cant remove it
2705            and we dont need to do anything
2706          */
2707         if (vnn->tcp_array == NULL) {
2708                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2709                         ctdb_addr_to_str(&conn->dst_addr),
2710                         ntohs(conn->dst_addr.ip.sin_port)));
2711                 return;
2712         }
2713
2714
2715         /* See if we know this connection
2716            if we dont know this connection  then we dont need to do anything
2717          */
2718         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2719         if (tcpp == NULL) {
2720                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2721                         ctdb_addr_to_str(&conn->dst_addr),
2722                         ntohs(conn->dst_addr.ip.sin_port)));
2723                 return;
2724         }
2725
2726
2727         /* We need to remove this entry from the array.
2728            Instead of allocating a new array and copying data to it
2729            we cheat and just copy the last entry in the existing array
2730            to the entry that is to be removed and just shring the 
2731            ->num field
2732          */
2733         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2734         vnn->tcp_array->num--;
2735
2736         /* If we deleted the last entry we also need to remove the entire array
2737          */
2738         if (vnn->tcp_array->num == 0) {
2739                 talloc_free(vnn->tcp_array);
2740                 vnn->tcp_array = NULL;
2741         }               
2742
2743         vnn->tcp_update_needed = true;
2744
2745         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2746                 ctdb_addr_to_str(&conn->src_addr),
2747                 ntohs(conn->src_addr.ip.sin_port)));
2748 }
2749
2750
2751 /*
2752   called by a daemon to inform us of a TCP connection that one of its
2753   clients used are no longer needed in the tickle database
2754  */
2755 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2756 {
2757         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2758
2759         ctdb_remove_tcp_connection(ctdb, conn);
2760
2761         return 0;
2762 }
2763
2764
/*
  called when a daemon restarts.  the intention is to send all tickles for
  all public addresses we are serving to the new node immediately; that is
  not implemented yet, so this is currently a stub that ignores both
  arguments and reports success (0).
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        const int32_t status = 0;

        /*XXX here we should send all tickes we are serving to the new node */

        return status;
}
2774
2775
2776 /*
2777   called when a client structure goes away - hook to remove
2778   elements from the tcp_list in all daemons
2779  */
2780 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2781 {
2782         while (client->tcp_list) {
2783                 struct ctdb_tcp_list *tcp = client->tcp_list;
2784                 DLIST_REMOVE(client->tcp_list, tcp);
2785                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2786         }
2787 }
2788
2789
2790 /*
2791   release all IPs on shutdown
2792  */
2793 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2794 {
2795         struct ctdb_vnn *vnn;
2796
2797         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2798                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2799                         ctdb_vnn_unassign_iface(ctdb, vnn);
2800                         continue;
2801                 }
2802                 if (!vnn->iface) {
2803                         continue;
2804                 }
2805                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2806                                   ctdb_vnn_iface_string(vnn),
2807                                   ctdb_addr_to_str(&vnn->public_address),
2808                                   vnn->public_netmask_bits);
2809                 release_kill_clients(ctdb, &vnn->public_address);
2810                 ctdb_vnn_unassign_iface(ctdb, vnn);
2811         }
2812 }
2813
2814
2815 /*
2816   get list of public IPs
2817  */
2818 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2819                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2820 {
2821         int i, num, len;
2822         struct ctdb_all_public_ips *ips;
2823         struct ctdb_vnn *vnn;
2824         bool only_available = false;
2825
2826         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2827                 only_available = true;
2828         }
2829
2830         /* count how many public ip structures we have */
2831         num = 0;
2832         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2833                 num++;
2834         }
2835
2836         len = offsetof(struct ctdb_all_public_ips, ips) + 
2837                 num*sizeof(struct ctdb_public_ip);
2838         ips = talloc_zero_size(outdata, len);
2839         CTDB_NO_MEMORY(ctdb, ips);
2840
2841         i = 0;
2842         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2843                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2844                         continue;
2845                 }
2846                 ips->ips[i].pnn  = vnn->pnn;
2847                 ips->ips[i].addr = vnn->public_address;
2848                 i++;
2849         }
2850         ips->num = i;
2851         len = offsetof(struct ctdb_all_public_ips, ips) +
2852                 i*sizeof(struct ctdb_public_ip);
2853
2854         outdata->dsize = len;
2855         outdata->dptr  = (uint8_t *)ips;
2856
2857         return 0;
2858 }
2859
2860
2861 /*
2862   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2863  */
2864 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2865                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2866 {
2867         int i, num, len;
2868         struct ctdb_all_public_ipsv4 *ips;
2869         struct ctdb_vnn *vnn;
2870
2871         /* count how many public ip structures we have */
2872         num = 0;
2873         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2874                 if (vnn->public_address.sa.sa_family != AF_INET) {
2875                         continue;
2876                 }
2877                 num++;
2878         }
2879
2880         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2881                 num*sizeof(struct ctdb_public_ipv4);
2882         ips = talloc_zero_size(outdata, len);
2883         CTDB_NO_MEMORY(ctdb, ips);
2884
2885         outdata->dsize = len;
2886         outdata->dptr  = (uint8_t *)ips;
2887
2888         ips->num = num;
2889         i = 0;
2890         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2891                 if (vnn->public_address.sa.sa_family != AF_INET) {
2892                         continue;
2893                 }
2894                 ips->ips[i].pnn = vnn->pnn;
2895                 ips->ips[i].sin = vnn->public_address.ip;
2896                 i++;
2897         }
2898
2899         return 0;
2900 }
2901
2902 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2903                                         struct ctdb_req_control *c,
2904                                         TDB_DATA indata,
2905                                         TDB_DATA *outdata)
2906 {
2907         int i, num, len;
2908         ctdb_sock_addr *addr;
2909         struct ctdb_control_public_ip_info *info;
2910         struct ctdb_vnn *vnn;
2911
2912         addr = (ctdb_sock_addr *)indata.dptr;
2913
2914         vnn = find_public_ip_vnn(ctdb, addr);
2915         if (vnn == NULL) {
2916                 /* if it is not a public ip   it could be our 'single ip' */
2917                 if (ctdb->single_ip_vnn) {
2918                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2919                                 vnn = ctdb->single_ip_vnn;
2920                         }
2921                 }
2922         }
2923         if (vnn == NULL) {
2924                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2925                                  "'%s'not a public address\n",
2926                                  ctdb_addr_to_str(addr)));
2927                 return -1;
2928         }
2929
2930         /* count how many public ip structures we have */
2931         num = 0;
2932         for (;vnn->ifaces[num];) {
2933                 num++;
2934         }
2935
2936         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2937                 num*sizeof(struct ctdb_control_iface_info);
2938         info = talloc_zero_size(outdata, len);
2939         CTDB_NO_MEMORY(ctdb, info);
2940
2941         info->ip.addr = vnn->public_address;
2942         info->ip.pnn = vnn->pnn;
2943         info->active_idx = 0xFFFFFFFF;
2944
2945         for (i=0; vnn->ifaces[i]; i++) {
2946                 struct ctdb_iface *cur;
2947
2948                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2949                 if (cur == NULL) {
2950                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2951                                            vnn->ifaces[i]));
2952                         return -1;
2953                 }
2954                 if (vnn->iface == cur) {
2955                         info->active_idx = i;
2956                 }
2957                 strcpy(info->ifaces[i].name, cur->name);
2958                 info->ifaces[i].link_state = cur->link_up;
2959                 info->ifaces[i].references = cur->references;
2960         }
2961         info->num = i;
2962         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2963                 i*sizeof(struct ctdb_control_iface_info);
2964
2965         outdata->dsize = len;
2966         outdata->dptr  = (uint8_t *)info;
2967
2968         return 0;
2969 }
2970
2971 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2972                                 struct ctdb_req_control *c,
2973                                 TDB_DATA *outdata)
2974 {
2975         int i, num, len;
2976         struct ctdb_control_get_ifaces *ifaces;
2977         struct ctdb_iface *cur;
2978
2979         /* count how many public ip structures we have */
2980         num = 0;
2981         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2982                 num++;
2983         }
2984
2985         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2986                 num*sizeof(struct ctdb_control_iface_info);
2987         ifaces = talloc_zero_size(outdata, len);
2988         CTDB_NO_MEMORY(ctdb, ifaces);
2989
2990         i = 0;
2991         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2992                 strcpy(ifaces->ifaces[i].name, cur->name);
2993                 ifaces->ifaces[i].link_state = cur->link_up;
2994                 ifaces->ifaces[i].references = cur->references;
2995                 i++;
2996         }
2997         ifaces->num = i;
2998         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2999                 i*sizeof(struct ctdb_control_iface_info);
3000
3001         outdata->dsize = len;
3002         outdata->dptr  = (uint8_t *)ifaces;
3003
3004         return 0;
3005 }
3006
3007 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3008                                     struct ctdb_req_control *c,
3009                                     TDB_DATA indata)
3010 {
3011         struct ctdb_control_iface_info *info;
3012         struct ctdb_iface *iface;
3013         bool link_up = false;
3014
3015         info = (struct ctdb_control_iface_info *)indata.dptr;
3016
3017         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3018                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3019                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3020                                   len, len, info->name));
3021                 return -1;
3022         }
3023
3024         switch (info->link_state) {
3025         case 0:
3026                 link_up = false;
3027                 break;
3028         case 1:
3029                 link_up = true;
3030                 break;
3031         default:
3032                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3033                                   (unsigned int)info->link_state));
3034                 return -1;
3035         }
3036
3037         if (info->references != 0) {
3038                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3039                                   (unsigned int)info->references));
3040                 return -1;
3041         }
3042
3043         iface = ctdb_find_iface(ctdb, info->name);
3044         if (iface == NULL) {
3045                 return -1;
3046         }
3047
3048         if (link_up == iface->link_up) {
3049                 return 0;
3050         }
3051
3052         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3053               ("iface[%s] has changed it's link status %s => %s\n",
3054                iface->name,
3055                iface->link_up?"up":"down",
3056                link_up?"up":"down"));
3057
3058         iface->link_up = link_up;
3059         return 0;
3060 }
3061
3062
3063 /* 
3064    structure containing the listening socket and the list of tcp connections
3065    that the ctdb daemon is to kill
3066 */
3067 struct ctdb_kill_tcp {
3068         struct ctdb_vnn *vnn;          /* NOTE(review): presumably the public address these connections belong to - confirm */
3069         struct ctdb_context *ctdb;     /* NOTE(review): presumably the owning daemon context - not referenced in the code visible here */
3070         int capture_fd;                /* fd capture_tcp_handler() reads TCP packets from (ctdb_sys_read_tcp_packet) */
3071         struct fd_event *fde;          /* NOTE(review): presumably the fd event watching capture_fd - confirm */
3072         trbt_tree_t *connections;      /* tree of struct ctdb_killtcp_con, looked up with killtcp_key() keys of KILLTCP_KEYLEN words */
3073         void *private_data;            /* opaque pointer passed to ctdb_sys_read_tcp_packet() together with capture_fd */
3074 };
3075
3076 /*
3077   a tcp connection that is to be killed
3078  */
3079 struct ctdb_killtcp_con {
3080         ctdb_sock_addr src_addr;       /* one endpoint of the connection; capture_tcp_handler() sends the reset from dst_addr to src_addr */
3081         ctdb_sock_addr dst_addr;       /* the other endpoint of the connection */
3082         int count;                     /* NOTE(review): presumably counts tickle attempts - not used in the code visible here, confirm */
3083         struct ctdb_kill_tcp *killtcp; /* NOTE(review): presumably a back pointer to the owning kill state - confirm */
3084 };
3085
3086 /* this function is used to create a key to represent this socketpair
3087    in the killtcp tree.
3088    this key is used to insert and lookup matching socketpairs that are
3089    to be tickled and RST
3090 */
3091 #define KILLTCP_KEYLEN  10
3092 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3093 {
3094         static uint32_t key[KILLTCP_KEYLEN];
3095
3096         bzero(key, sizeof(key));
3097
3098         if (src->sa.sa_family != dst->sa.sa_family) {
3099                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3100                 return key;
3101         }
3102         
3103         switch (src->sa.sa_family) {
3104         case AF_INET:
3105                 key[0]  = dst->ip.sin_addr.s_addr;
3106                 key[1]  = src->ip.sin_addr.s_addr;
3107                 key[2]  = dst->ip.sin_port;
3108                 key[3]  = src->ip.sin_port;
3109                 break;
3110         case AF_INET6: {
3111                 uint32_t *dst6_addr32 =
3112                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3113                 uint32_t *src6_addr32 =
3114                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3115                 key[0]  = dst6_addr32[3];
3116                 key[1]  = src6_addr32[3];
3117                 key[2]  = dst6_addr32[2];
3118                 key[3]  = src6_addr32[2];
3119                 key[4]  = dst6_addr32[1];
3120                 key[5]  = src6_addr32[1];
3121                 key[6]  = dst6_addr32[0];
3122                 key[7]  = src6_addr32[0];
3123                 key[8]  = dst->ip6.sin6_port;
3124                 key[9]  = src->ip6.sin6_port;
3125                 break;
3126         }
3127         default:
3128                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3129                 return key;
3130         }
3131
3132         return key;
3133 }
3134
3135 /*
3136   called when we get a read event on the raw socket
3137  */
3138 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3139                                 uint16_t flags, void *private_data)
3140 {
3141         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3142         struct ctdb_killtcp_con *con;
3143         ctdb_sock_addr src, dst;
3144         uint32_t ack_seq, seq;
3145
3146         if (!(flags & EVENT_FD_READ)) {
3147                 return;
3148         }
3149
3150         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3151                                 killtcp->private_data,
3152                                 &src, &dst,
3153                                 &ack_seq, &seq) != 0) {
3154                 /* probably a non-tcp ACK packet */
3155                 return;
3156         }
3157
3158         /* check if we have this guy in our list of connections
3159            to kill
3160         */
3161         con = trbt_lookuparray32(killtcp->connections, 
3162                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3163         if (con == NULL) {
3164                 /* no this was some other packet we can just ignore */
3165                 return;
3166         }
3167
3168         /* This one has been tickled !
3169            now reset him and remove him from the list.
3170          */
3171         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3172                 ntohs(con->dst_addr.ip.sin_port),
3173                 ctdb_addr_to_str(&con->src_addr),
3174                 ntohs(con->src_addr.ip.sin_port)));
3175
3176         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3177         talloc_free(con);
3178 }
3179
3180
3181 /* when traversing the list of all tcp connections to send tickle acks to
3182    (so that we can capture the ack coming back and kill the connection
3183     by a RST)
3184    this callback is called for each connection we are currently trying to kill
3185 */
3186 static int tickle_connection_traverse(void *param, void *data)
3187 {
3188         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3189
3190         /* have tried too many times, just give up */
3191         if (con->count >= 5) {
3192                 /* can't delete in traverse: reparent to delete_cons */
3193                 talloc_steal(param, con);
3194                 return 0;
3195         }
3196
3197         /* othervise, try tickling it again */
3198         con->count++;
3199         ctdb_sys_send_tcp(
3200                 (ctdb_sock_addr *)&con->dst_addr,
3201                 (ctdb_sock_addr *)&con->src_addr,
3202                 0, 0, 0);
3203         return 0;
3204 }
3205
3206
3207 /* 
3208    called every second until all sentenced connections have been reset
3209  */
3210 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3211                                               struct timeval t, void *private_data)
3212 {
3213         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3214         void *delete_cons = talloc_new(NULL);
3215
3216         /* loop over all connections sending tickle ACKs */
3217         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3218
3219         /* now we've finished traverse, it's safe to do deletion. */
3220         talloc_free(delete_cons);
3221
3222         /* If there are no more connections to kill we can remove the
3223            entire killtcp structure
3224          */
3225         if ( (killtcp->connections == NULL) || 
3226              (killtcp->connections->root == NULL) ) {
3227                 talloc_free(killtcp);
3228                 return;
3229         }
3230
3231         /* try tickling them again in a seconds time
3232          */
3233         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3234                         ctdb_tickle_sentenced_connections, killtcp);
3235 }
3236
3237 /*
3238   destroy the killtcp structure
3239  */
3240 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3241 {
3242         struct ctdb_vnn *tmpvnn;
3243
3244         /* verify that this vnn is still active */
3245         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3246                 if (tmpvnn == killtcp->vnn) {
3247                         break;
3248                 }
3249         }
3250
3251         if (tmpvnn == NULL) {
3252                 return 0;
3253         }
3254
3255         if (killtcp->vnn->killtcp != killtcp) {
3256                 return 0;
3257         }
3258
3259         killtcp->vnn->killtcp = NULL;
3260
3261         return 0;
3262 }
3263
3264
/* Insert callback for the killtcp tree: whatever was stored under the
   key before, the new connection structure takes its place.

   The old structure, if there was one, is deliberately not freed here;
   it is talloc_stolen by the same node in the tree anyway and will be
   deleted when the new data is deleted.

   parm  - the new data
   data  - the existing data (ignored)
   Returns parm.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	(void)data;

	return replacement;
}
3276
3277 /*
3278   add a tcp socket to the list of connections we want to RST
3279  */
3280 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3281                                        ctdb_sock_addr *s,
3282                                        ctdb_sock_addr *d)
3283 {
3284         ctdb_sock_addr src, dst;
3285         struct ctdb_kill_tcp *killtcp;
3286         struct ctdb_killtcp_con *con;
3287         struct ctdb_vnn *vnn;
3288
3289         ctdb_canonicalize_ip(s, &src);
3290         ctdb_canonicalize_ip(d, &dst);
3291
3292         vnn = find_public_ip_vnn(ctdb, &dst);
3293         if (vnn == NULL) {
3294                 vnn = find_public_ip_vnn(ctdb, &src);
3295         }
3296         if (vnn == NULL) {
3297                 /* if it is not a public ip   it could be our 'single ip' */
3298                 if (ctdb->single_ip_vnn) {
3299                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3300                                 vnn = ctdb->single_ip_vnn;
3301                         }
3302                 }
3303         }
3304         if (vnn == NULL) {
3305                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3306                 return -1;
3307         }
3308
3309         killtcp = vnn->killtcp;
3310         
3311         /* If this is the first connection to kill we must allocate
3312            a new structure
3313          */
3314         if (killtcp == NULL) {
3315                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3316                 CTDB_NO_MEMORY(ctdb, killtcp);
3317
3318                 killtcp->vnn         = vnn;
3319                 killtcp->ctdb        = ctdb;
3320                 killtcp->capture_fd  = -1;
3321                 killtcp->connections = trbt_create(killtcp, 0);
3322
3323                 vnn->killtcp         = killtcp;
3324                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3325         }
3326
3327
3328
3329         /* create a structure that describes this connection we want to
3330            RST and store it in killtcp->connections
3331         */
3332         con = talloc(killtcp, struct ctdb_killtcp_con);
3333         CTDB_NO_MEMORY(ctdb, con);
3334         con->src_addr = src;
3335         con->dst_addr = dst;
3336         con->count    = 0;
3337         con->killtcp  = killtcp;
3338
3339
3340         trbt_insertarray32_callback(killtcp->connections,
3341                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3342                         add_killtcp_callback, con);
3343
3344         /* 
3345            If we dont have a socket to listen on yet we must create it
3346          */
3347         if (killtcp->capture_fd == -1) {
3348                 const char *iface = ctdb_vnn_iface_string(vnn);
3349                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3350                 if (killtcp->capture_fd == -1) {
3351                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3352                                           "socket on iface '%s' for killtcp (%s)\n",
3353                                           iface, strerror(errno)));
3354                         goto failed;
3355                 }
3356         }
3357
3358
3359         if (killtcp->fde == NULL) {
3360                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3361                                             EVENT_FD_READ,
3362                                             capture_tcp_handler, killtcp);
3363                 tevent_fd_set_auto_close(killtcp->fde);
3364
3365                 /* We also need to set up some events to tickle all these connections
3366                    until they are all reset
3367                 */
3368                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3369                                 ctdb_tickle_sentenced_connections, killtcp);
3370         }
3371
3372         /* tickle him once now */
3373         ctdb_sys_send_tcp(
3374                 &con->dst_addr,
3375                 &con->src_addr,
3376                 0, 0, 0);
3377
3378         return 0;
3379
3380 failed:
3381         talloc_free(vnn->killtcp);
3382         vnn->killtcp = NULL;
3383         return -1;
3384 }
3385
3386 /*
3387   kill a TCP connection.
3388  */
3389 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3390 {
3391         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3392
3393         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3394 }
3395
3396 /*
3397   called by a daemon to inform us of the entire list of TCP tickles for
3398   a particular public address.
3399   this control should only be sent by the node that is currently serving
3400   that public address.
3401  */
3402 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3403 {
3404         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3405         struct ctdb_tcp_array *tcparray;
3406         struct ctdb_vnn *vnn;
3407
3408         /* We must at least have tickles.num or else we cant verify the size
3409            of the received data blob
3410          */
3411         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3412                                         tickles.connections)) {
3413                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3414                 return -1;
3415         }
3416
3417         /* verify that the size of data matches what we expect */
3418         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3419                                 tickles.connections)
3420                          + sizeof(struct ctdb_tcp_connection)
3421                                  * list->tickles.num) {
3422                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3423                 return -1;
3424         }       
3425
3426         vnn = find_public_ip_vnn(ctdb, &list->addr);
3427         if (vnn == NULL) {
3428                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3429                         ctdb_addr_to_str(&list->addr)));
3430
3431                 return 1;
3432         }
3433
3434         /* remove any old ticklelist we might have */
3435         talloc_free(vnn->tcp_array);
3436         vnn->tcp_array = NULL;
3437
3438         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3439         CTDB_NO_MEMORY(ctdb, tcparray);
3440
3441         tcparray->num = list->tickles.num;
3442
3443         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3444         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3445
3446         memcpy(tcparray->connections, &list->tickles.connections[0], 
3447                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3448
3449         /* We now have a new fresh tickle list array for this vnn */
3450         vnn->tcp_array = talloc_steal(vnn, tcparray);
3451         
3452         return 0;
3453 }
3454
3455 /*
3456   called to return the full list of tickles for the puclic address associated 
3457   with the provided vnn
3458  */
3459 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3460 {
3461         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3462         struct ctdb_control_tcp_tickle_list *list;
3463         struct ctdb_tcp_array *tcparray;
3464         int num;
3465         struct ctdb_vnn *vnn;
3466
3467         vnn = find_public_ip_vnn(ctdb, addr);
3468         if (vnn == NULL) {
3469                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3470                         ctdb_addr_to_str(addr)));
3471
3472                 return 1;
3473         }
3474
3475         tcparray = vnn->tcp_array;
3476         if (tcparray) {
3477                 num = tcparray->num;
3478         } else {
3479                 num = 0;
3480         }
3481
3482         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3483                                 tickles.connections)
3484                         + sizeof(struct ctdb_tcp_connection) * num;
3485
3486         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3487         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3488         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3489
3490         list->addr = *addr;
3491         list->tickles.num = num;
3492         if (num) {
3493                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3494                         sizeof(struct ctdb_tcp_connection) * num);
3495         }
3496
3497         return 0;
3498 }
3499
3500
3501 /*
3502   set the list of all tcp tickles for a public address
3503  */
3504 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3505                               struct timeval timeout, uint32_t destnode, 
3506                               ctdb_sock_addr *addr,
3507                               struct ctdb_tcp_array *tcparray)
3508 {
3509         int ret, num;
3510         TDB_DATA data;
3511         struct ctdb_control_tcp_tickle_list *list;
3512
3513         if (tcparray) {
3514                 num = tcparray->num;
3515         } else {
3516                 num = 0;
3517         }
3518
3519         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3520                                 tickles.connections) +
3521                         sizeof(struct ctdb_tcp_connection) * num;
3522         data.dptr = talloc_size(ctdb, data.dsize);
3523         CTDB_NO_MEMORY(ctdb, data.dptr);
3524
3525         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3526         list->addr = *addr;
3527         list->tickles.num = num;
3528         if (tcparray) {
3529                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3530         }
3531
3532         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3533                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3534                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3535         if (ret != 0) {
3536                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3537                 return -1;
3538         }
3539
3540         talloc_free(data.dptr);
3541
3542         return ret;
3543 }
3544
3545
3546 /*
3547   perform tickle updates if required
3548  */
3549 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3550                                 struct timed_event *te, 
3551                                 struct timeval t, void *private_data)
3552 {
3553         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3554         int ret;
3555         struct ctdb_vnn *vnn;
3556
3557         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3558                 /* we only send out updates for public addresses that 
3559                    we have taken over
3560                  */
3561                 if (ctdb->pnn != vnn->pnn) {
3562                         continue;
3563                 }
3564                 /* We only send out the updates if we need to */
3565                 if (!vnn->tcp_update_needed) {
3566                         continue;
3567                 }
3568                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3569                                 TAKEOVER_TIMEOUT(),
3570                                 CTDB_BROADCAST_CONNECTED,
3571                                 &vnn->public_address,
3572                                 vnn->tcp_array);
3573                 if (ret != 0) {
3574                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3575                                 ctdb_addr_to_str(&vnn->public_address)));
3576                 }
3577         }
3578
3579         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3580                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3581                              ctdb_update_tcp_tickles, ctdb);
3582 }               
3583         
3584
3585 /*
3586   start periodic update of tcp tickles
3587  */
3588 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3589 {
3590         ctdb->tickle_update_context = talloc_new(ctdb);
3591
3592         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3593                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3594                              ctdb_update_tcp_tickles, ctdb);
3595 }
3596
3597
3598
3599
3600 struct control_gratious_arp {
3601         struct ctdb_context *ctdb;
3602         ctdb_sock_addr addr;
3603         const char *iface;
3604         int count;
3605 };
3606
3607 /*
3608   send a control_gratuitous arp
3609  */
3610 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3611                                   struct timeval t, void *private_data)
3612 {
3613         int ret;
3614         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3615                                                         struct control_gratious_arp);
3616
3617         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3618         if (ret != 0) {
3619                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3620                                  arp->iface, strerror(errno)));
3621         }
3622
3623
3624         arp->count++;
3625         if (arp->count == CTDB_ARP_REPEAT) {
3626                 talloc_free(arp);
3627                 return;
3628         }
3629
3630         event_add_timed(arp->ctdb->ev, arp, 
3631                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3632                         send_gratious_arp, arp);
3633 }
3634
3635
3636 /*
3637   send a gratious arp 
3638  */
3639 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3640 {
3641         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3642         struct control_gratious_arp *arp;
3643
3644         /* verify the size of indata */
3645         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3646                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3647                                  (unsigned)indata.dsize, 
3648                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3649                 return -1;
3650         }
3651         if (indata.dsize != 
3652                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3653                 + gratious_arp->len ) ){
3654
3655                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3656                         "but should be %u bytes\n", 
3657                          (unsigned)indata.dsize, 
3658                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3659                 return -1;
3660         }
3661
3662
3663         arp = talloc(ctdb, struct control_gratious_arp);
3664         CTDB_NO_MEMORY(ctdb, arp);
3665
3666         arp->ctdb  = ctdb;
3667         arp->addr   = gratious_arp->addr;
3668         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3669         CTDB_NO_MEMORY(ctdb, arp->iface);
3670         arp->count = 0;
3671         
3672         event_add_timed(arp->ctdb->ev, arp, 
3673                         timeval_zero(), send_gratious_arp, arp);
3674
3675         return 0;
3676 }
3677
3678 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3679 {
3680         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3681         int ret;
3682
3683         /* verify the size of indata */
3684         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3685                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3686                 return -1;
3687         }
3688         if (indata.dsize != 
3689                 ( offsetof(struct ctdb_control_ip_iface, iface)
3690                 + pub->len ) ){
3691
3692                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3693                         "but should be %u bytes\n", 
3694                          (unsigned)indata.dsize, 
3695                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3696                 return -1;
3697         }
3698
3699         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3700
3701         if (ret != 0) {
3702                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3703                 return -1;
3704         }
3705
3706         return 0;
3707 }
3708
/*
  Completion callback of the releaseip event started by
  ctdb_control_del_public_address().  private_data is the talloc
  context passed in when the event was started; it is freed here.  The
  event status is not looked at.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *ctx = private_data;

	talloc_free(ctx);
}
3717
3718 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3719 {
3720         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3721         struct ctdb_vnn *vnn;
3722         int ret;
3723
3724         /* verify the size of indata */
3725         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3726                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3727                 return -1;
3728         }
3729         if (indata.dsize != 
3730                 ( offsetof(struct ctdb_control_ip_iface, iface)
3731                 + pub->len ) ){
3732
3733                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3734                         "but should be %u bytes\n", 
3735                          (unsigned)indata.dsize, 
3736                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3737                 return -1;
3738         }
3739
3740         /* walk over all public addresses until we find a match */
3741         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3742                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3743                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3744
3745                         DLIST_REMOVE(ctdb->vnn, vnn);
3746                         talloc_steal(mem_ctx, vnn);
3747                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3748                         if (vnn->pnn != ctdb->pnn) {
3749                                 if (vnn->iface != NULL) {
3750                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3751                                 }
3752                                 talloc_free(mem_ctx);
3753                                 return 0;
3754                         }
3755                         vnn->pnn = -1;
3756
3757                         ret = ctdb_event_script_callback(ctdb, 
3758                                          mem_ctx, delete_ip_callback, mem_ctx,
3759                                          false,
3760                                          CTDB_EVENT_RELEASE_IP,
3761                                          "%s %s %u",
3762                                          ctdb_vnn_iface_string(vnn),
3763                                          ctdb_addr_to_str(&vnn->public_address),
3764                                          vnn->public_netmask_bits);
3765                         if (vnn->iface != NULL) {
3766                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3767                         }
3768                         if (ret != 0) {
3769                                 return -1;
3770                         }
3771                         return 0;
3772                 }
3773         }
3774
3775         return -1;
3776 }
3777
3778 /* This function is called from the recovery daemon to verify that a remote
3779    node has the expected ip allocation.
3780    This is verified against ctdb->ip_tree
3781 */
3782 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3783 {
3784         struct ctdb_public_ip_list *tmp_ip; 
3785         int i;
3786
3787         if (ctdb->ip_tree == NULL) {
3788                 /* dont know the expected allocation yet, assume remote node
3789                    is correct. */
3790                 return 0;
3791         }
3792
3793         if (ips == NULL) {
3794                 return 0;
3795         }
3796
3797         for (i=0; i<ips->num; i++) {
3798                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3799                 if (tmp_ip == NULL) {
3800                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3801                         return -1;
3802                 }
3803
3804                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3805                         continue;
3806                 }
3807
3808                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3809                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3810                         return -1;
3811                 }
3812         }
3813
3814         return 0;
3815 }
3816
3817 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3818 {
3819         struct ctdb_public_ip_list *tmp_ip; 
3820
3821         if (ctdb->ip_tree == NULL) {
3822                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3823                 return -1;
3824         }
3825
3826         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3827         if (tmp_ip == NULL) {
3828                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3829                 return -1;
3830         }
3831
3832         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3833         tmp_ip->pnn = ip->pnn;
3834
3835         return 0;
3836 }
3837
3838
/*
  State of a reloadips operation that is carried out by a child process.
 */
struct ctdb_reloadips_handle {
	/* owning context; its reload_ips pointer is cleared by the
	   destructor if it refers to this handle */
	struct ctdb_context *ctdb;
	/* control request answered by the destructor, NULL if none */
	struct ctdb_req_control *c;
	/* status sent in that reply */
	int status;
	/* fd[0] is read by the child handler to obtain the one-byte result */
	int fd[2];
	/* pid of the child process, sent SIGKILL by the destructor */
	pid_t child;
	struct fd_event *fde;
};
3847
3848 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3849 {
3850         if (h == h->ctdb->reload_ips) {
3851                 h->ctdb->reload_ips = NULL;
3852         }
3853         if (h->c != NULL) {
3854                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3855                 h->c = NULL;
3856         }
3857         ctdb_kill(h->ctdb, h->child, SIGKILL);
3858         return 0;
3859 }
3860
3861 static void ctdb_reloadips_timeout_event(struct event_context *ev,
3862                                 struct timed_event *te,
3863                                 struct timeval t, void *private_data)
3864 {
3865         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3866
3867         talloc_free(h);
3868 }       
3869
3870 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
3871                              uint16_t flags, void *private_data)
3872 {
3873         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3874
3875         char res;
3876         int ret;
3877
3878         ret = read(h->fd[0], &res, 1);
3879         if (ret < 1 || res != 0) {
3880                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3881                 res = 1;
3882         }
3883         h->status = res;
3884
3885         talloc_free(h);
3886 }
3887
3888 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3889 {
3890         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3891         struct ctdb_all_public_ips *ips;
3892         struct ctdb_vnn *vnn;
3893         int i, ret;
3894
3895         /* read the ip allocation from the local node */
3896         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
3897         if (ret != 0) {
3898                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
3899                 talloc_free(mem_ctx);
3900                 return -1;
3901         }
3902
3903         /* re-read the public ips file */
3904         ctdb->vnn = NULL;
3905         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3906                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3907                 talloc_free(mem_ctx);
3908                 return -1;
3909         }               
3910
3911
3912         /* check the previous list of ips and scan for ips that have been
3913            dropped.
3914          */
3915         for (i = 0; i < ips->num; i++) {
3916                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3917                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3918                                 break;
3919                         }
3920                 }
3921
3922                 /* we need to delete this ip, no longer available on this node */
3923                 if (vnn == NULL) {
3924                         struct ctdb_control_ip_iface pub;
3925
3926                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3927                         pub.addr  = ips->ips[i].addr;
3928                         pub.mask  = 0;
3929                         pub.len   = 0;
3930
3931                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3932                         if (ret != 0) {
3933                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3934                                 return -1;
3935                         }
3936                 }
3937         }
3938
3939
3940         /* loop over all new ones and check the ones we need to add */
3941         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3942                 for (i = 0; i < ips->num; i++) {
3943                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
3944                                 break;
3945                         }
3946                 }
3947                 if (i == ips->num) {
3948                         struct ctdb_control_ip_iface pub;
3949                         const char *ifaces = NULL;
3950                         int iface = 0;
3951
3952                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
3953
3954                         pub.addr  = vnn->public_address;
3955                         pub.mask  = vnn->public_netmask_bits;
3956
3957
3958                         ifaces = vnn->ifaces[0];
3959                         iface = 1;
3960                         while (vnn->ifaces[iface] != NULL) {
3961                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
3962                                 iface++;
3963                         }
3964                         pub.len   = strlen(ifaces)+1;
3965                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
3966
3967                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
3968                         if (ret != 0) {
3969                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
3970                                 return -1;
3971                         }
3972                 }
3973         }
3974
3975         return 0;
3976 }
3977
3978 /* This control is sent to force the node to re-read the public addresses file
3979    and drop any addresses we should nnot longer host, and add new addresses
3980    that we are now able to host
3981 */
3982 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
3983 {
3984         struct ctdb_reloadips_handle *h;
3985         pid_t parent = getpid();
3986
3987         if (ctdb->reload_ips != NULL) {
3988                 talloc_free(ctdb->reload_ips);
3989                 ctdb->reload_ips = NULL;
3990         }
3991
3992         h = talloc(ctdb, struct ctdb_reloadips_handle);
3993         CTDB_NO_MEMORY(ctdb, h);
3994         h->ctdb     = ctdb;
3995         h->c        = NULL;
3996         h->status   = -1;
3997         
3998         if (pipe(h->fd) == -1) {
3999                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4000                 talloc_free(h);
4001                 return -1;
4002         }
4003
4004         h->child = ctdb_fork(ctdb);
4005         if (h->child == (pid_t)-1) {
4006                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4007                 close(h->fd[0]);
4008                 close(h->fd[1]);
4009                 talloc_free(h);
4010                 return -1;
4011         }
4012
4013         /* child process */
4014         if (h->child == 0) {
4015                 signed char res = 0;
4016
4017                 close(h->fd[0]);
4018                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4019
4020                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4021                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4022                         res = -1;
4023                 } else {
4024                         res = ctdb_reloadips_child(ctdb);
4025                         if (res != 0) {
4026                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4027                         }
4028                 }
4029
4030                 write(h->fd[1], &res, 1);
4031                 /* make sure we die when our parent dies */
4032                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4033                         sleep(5);
4034                 }
4035                 _exit(0);
4036         }
4037
4038         h->c             = talloc_steal(h, c);
4039
4040         close(h->fd[1]);
4041         set_close_on_exec(h->fd[0]);
4042
4043         talloc_set_destructor(h, ctdb_reloadips_destructor);
4044
4045
4046         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4047                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4048                         (void *)h);
4049         tevent_fd_set_auto_close(h->fde);
4050
4051         event_add_timed(ctdb->ev, h,
4052                         timeval_current_ofs(120, 0),
4053                         ctdb_reloadips_timeout_event, h);
4054
4055         /* we reply later */
4056         *async_reply = true;
4057         return 0;
4058 }