33f99075ff7bf01d9b04b7c97f8a244d2eb4539e
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/* Timeout value built from the takeover_timeout tunable.  Expands to an
 * expression that reads a variable named "ctdb", so one must be in
 * scope wherever this is used.
 * NOTE(review): presumably takeover_timeout is in seconds -- confirm. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument to timeval_current_ofs() when ctdb_control_send_arp
 * re-arms its timer between announcements. */
#define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp stops re-arming once its counter reaches this. */
#define CTDB_ARP_REPEAT   3
49
/* Flags used in IP allocation algorithms. */
struct ctdb_ipflags {
        /* NOTE(review): presumably "this node must not take over IPs" --
         * confirm against the allocation code */
        bool noiptakeover;
        /* NOTE(review): presumably "this node must not host any IPs" --
         * confirm against the allocation code */
        bool noiphost;
};
55
/* Selector for the IP allocation algorithm; stored in
 * struct ipalloc_state.algorithm. */
enum ipalloc_algorithm {
        IPALLOC_DETERMINISTIC,
        IPALLOC_NONDETERMINISTIC,
        IPALLOC_LCP2,
};
61
/* Input state for one run of IP allocation. */
struct ipalloc_state {
        /* NOTE(review): presumably the number of nodes, i.e. the length
         * of the per-node arrays below -- confirm */
        uint32_t num;

        /* Arrays with data for each node */
        struct ctdb_public_ip_list_old **known_public_ips;
        struct ctdb_public_ip_list_old **available_public_ips;
        struct ctdb_ipflags *ipflags;

        /* Which allocation algorithm to run */
        enum ipalloc_algorithm algorithm;
        /* NOTE(review): looks like a copy of a "no IP failback" tunable --
         * confirm where it is filled in */
        uint32_t no_ip_failback;
};
73
/*
  One network interface known to the daemon.  Entries are linked into
  ctdb->ifaces.
 */
struct ctdb_interface {
        /* list linkage */
        struct ctdb_interface *prev;
        struct ctdb_interface *next;
        /* interface name; compared with strcmp() on lookup */
        const char *name;
        /* set to true when the entry is created */
        bool link_up;
        /* number of VNNs currently assigned to this interface */
        uint32_t references;
};
80
81 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
82 {
83         if (vnn->iface) {
84                 return vnn->iface->name;
85         }
86
87         return "__none__";
88 }
89
90 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
91 {
92         struct ctdb_interface *i;
93
94         /* Verify that we don't have an entry for this ip yet */
95         for (i=ctdb->ifaces;i;i=i->next) {
96                 if (strcmp(i->name, iface) == 0) {
97                         return 0;
98                 }
99         }
100
101         /* create a new structure for this interface */
102         i = talloc_zero(ctdb, struct ctdb_interface);
103         CTDB_NO_MEMORY_FATAL(ctdb, i);
104         i->name = talloc_strdup(i, iface);
105         CTDB_NO_MEMORY(ctdb, i->name);
106
107         i->link_up = true;
108
109         DLIST_ADD(ctdb->ifaces, i);
110
111         return 0;
112 }
113
114 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
115                                         const char *name)
116 {
117         int n;
118
119         for (n = 0; vnn->ifaces[n] != NULL; n++) {
120                 if (strcmp(name, vnn->ifaces[n]) == 0) {
121                         return true;
122                 }
123         }
124
125         return false;
126 }
127
128 /* If any interfaces now have no possible IPs then delete them.  This
129  * implementation is naive (i.e. simple) rather than clever
130  * (i.e. complex).  Given that this is run on delip and that operation
131  * is rare, this doesn't need to be efficient - it needs to be
132  * foolproof.  One alternative is reference counting, where the logic
133  * is distributed and can, therefore, be broken in multiple places.
134  * Another alternative is to build a red-black tree of interfaces that
135  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
136  * once) and then walking ctdb->ifaces once and deleting those not in
137  * the tree.  Let's go to one of those if the naive implementation
138  * causes problems...  :-)
139  */
140 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
141                                         struct ctdb_vnn *vnn)
142 {
143         struct ctdb_interface *i, *next;
144
145         /* For each interface, check if there's an IP using it. */
146         for (i = ctdb->ifaces; i != NULL; i = next) {
147                 struct ctdb_vnn *tv;
148                 bool found;
149                 next = i->next;
150
151                 /* Only consider interfaces named in the given VNN. */
152                 if (!vnn_has_interface_with_name(vnn, i->name)) {
153                         continue;
154                 }
155
156                 /* Is the "single IP" on this interface? */
157                 if ((ctdb->single_ip_vnn != NULL) &&
158                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
159                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
160                         /* Found, next interface please... */
161                         continue;
162                 }
163                 /* Search for a vnn with this interface. */
164                 found = false;
165                 for (tv=ctdb->vnn; tv; tv=tv->next) {
166                         if (vnn_has_interface_with_name(tv, i->name)) {
167                                 found = true;
168                                 break;
169                         }
170                 }
171
172                 if (!found) {
173                         /* None of the VNNs are using this interface. */
174                         DLIST_REMOVE(ctdb->ifaces, i);
175                         talloc_free(i);
176                 }
177         }
178 }
179
180
181 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
182                                               const char *iface)
183 {
184         struct ctdb_interface *i;
185
186         for (i=ctdb->ifaces;i;i=i->next) {
187                 if (strcmp(i->name, iface) == 0) {
188                         return i;
189                 }
190         }
191
192         return NULL;
193 }
194
195 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
196                                                   struct ctdb_vnn *vnn)
197 {
198         int i;
199         struct ctdb_interface *cur = NULL;
200         struct ctdb_interface *best = NULL;
201
202         for (i=0; vnn->ifaces[i]; i++) {
203
204                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
205                 if (cur == NULL) {
206                         continue;
207                 }
208
209                 if (!cur->link_up) {
210                         continue;
211                 }
212
213                 if (best == NULL) {
214                         best = cur;
215                         continue;
216                 }
217
218                 if (cur->references < best->references) {
219                         best = cur;
220                         continue;
221                 }
222         }
223
224         return best;
225 }
226
227 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
228                                      struct ctdb_vnn *vnn)
229 {
230         struct ctdb_interface *best = NULL;
231
232         if (vnn->iface) {
233                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                                    "still assigned to iface '%s'\n",
235                                    ctdb_addr_to_str(&vnn->public_address),
236                                    ctdb_vnn_iface_string(vnn)));
237                 return 0;
238         }
239
240         best = ctdb_vnn_best_iface(ctdb, vnn);
241         if (best == NULL) {
242                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
243                                   "cannot assign to iface any iface\n",
244                                   ctdb_addr_to_str(&vnn->public_address)));
245                 return -1;
246         }
247
248         vnn->iface = best;
249         best->references++;
250         vnn->pnn = ctdb->pnn;
251
252         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
253                            "now assigned to iface '%s' refs[%d]\n",
254                            ctdb_addr_to_str(&vnn->public_address),
255                            ctdb_vnn_iface_string(vnn),
256                            best->references));
257         return 0;
258 }
259
260 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
261                                     struct ctdb_vnn *vnn)
262 {
263         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
264                            "now unassigned (old iface '%s' refs[%d])\n",
265                            ctdb_addr_to_str(&vnn->public_address),
266                            ctdb_vnn_iface_string(vnn),
267                            vnn->iface?vnn->iface->references:0));
268         if (vnn->iface) {
269                 vnn->iface->references--;
270         }
271         vnn->iface = NULL;
272         if (vnn->pnn == ctdb->pnn) {
273                 vnn->pnn = -1;
274         }
275 }
276
277 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
278                                struct ctdb_vnn *vnn)
279 {
280         int i;
281
282         /* Nodes that are not RUNNING can not host IPs */
283         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
284                 return false;
285         }
286
287         if (vnn->delete_pending) {
288                 return false;
289         }
290
291         if (vnn->iface && vnn->iface->link_up) {
292                 return true;
293         }
294
295         for (i=0; vnn->ifaces[i]; i++) {
296                 struct ctdb_interface *cur;
297
298                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
299                 if (cur == NULL) {
300                         continue;
301                 }
302
303                 if (cur->link_up) {
304                         return true;
305                 }
306         }
307
308         return false;
309 }
310
/* State for one series of ARP announcements, created by
 * ctdb_announce_vnn_iface and consumed by ctdb_control_send_arp. */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        /* number of announcements sent so far */
        uint32_t count;
        /* address being announced (copy of vnn->public_address) */
        ctdb_sock_addr addr;
        /* connections to send tickle acks for; may be NULL */
        struct ctdb_tcp_array *tcparray;
        /* VNN the announcement is for; supplies the interface name */
        struct ctdb_vnn *vnn;
};
318
319
320 /*
321   lists of tcp endpoints
322  */
323 struct ctdb_tcp_list {
324         struct ctdb_tcp_list *prev, *next;
325         struct ctdb_connection connection;
326 };
327
328 /*
329   list of clients to kill on IP release
330  */
331 struct ctdb_client_ip {
332         struct ctdb_client_ip *prev, *next;
333         struct ctdb_context *ctdb;
334         ctdb_sock_addr addr;
335         uint32_t client_id;
336 };
337
338
339 /*
340   send a gratuitous arp
341  */
342 static void ctdb_control_send_arp(struct tevent_context *ev,
343                                   struct tevent_timer *te,
344                                   struct timeval t, void *private_data)
345 {
346         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
347                                                         struct ctdb_takeover_arp);
348         int i, ret;
349         struct ctdb_tcp_array *tcparray;
350         const char *iface = ctdb_vnn_iface_string(arp->vnn);
351
352         ret = ctdb_sys_send_arp(&arp->addr, iface);
353         if (ret != 0) {
354                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
355                                   iface, strerror(errno)));
356         }
357
358         tcparray = arp->tcparray;
359         if (tcparray) {
360                 for (i=0;i<tcparray->num;i++) {
361                         struct ctdb_connection *tcon;
362
363                         tcon = &tcparray->connections[i];
364                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
365                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
366                                 ctdb_addr_to_str(&tcon->src),
367                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
368                         ret = ctdb_sys_send_tcp(
369                                 &tcon->src,
370                                 &tcon->dst,
371                                 0, 0, 0);
372                         if (ret != 0) {
373                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
374                                         ctdb_addr_to_str(&tcon->src)));
375                         }
376                 }
377         }
378
379         arp->count++;
380
381         if (arp->count == CTDB_ARP_REPEAT) {
382                 talloc_free(arp);
383                 return;
384         }
385
386         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
387                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
388                          ctdb_control_send_arp, arp);
389 }
390
391 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
392                                        struct ctdb_vnn *vnn)
393 {
394         struct ctdb_takeover_arp *arp;
395         struct ctdb_tcp_array *tcparray;
396
397         if (!vnn->takeover_ctx) {
398                 vnn->takeover_ctx = talloc_new(vnn);
399                 if (!vnn->takeover_ctx) {
400                         return -1;
401                 }
402         }
403
404         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
405         if (!arp) {
406                 return -1;
407         }
408
409         arp->ctdb = ctdb;
410         arp->addr = vnn->public_address;
411         arp->vnn  = vnn;
412
413         tcparray = vnn->tcp_array;
414         if (tcparray) {
415                 /* add all of the known tcp connections for this IP to the
416                    list of tcp connections to send tickle acks for */
417                 arp->tcparray = talloc_steal(arp, tcparray);
418
419                 vnn->tcp_array = NULL;
420                 vnn->tcp_update_needed = true;
421         }
422
423         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
424                          timeval_zero(), ctdb_control_send_arp, arp);
425
426         return 0;
427 }
428
/* Callback state used by release_ip_callback. */
struct takeover_callback_state {
        /* NOTE(review): presumably the control request to reply to --
         * confirm against the release path */
        struct ctdb_req_control_old *c;
        /* address being released */
        ctdb_sock_addr *addr;
        struct ctdb_vnn *vnn;
};
434
/* State carried from ctdb_do_takeip to ctdb_do_takeip_callback. */
struct ctdb_do_takeip_state {
        /* control request that gets the reply once the event finishes */
        struct ctdb_req_control_old *c;
        /* VNN being taken over */
        struct ctdb_vnn *vnn;
};
439
440 /*
441   called when takeip event finishes
442  */
443 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
444                                     void *private_data)
445 {
446         struct ctdb_do_takeip_state *state =
447                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
448         int32_t ret;
449         TDB_DATA data;
450
451         if (status != 0) {
452                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
453         
454                 if (status == -ETIME) {
455                         ctdb_ban_self(ctdb);
456                 }
457                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
458                                  ctdb_addr_to_str(&state->vnn->public_address),
459                                  ctdb_vnn_iface_string(state->vnn)));
460                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
461
462                 node->flags |= NODE_FLAGS_UNHEALTHY;
463                 talloc_free(state);
464                 return;
465         }
466
467         if (ctdb->do_checkpublicip) {
468
469         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
470         if (ret != 0) {
471                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
472                 talloc_free(state);
473                 return;
474         }
475
476         }
477
478         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
479         data.dsize = strlen((char *)data.dptr) + 1;
480         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
481
482         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
483
484
485         /* the control succeeded */
486         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
487         talloc_free(state);
488         return;
489 }
490
491 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
492 {
493         state->vnn->update_in_flight = false;
494         return 0;
495 }
496
497 /*
498   take over an ip address
499  */
500 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
501                               struct ctdb_req_control_old *c,
502                               struct ctdb_vnn *vnn)
503 {
504         int ret;
505         struct ctdb_do_takeip_state *state;
506
507         if (vnn->update_in_flight) {
508                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
509                                     "update for this IP already in flight\n",
510                                     ctdb_addr_to_str(&vnn->public_address),
511                                     vnn->public_netmask_bits));
512                 return -1;
513         }
514
515         ret = ctdb_vnn_assign_iface(ctdb, vnn);
516         if (ret != 0) {
517                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
518                                  "assign a usable interface\n",
519                                  ctdb_addr_to_str(&vnn->public_address),
520                                  vnn->public_netmask_bits));
521                 return -1;
522         }
523
524         state = talloc(vnn, struct ctdb_do_takeip_state);
525         CTDB_NO_MEMORY(ctdb, state);
526
527         state->c = talloc_steal(ctdb, c);
528         state->vnn   = vnn;
529
530         vnn->update_in_flight = true;
531         talloc_set_destructor(state, ctdb_takeip_destructor);
532
533         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
534                             ctdb_addr_to_str(&vnn->public_address),
535                             vnn->public_netmask_bits,
536                             ctdb_vnn_iface_string(vnn)));
537
538         ret = ctdb_event_script_callback(ctdb,
539                                          state,
540                                          ctdb_do_takeip_callback,
541                                          state,
542                                          CTDB_EVENT_TAKE_IP,
543                                          "%s %s %u",
544                                          ctdb_vnn_iface_string(vnn),
545                                          ctdb_addr_to_str(&vnn->public_address),
546                                          vnn->public_netmask_bits);
547
548         if (ret != 0) {
549                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
550                         ctdb_addr_to_str(&vnn->public_address),
551                         ctdb_vnn_iface_string(vnn)));
552                 talloc_free(state);
553                 return -1;
554         }
555
556         return 0;
557 }
558
/* State carried from ctdb_do_updateip to ctdb_do_updateip_callback. */
struct ctdb_do_updateip_state {
        /* control request that gets the reply once the event finishes */
        struct ctdb_req_control_old *c;
        /* interface the address is moving away from; restored on failure */
        struct ctdb_interface *old;
        /* VNN being moved */
        struct ctdb_vnn *vnn;
};
564
565 /*
566   called when updateip event finishes
567  */
568 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
569                                       void *private_data)
570 {
571         struct ctdb_do_updateip_state *state =
572                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
573         int32_t ret;
574
575         if (status != 0) {
576                 if (status == -ETIME) {
577                         ctdb_ban_self(ctdb);
578                 }
579                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
580                         ctdb_addr_to_str(&state->vnn->public_address),
581                         state->old->name,
582                         ctdb_vnn_iface_string(state->vnn)));
583
584                 /*
585                  * All we can do is reset the old interface
586                  * and let the next run fix it
587                  */
588                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
589                 state->vnn->iface = state->old;
590                 state->vnn->iface->references++;
591
592                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
593                 talloc_free(state);
594                 return;
595         }
596
597         if (ctdb->do_checkpublicip) {
598
599         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
600         if (ret != 0) {
601                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
602                 talloc_free(state);
603                 return;
604         }
605
606         }
607
608         /* the control succeeded */
609         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
610         talloc_free(state);
611         return;
612 }
613
614 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
615 {
616         state->vnn->update_in_flight = false;
617         return 0;
618 }
619
620 /*
621   update (move) an ip address
622  */
623 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
624                                 struct ctdb_req_control_old *c,
625                                 struct ctdb_vnn *vnn)
626 {
627         int ret;
628         struct ctdb_do_updateip_state *state;
629         struct ctdb_interface *old = vnn->iface;
630         const char *new_name;
631
632         if (vnn->update_in_flight) {
633                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
634                                     "update for this IP already in flight\n",
635                                     ctdb_addr_to_str(&vnn->public_address),
636                                     vnn->public_netmask_bits));
637                 return -1;
638         }
639
640         ctdb_vnn_unassign_iface(ctdb, vnn);
641         ret = ctdb_vnn_assign_iface(ctdb, vnn);
642         if (ret != 0) {
643                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
644                                  "assin a usable interface (old iface '%s')\n",
645                                  ctdb_addr_to_str(&vnn->public_address),
646                                  vnn->public_netmask_bits,
647                                  old->name));
648                 return -1;
649         }
650
651         new_name = ctdb_vnn_iface_string(vnn);
652         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
653                 /* A benign update from one interface onto itself.
654                  * no need to run the eventscripts in this case, just return
655                  * success.
656                  */
657                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
658                 return 0;
659         }
660
661         state = talloc(vnn, struct ctdb_do_updateip_state);
662         CTDB_NO_MEMORY(ctdb, state);
663
664         state->c = talloc_steal(ctdb, c);
665         state->old = old;
666         state->vnn = vnn;
667
668         vnn->update_in_flight = true;
669         talloc_set_destructor(state, ctdb_updateip_destructor);
670
671         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
672                             "interface %s to %s\n",
673                             ctdb_addr_to_str(&vnn->public_address),
674                             vnn->public_netmask_bits,
675                             old->name,
676                             new_name));
677
678         ret = ctdb_event_script_callback(ctdb,
679                                          state,
680                                          ctdb_do_updateip_callback,
681                                          state,
682                                          CTDB_EVENT_UPDATE_IP,
683                                          "%s %s %s %u",
684                                          state->old->name,
685                                          new_name,
686                                          ctdb_addr_to_str(&vnn->public_address),
687                                          vnn->public_netmask_bits);
688         if (ret != 0) {
689                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
690                                  ctdb_addr_to_str(&vnn->public_address),
691                                  old->name, new_name));
692                 talloc_free(state);
693                 return -1;
694         }
695
696         return 0;
697 }
698
699 /*
700   Find the vnn of the node that has a public ip address
701   returns -1 if the address is not known as a public address
702  */
703 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
704 {
705         struct ctdb_vnn *vnn;
706
707         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
708                 if (ctdb_same_ip(&vnn->public_address, addr)) {
709                         return vnn;
710                 }
711         }
712
713         return NULL;
714 }
715
716 /*
717   take over an ip address
718  */
719 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
720                                  struct ctdb_req_control_old *c,
721                                  TDB_DATA indata,
722                                  bool *async_reply)
723 {
724         int ret;
725         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
726         struct ctdb_vnn *vnn;
727         bool have_ip = false;
728         bool do_updateip = false;
729         bool do_takeip = false;
730         struct ctdb_interface *best_iface = NULL;
731
732         if (pip->pnn != ctdb->pnn) {
733                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
734                                  "with pnn %d, but we're node %d\n",
735                                  ctdb_addr_to_str(&pip->addr),
736                                  pip->pnn, ctdb->pnn));
737                 return -1;
738         }
739
740         /* update out vnn list */
741         vnn = find_public_ip_vnn(ctdb, &pip->addr);
742         if (vnn == NULL) {
743                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
744                         ctdb_addr_to_str(&pip->addr)));
745                 return 0;
746         }
747
748         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
749                 have_ip = ctdb_sys_have_ip(&pip->addr);
750         }
751         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
752         if (best_iface == NULL) {
753                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
754                                  "a usable interface (old %s, have_ip %d)\n",
755                                  ctdb_addr_to_str(&vnn->public_address),
756                                  vnn->public_netmask_bits,
757                                  ctdb_vnn_iface_string(vnn),
758                                  have_ip));
759                 return -1;
760         }
761
762         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
763                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
764                 have_ip = false;
765         }
766
767
768         if (vnn->iface == NULL && have_ip) {
769                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
770                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
771                                  ctdb_addr_to_str(&vnn->public_address)));
772                 return 0;
773         }
774
775         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
776                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
777                                   "and we have it on iface[%s], but it was assigned to node %d"
778                                   "and we are node %d, banning ourself\n",
779                                  ctdb_addr_to_str(&vnn->public_address),
780                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
781                 ctdb_ban_self(ctdb);
782                 return -1;
783         }
784
785         if (vnn->pnn == -1 && have_ip) {
786                 vnn->pnn = ctdb->pnn;
787                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
788                                   "and we already have it on iface[%s], update local daemon\n",
789                                  ctdb_addr_to_str(&vnn->public_address),
790                                   ctdb_vnn_iface_string(vnn)));
791                 return 0;
792         }
793
794         if (vnn->iface) {
795                 if (vnn->iface != best_iface) {
796                         if (!vnn->iface->link_up) {
797                                 do_updateip = true;
798                         } else if (vnn->iface->references > (best_iface->references + 1)) {
799                                 /* only move when the rebalance gains something */
800                                         do_updateip = true;
801                         }
802                 }
803         }
804
805         if (!have_ip) {
806                 if (do_updateip) {
807                         ctdb_vnn_unassign_iface(ctdb, vnn);
808                         do_updateip = false;
809                 }
810                 do_takeip = true;
811         }
812
813         if (do_takeip) {
814                 ret = ctdb_do_takeip(ctdb, c, vnn);
815                 if (ret != 0) {
816                         return -1;
817                 }
818         } else if (do_updateip) {
819                 ret = ctdb_do_updateip(ctdb, c, vnn);
820                 if (ret != 0) {
821                         return -1;
822                 }
823         } else {
824                 /*
825                  * The interface is up and the kernel known the ip
826                  * => do nothing
827                  */
828                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
829                         ctdb_addr_to_str(&pip->addr),
830                         vnn->public_netmask_bits,
831                         ctdb_vnn_iface_string(vnn)));
832                 return 0;
833         }
834
835         /* tell ctdb_control.c that we will be replying asynchronously */
836         *async_reply = true;
837
838         return 0;
839 }
840
841 /*
842   kill any clients that are registered with a IP that is being released
843  */
844 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
845 {
846         struct ctdb_client_ip *ip;
847
848         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
849                 ctdb_addr_to_str(addr)));
850
851         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
852                 ctdb_sock_addr tmp_addr;
853
854                 tmp_addr = ip->addr;
855                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
856                         ip->client_id,
857                         ctdb_addr_to_str(&ip->addr)));
858
859                 if (ctdb_same_ip(&tmp_addr, addr)) {
860                         struct ctdb_client *client = reqid_find(ctdb->idr,
861                                                                 ip->client_id,
862                                                                 struct ctdb_client);
863                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
864                                 ip->client_id,
865                                 ctdb_addr_to_str(&ip->addr),
866                                 client->pid));
867
868                         if (client->pid != 0) {
869                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
870                                         (unsigned)client->pid,
871                                         ctdb_addr_to_str(addr),
872                                         ip->client_id));
873                                 kill(client->pid, SIGKILL);
874                         }
875                 }
876         }
877 }
878
879 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
880 {
881         DLIST_REMOVE(ctdb->vnn, vnn);
882         ctdb_vnn_unassign_iface(ctdb, vnn);
883         ctdb_remove_orphaned_ifaces(ctdb, vnn);
884         talloc_free(vnn);
885 }
886
887 /*
888   called when releaseip event finishes
889  */
890 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
891                                 void *private_data)
892 {
893         struct takeover_callback_state *state = 
894                 talloc_get_type(private_data, struct takeover_callback_state);
895         TDB_DATA data;
896
897         if (status == -ETIME) {
898                 ctdb_ban_self(ctdb);
899         }
900
901         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
902                 if  (ctdb_sys_have_ip(state->addr)) {
903                         DEBUG(DEBUG_ERR,
904                               ("IP %s still hosted during release IP callback, failing\n",
905                                ctdb_addr_to_str(state->addr)));
906                         ctdb_request_control_reply(ctdb, state->c,
907                                                    NULL, -1, NULL);
908                         talloc_free(state);
909                         return;
910                 }
911         }
912
913         /* send a message to all clients of this node telling them
914            that the cluster has been reconfigured and they should
915            release any sockets on this IP */
916         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
917         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
918         data.dsize = strlen((char *)data.dptr)+1;
919
920         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
921
922         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
923
924         /* kill clients that have registered with this IP */
925         release_kill_clients(ctdb, state->addr);
926
927         ctdb_vnn_unassign_iface(ctdb, state->vnn);
928
929         /* Process the IP if it has been marked for deletion */
930         if (state->vnn->delete_pending) {
931                 do_delete_ip(ctdb, state->vnn);
932                 state->vnn = NULL;
933         }
934
935         /* the control succeeded */
936         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
937         talloc_free(state);
938 }
939
940 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
941 {
942         if (state->vnn != NULL) {
943                 state->vnn->update_in_flight = false;
944         }
945         return 0;
946 }
947
948 /*
949   release an ip address
950  */
951 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
952                                 struct ctdb_req_control_old *c,
953                                 TDB_DATA indata, 
954                                 bool *async_reply)
955 {
956         int ret;
957         struct takeover_callback_state *state;
958         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
959         struct ctdb_vnn *vnn;
960         char *iface;
961
962         /* update our vnn list */
963         vnn = find_public_ip_vnn(ctdb, &pip->addr);
964         if (vnn == NULL) {
965                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
966                         ctdb_addr_to_str(&pip->addr)));
967                 return 0;
968         }
969         vnn->pnn = pip->pnn;
970
971         /* stop any previous arps */
972         talloc_free(vnn->takeover_ctx);
973         vnn->takeover_ctx = NULL;
974
975         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
976          * lazy multicast to drop an IP from any node that isn't the
977          * intended new node.  The following causes makes ctdbd ignore
978          * a release for any address it doesn't host.
979          */
980         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
981                 if (!ctdb_sys_have_ip(&pip->addr)) {
982                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
983                                 ctdb_addr_to_str(&pip->addr),
984                                 vnn->public_netmask_bits,
985                                 ctdb_vnn_iface_string(vnn)));
986                         ctdb_vnn_unassign_iface(ctdb, vnn);
987                         return 0;
988                 }
989         } else {
990                 if (vnn->iface == NULL) {
991                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
992                                            ctdb_addr_to_str(&pip->addr),
993                                            vnn->public_netmask_bits));
994                         return 0;
995                 }
996         }
997
998         /* There is a potential race between take_ip and us because we
999          * update the VNN via a callback that run when the
1000          * eventscripts have been run.  Avoid the race by allowing one
1001          * update to be in flight at a time.
1002          */
1003         if (vnn->update_in_flight) {
1004                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1005                                     "update for this IP already in flight\n",
1006                                     ctdb_addr_to_str(&vnn->public_address),
1007                                     vnn->public_netmask_bits));
1008                 return -1;
1009         }
1010
1011         iface = strdup(ctdb_vnn_iface_string(vnn));
1012
1013         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1014                 ctdb_addr_to_str(&pip->addr),
1015                 vnn->public_netmask_bits,
1016                 iface,
1017                 pip->pnn));
1018
1019         state = talloc(ctdb, struct takeover_callback_state);
1020         if (state == NULL) {
1021                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1022                                __FILE__, __LINE__);
1023                 free(iface);
1024                 return -1;
1025         }
1026
1027         state->c = talloc_steal(state, c);
1028         state->addr = talloc(state, ctdb_sock_addr);       
1029         if (state->addr == NULL) {
1030                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1031                                __FILE__, __LINE__);
1032                 free(iface);
1033                 talloc_free(state);
1034                 return -1;
1035         }
1036         *state->addr = pip->addr;
1037         state->vnn   = vnn;
1038
1039         vnn->update_in_flight = true;
1040         talloc_set_destructor(state, ctdb_releaseip_destructor);
1041
1042         ret = ctdb_event_script_callback(ctdb, 
1043                                          state, release_ip_callback, state,
1044                                          CTDB_EVENT_RELEASE_IP,
1045                                          "%s %s %u",
1046                                          iface,
1047                                          ctdb_addr_to_str(&pip->addr),
1048                                          vnn->public_netmask_bits);
1049         free(iface);
1050         if (ret != 0) {
1051                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1052                         ctdb_addr_to_str(&pip->addr),
1053                         ctdb_vnn_iface_string(vnn)));
1054                 talloc_free(state);
1055                 return -1;
1056         }
1057
1058         /* tell the control that we will be reply asynchronously */
1059         *async_reply = true;
1060         return 0;
1061 }
1062
1063 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1064                                    ctdb_sock_addr *addr,
1065                                    unsigned mask, const char *ifaces,
1066                                    bool check_address)
1067 {
1068         struct ctdb_vnn      *vnn;
1069         uint32_t num = 0;
1070         char *tmp;
1071         const char *iface;
1072         int i;
1073         int ret;
1074
1075         tmp = strdup(ifaces);
1076         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1077                 if (!ctdb_sys_check_iface_exists(iface)) {
1078                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1079                         free(tmp);
1080                         return -1;
1081                 }
1082         }
1083         free(tmp);
1084
1085         /* Verify that we don't have an entry for this ip yet */
1086         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1087                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1088                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1089                                 ctdb_addr_to_str(addr)));
1090                         return -1;
1091                 }               
1092         }
1093
1094         /* create a new vnn structure for this ip address */
1095         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1096         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1097         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1098         tmp = talloc_strdup(vnn, ifaces);
1099         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1100         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1101                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1102                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1103                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1104                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1105                 num++;
1106         }
1107         talloc_free(tmp);
1108         vnn->ifaces[num] = NULL;
1109         vnn->public_address      = *addr;
1110         vnn->public_netmask_bits = mask;
1111         vnn->pnn                 = -1;
1112         if (check_address) {
1113                 if (ctdb_sys_have_ip(addr)) {
1114                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1115                         vnn->pnn = ctdb->pnn;
1116                 }
1117         }
1118
1119         for (i=0; vnn->ifaces[i]; i++) {
1120                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121                 if (ret != 0) {
1122                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1123                                            "for public_address[%s]\n",
1124                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1125                         talloc_free(vnn);
1126                         return -1;
1127                 }
1128         }
1129
1130         DLIST_ADD(ctdb->vnn, vnn);
1131
1132         return 0;
1133 }
1134
1135 /*
1136   setup the public address lists from a file
1137 */
1138 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1139 {
1140         char **lines;
1141         int nlines;
1142         int i;
1143
1144         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1145         if (lines == NULL) {
1146                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1147                 return -1;
1148         }
1149         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1150                 nlines--;
1151         }
1152
1153         for (i=0;i<nlines;i++) {
1154                 unsigned mask;
1155                 ctdb_sock_addr addr;
1156                 const char *addrstr;
1157                 const char *ifaces;
1158                 char *tok, *line;
1159
1160                 line = lines[i];
1161                 while ((*line == ' ') || (*line == '\t')) {
1162                         line++;
1163                 }
1164                 if (*line == '#') {
1165                         continue;
1166                 }
1167                 if (strcmp(line, "") == 0) {
1168                         continue;
1169                 }
1170                 tok = strtok(line, " \t");
1171                 addrstr = tok;
1172                 tok = strtok(NULL, " \t");
1173                 if (tok == NULL) {
1174                         if (NULL == ctdb->default_public_interface) {
1175                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1176                                          i+1));
1177                                 talloc_free(lines);
1178                                 return -1;
1179                         }
1180                         ifaces = ctdb->default_public_interface;
1181                 } else {
1182                         ifaces = tok;
1183                 }
1184
1185                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1186                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1187                         talloc_free(lines);
1188                         return -1;
1189                 }
1190                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1191                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1192                         talloc_free(lines);
1193                         return -1;
1194                 }
1195         }
1196
1197
1198         talloc_free(lines);
1199         return 0;
1200 }
1201
1202 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1203                               const char *iface,
1204                               const char *ip)
1205 {
1206         struct ctdb_vnn *svnn;
1207         struct ctdb_interface *cur = NULL;
1208         bool ok;
1209         int ret;
1210
1211         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1212         CTDB_NO_MEMORY(ctdb, svnn);
1213
1214         svnn->ifaces = talloc_array(svnn, const char *, 2);
1215         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1216         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1217         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1218         svnn->ifaces[1] = NULL;
1219
1220         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1221         if (!ok) {
1222                 talloc_free(svnn);
1223                 return -1;
1224         }
1225
1226         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1227         if (ret != 0) {
1228                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1229                                    "for single_ip[%s]\n",
1230                                    svnn->ifaces[0],
1231                                    ctdb_addr_to_str(&svnn->public_address)));
1232                 talloc_free(svnn);
1233                 return -1;
1234         }
1235
1236         /* assume the single public ip interface is initially "good" */
1237         cur = ctdb_find_iface(ctdb, iface);
1238         if (cur == NULL) {
1239                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1240                 return -1;
1241         }
1242         cur->link_up = true;
1243
1244         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1245         if (ret != 0) {
1246                 talloc_free(svnn);
1247                 return -1;
1248         }
1249
1250         ctdb->single_ip_vnn = svnn;
1251         return 0;
1252 }
1253
1254 struct public_ip_list {
1255         struct public_ip_list *next;
1256         uint32_t pnn;
1257         ctdb_sock_addr addr;
1258 };
1259
1260 /* Given a physical node, return the number of
1261    public addresses that is currently assigned to this node.
1262 */
1263 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1264 {
1265         int num=0;
1266
1267         for (;ips;ips=ips->next) {
1268                 if (ips->pnn == pnn) {
1269                         num++;
1270                 }
1271         }
1272         return num;
1273 }
1274
1275
1276 /* Can the given node host the given IP: is the public IP known to the
1277  * node and is NOIPHOST unset?
1278 */
1279 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1280                              int32_t pnn,
1281                              struct public_ip_list *ip)
1282 {
1283         struct ctdb_public_ip_list_old *public_ips;
1284         int i;
1285
1286         if (ipalloc_state->ipflags[pnn].noiphost) {
1287                 return false;
1288         }
1289
1290         public_ips = ipalloc_state->available_public_ips[pnn];
1291
1292         if (public_ips == NULL) {
1293                 return false;
1294         }
1295
1296         for (i=0; i<public_ips->num; i++) {
1297                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1298                         /* yes, this node can serve this public ip */
1299                         return true;
1300                 }
1301         }
1302
1303         return false;
1304 }
1305
1306 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1307                                  int32_t pnn,
1308                                  struct public_ip_list *ip)
1309 {
1310         if (ipalloc_state->ipflags[pnn].noiptakeover) {
1311                 return false;
1312         }
1313
1314         return can_node_host_ip(ipalloc_state, pnn, ip);
1315 }
1316
1317 /* search the node lists list for a node to takeover this ip.
1318    pick the node that currently are serving the least number of ips
1319    so that the ips get spread out evenly.
1320 */
1321 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1322                               struct public_ip_list *ip,
1323                               struct public_ip_list *all_ips)
1324 {
1325         int pnn, min=0, num;
1326         int i, numnodes;
1327
1328         numnodes = ipalloc_state->num;
1329         pnn    = -1;
1330         for (i=0; i<numnodes; i++) {
1331                 /* verify that this node can serve this ip */
1332                 if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
1333                         /* no it couldnt   so skip to the next node */
1334                         continue;
1335                 }
1336
1337                 num = node_ip_coverage(i, all_ips);
1338                 /* was this the first node we checked ? */
1339                 if (pnn == -1) {
1340                         pnn = i;
1341                         min  = num;
1342                 } else {
1343                         if (num < min) {
1344                                 pnn = i;
1345                                 min  = num;
1346                         }
1347                 }
1348         }
1349         if (pnn == -1) {
1350                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1351                         ctdb_addr_to_str(&ip->addr)));
1352
1353                 return -1;
1354         }
1355
1356         ip->pnn = pnn;
1357         return 0;
1358 }
1359
1360 #define IP_KEYLEN       4
1361 static uint32_t *ip_key(ctdb_sock_addr *ip)
1362 {
1363         static uint32_t key[IP_KEYLEN];
1364
1365         bzero(key, sizeof(key));
1366
1367         switch (ip->sa.sa_family) {
1368         case AF_INET:
1369                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1370                 break;
1371         case AF_INET6: {
1372                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1373                 key[0]  = htonl(s6_a32[0]);
1374                 key[1]  = htonl(s6_a32[1]);
1375                 key[2]  = htonl(s6_a32[2]);
1376                 key[3]  = htonl(s6_a32[3]);
1377                 break;
1378         }
1379         default:
1380                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1381                 return key;
1382         }
1383
1384         return key;
1385 }
1386
1387 static void *add_ip_callback(void *parm, void *data)
1388 {
1389         struct public_ip_list *this_ip = parm;
1390         struct public_ip_list *prev_ip = data;
1391
1392         if (prev_ip == NULL) {
1393                 return parm;
1394         }
1395         if (this_ip->pnn == -1) {
1396                 this_ip->pnn = prev_ip->pnn;
1397         }
1398
1399         return parm;
1400 }
1401
1402 static int getips_count_callback(void *param, void *data)
1403 {
1404         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1405         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1406
1407         new_ip->next = *ip_list;
1408         *ip_list     = new_ip;
1409         return 0;
1410 }
1411
1412 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1413                                        struct ctdb_public_ip_list_old *ips,
1414                                        uint32_t pnn);
1415
1416 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1417                                          struct ipalloc_state *ipalloc_state,
1418                                          struct ctdb_node_map_old *nodemap)
1419 {
1420         int j;
1421         int ret;
1422
1423         if (ipalloc_state->num != nodemap->num) {
1424                 DEBUG(DEBUG_ERR,
1425                       (__location__
1426                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1427                        ipalloc_state->num, nodemap->num));
1428                 return -1;
1429         }
1430
1431         for (j=0; j<nodemap->num; j++) {
1432                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1433                         continue;
1434                 }
1435
1436                 /* Retrieve the list of known public IPs from the node */
1437                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1438                                         TAKEOVER_TIMEOUT(),
1439                                         j,
1440                                         ctdb->nodes,
1441                                         0,
1442                                         &ipalloc_state->known_public_ips[j]);
1443                 if (ret != 0) {
1444                         DEBUG(DEBUG_ERR,
1445                               ("Failed to read known public IPs from node: %u\n",
1446                                j));
1447                         return -1;
1448                 }
1449
1450                 if (ctdb->do_checkpublicip) {
1451                         verify_remote_ip_allocation(ctdb,
1452                                                     ipalloc_state->known_public_ips[j],
1453                                                     j);
1454                 }
1455
1456                 /* Retrieve the list of available public IPs from the node */
1457                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1458                                         TAKEOVER_TIMEOUT(),
1459                                         j,
1460                                         ctdb->nodes,
1461                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1462                                         &ipalloc_state->available_public_ips[j]);
1463                 if (ret != 0) {
1464                         DEBUG(DEBUG_ERR,
1465                               ("Failed to read available public IPs from node: %u\n",
1466                                j));
1467                         return -1;
1468                 }
1469         }
1470
1471         return 0;
1472 }
1473
1474 static struct public_ip_list *
1475 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1476 {
1477         int i, j;
1478         struct public_ip_list *ip_list;
1479         struct ctdb_public_ip_list_old *public_ips;
1480
1481         TALLOC_FREE(ctdb->ip_tree);
1482         ctdb->ip_tree = trbt_create(ctdb, 0);
1483
1484         for (i=0; i < ctdb->num_nodes; i++) {
1485                 public_ips = ipalloc_state->known_public_ips[i];
1486
1487                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1488                         continue;
1489                 }
1490
1491                 /* there were no public ips for this node */
1492                 if (public_ips == NULL) {
1493                         continue;
1494                 }
1495
1496                 for (j=0; j < public_ips->num; j++) {
1497                         struct public_ip_list *tmp_ip;
1498
1499                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1500                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1501                         /* Do not use information about IP addresses hosted
1502                          * on other nodes, it may not be accurate */
1503                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1504                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1505                         } else {
1506                                 tmp_ip->pnn = -1;
1507                         }
1508                         tmp_ip->addr = public_ips->ips[j].addr;
1509                         tmp_ip->next = NULL;
1510
1511                         trbt_insertarray32_callback(ctdb->ip_tree,
1512                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1513                                 add_ip_callback,
1514                                 tmp_ip);
1515                 }
1516         }
1517
1518         ip_list = NULL;
1519         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1520
1521         return ip_list;
1522 }
1523
1524 /* 
1525  * This is the length of the longtest common prefix between the IPs.
1526  * It is calculated by XOR-ing the 2 IPs together and counting the
1527  * number of leading zeroes.  The implementation means that all
1528  * addresses end up being 128 bits long.
1529  *
1530  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1531  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1532  * lots of nodes and IP addresses?
1533  */
1534 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1535 {
1536         uint32_t ip1_k[IP_KEYLEN];
1537         uint32_t *t;
1538         int i;
1539         uint32_t x;
1540
1541         uint32_t distance = 0;
1542
1543         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1544         t = ip_key(ip2);
1545         for (i=0; i<IP_KEYLEN; i++) {
1546                 x = ip1_k[i] ^ t[i];
1547                 if (x == 0) {
1548                         distance += 32;
1549                 } else {
1550                         /* Count number of leading zeroes. 
1551                          * FIXME? This could be optimised...
1552                          */
1553                         while ((x & (1 << 31)) == 0) {
1554                                 x <<= 1;
1555                                 distance += 1;
1556                         }
1557                 }
1558         }
1559
1560         return distance;
1561 }
1562
1563 /* Calculate the IP distance for the given IP relative to IPs on the
1564    given node.  The ips argument is generally the all_ips variable
1565    used in the main part of the algorithm.
1566  */
1567 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1568                                   struct public_ip_list *ips,
1569                                   int pnn)
1570 {
1571         struct public_ip_list *t;
1572         uint32_t d;
1573
1574         uint32_t sum = 0;
1575
1576         for (t=ips; t != NULL; t=t->next) {
1577                 if (t->pnn != pnn) {
1578                         continue;
1579                 }
1580
1581                 /* Optimisation: We never calculate the distance
1582                  * between an address and itself.  This allows us to
1583                  * calculate the effect of removing an address from a
1584                  * node by simply calculating the distance between
1585                  * that address and all of the exitsing addresses.
1586                  * Moreover, we assume that we're only ever dealing
1587                  * with addresses from all_ips so we can identify an
1588                  * address via a pointer rather than doing a more
1589                  * expensive address comparison. */
1590                 if (&(t->addr) == ip) {
1591                         continue;
1592                 }
1593
1594                 d = ip_distance(ip, &(t->addr));
1595                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1596         }
1597
1598         return sum;
1599 }
1600
1601 /* Return the LCP2 imbalance metric for addresses currently assigned
1602    to the given node.
1603  */
1604 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1605 {
1606         struct public_ip_list *t;
1607
1608         uint32_t imbalance = 0;
1609
1610         for (t=all_ips; t!=NULL; t=t->next) {
1611                 if (t->pnn != pnn) {
1612                         continue;
1613                 }
1614                 /* Pass the rest of the IPs rather than the whole
1615                    all_ips input list.
1616                 */
1617                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1618         }
1619
1620         return imbalance;
1621 }
1622
1623 /* Allocate any unassigned IPs just by looping through the IPs and
1624  * finding the best node for each.
1625  */
1626 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1627                                       struct public_ip_list *all_ips)
1628 {
1629         struct public_ip_list *tmp_ip;
1630
1631         /* loop over all ip's and find a physical node to cover for
1632            each unassigned ip.
1633         */
1634         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1635                 if (tmp_ip->pnn == -1) {
1636                         if (find_takeover_node(ipalloc_state,
1637                                                tmp_ip, all_ips)) {
1638                                 DEBUG(DEBUG_WARNING,
1639                                       ("Failed to find node to cover ip %s\n",
1640                                        ctdb_addr_to_str(&tmp_ip->addr)));
1641                         }
1642                 }
1643         }
1644 }
1645
1646 /* Basic non-deterministic rebalancing algorithm.
1647  */
1648 static void basic_failback(struct ipalloc_state *ipalloc_state,
1649                            struct public_ip_list *all_ips,
1650                            int num_ips)
1651 {
1652         int i, numnodes;
1653         int maxnode, maxnum, minnode, minnum, num, retries;
1654         struct public_ip_list *tmp_ip;
1655
1656         numnodes = ipalloc_state->num;
1657         retries = 0;
1658
1659 try_again:
1660         maxnum=0;
1661         minnum=0;
1662
1663         /* for each ip address, loop over all nodes that can serve
1664            this ip and make sure that the difference between the node
1665            serving the most and the node serving the least ip's are
1666            not greater than 1.
1667         */
1668         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1669                 if (tmp_ip->pnn == -1) {
1670                         continue;
1671                 }
1672
1673                 /* Get the highest and lowest number of ips's served by any 
1674                    valid node which can serve this ip.
1675                 */
1676                 maxnode = -1;
1677                 minnode = -1;
1678                 for (i=0; i<numnodes; i++) {
1679                         /* only check nodes that can actually serve this ip */
1680                         if (!can_node_takeover_ip(ipalloc_state, i,
1681                                                   tmp_ip)) {
1682                                 /* no it couldnt   so skip to the next node */
1683                                 continue;
1684                         }
1685
1686                         num = node_ip_coverage(i, all_ips);
1687                         if (maxnode == -1) {
1688                                 maxnode = i;
1689                                 maxnum  = num;
1690                         } else {
1691                                 if (num > maxnum) {
1692                                         maxnode = i;
1693                                         maxnum  = num;
1694                                 }
1695                         }
1696                         if (minnode == -1) {
1697                                 minnode = i;
1698                                 minnum  = num;
1699                         } else {
1700                                 if (num < minnum) {
1701                                         minnode = i;
1702                                         minnum  = num;
1703                                 }
1704                         }
1705                 }
1706                 if (maxnode == -1) {
1707                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1708                                 ctdb_addr_to_str(&tmp_ip->addr)));
1709
1710                         continue;
1711                 }
1712
1713                 /* if the spread between the smallest and largest coverage by
1714                    a node is >=2 we steal one of the ips from the node with
1715                    most coverage to even things out a bit.
1716                    try to do this a limited number of times since we dont
1717                    want to spend too much time balancing the ip coverage.
1718                 */
1719                 if ( (maxnum > minnum+1)
1720                      && (retries < (num_ips + 5)) ){
1721                         struct public_ip_list *tmp;
1722
1723                         /* Reassign one of maxnode's VNNs */
1724                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1725                                 if (tmp->pnn == maxnode) {
1726                                         (void)find_takeover_node(ipalloc_state,
1727                                                                  tmp,
1728                                                                  all_ips);
1729                                         retries++;
1730                                         goto try_again;;
1731                                 }
1732                         }
1733                 }
1734         }
1735 }
1736
1737 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
1738                       struct public_ip_list *all_ips,
1739                       uint32_t *force_rebalance_nodes,
1740                       uint32_t **lcp2_imbalances,
1741                       bool **rebalance_candidates)
1742 {
1743         int i, numnodes;
1744         struct public_ip_list *tmp_ip;
1745
1746         numnodes = ipalloc_state->num;
1747
1748         *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
1749         if (*rebalance_candidates == NULL) {
1750                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1751                 return false;
1752         }
1753         *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
1754         if (*lcp2_imbalances == NULL) {
1755                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1756                 return false;
1757         }
1758
1759         for (i=0; i<numnodes; i++) {
1760                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1761                 /* First step: assume all nodes are candidates */
1762                 (*rebalance_candidates)[i] = true;
1763         }
1764
1765         /* 2nd step: if a node has IPs assigned then it must have been
1766          * healthy before, so we remove it from consideration.  This
1767          * is overkill but is all we have because we don't maintain
1768          * state between takeover runs.  An alternative would be to
1769          * keep state and invalidate it every time the recovery master
1770          * changes.
1771          */
1772         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1773                 if (tmp_ip->pnn != -1) {
1774                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1775                 }
1776         }
1777
1778         /* 3rd step: if a node is forced to re-balance then
1779            we allow failback onto the node */
1780         if (force_rebalance_nodes == NULL) {
1781                 return true;
1782         }
1783         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1784                 uint32_t pnn = force_rebalance_nodes[i];
1785                 if (pnn >= numnodes) {
1786                         DEBUG(DEBUG_ERR,
1787                               (__location__ "unknown node %u\n", pnn));
1788                         continue;
1789                 }
1790
1791                 DEBUG(DEBUG_NOTICE,
1792                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1793                 (*rebalance_candidates)[pnn] = true;
1794         }
1795
1796         return true;
1797 }
1798
1799 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1800  * the IP/node combination that will cost the least.
1801  */
1802 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1803                                      struct public_ip_list *all_ips,
1804                                      uint32_t *lcp2_imbalances)
1805 {
1806         struct public_ip_list *tmp_ip;
1807         int dstnode, numnodes;
1808
1809         int minnode;
1810         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1811         struct public_ip_list *minip;
1812
1813         bool should_loop = true;
1814         bool have_unassigned = true;
1815
1816         numnodes = ipalloc_state->num;
1817
1818         while (have_unassigned && should_loop) {
1819                 should_loop = false;
1820
1821                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1822                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1823
1824                 minnode = -1;
1825                 mindsum = 0;
1826                 minip = NULL;
1827
1828                 /* loop over each unassigned ip. */
1829                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1830                         if (tmp_ip->pnn != -1) {
1831                                 continue;
1832                         }
1833
1834                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1835                                 /* only check nodes that can actually takeover this ip */
1836                                 if (!can_node_takeover_ip(ipalloc_state,
1837                                                           dstnode,
1838                                                           tmp_ip)) {
1839                                         /* no it couldnt   so skip to the next node */
1840                                         continue;
1841                                 }
1842
1843                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1844                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1845                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1846                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1847                                                    dstnode,
1848                                                    dstimbl - lcp2_imbalances[dstnode]));
1849
1850
1851                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1852                                         minnode = dstnode;
1853                                         minimbl = dstimbl;
1854                                         mindsum = dstdsum;
1855                                         minip = tmp_ip;
1856                                         should_loop = true;
1857                                 }
1858                         }
1859                 }
1860
1861                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1862
1863                 /* If we found one then assign it to the given node. */
1864                 if (minnode != -1) {
1865                         minip->pnn = minnode;
1866                         lcp2_imbalances[minnode] = minimbl;
1867                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1868                                           ctdb_addr_to_str(&(minip->addr)),
1869                                           minnode,
1870                                           mindsum));
1871                 }
1872
1873                 /* There might be a better way but at least this is clear. */
1874                 have_unassigned = false;
1875                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1876                         if (tmp_ip->pnn == -1) {
1877                                 have_unassigned = true;
1878                         }
1879                 }
1880         }
1881
1882         /* We know if we have an unassigned addresses so we might as
1883          * well optimise.
1884          */
1885         if (have_unassigned) {
1886                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1887                         if (tmp_ip->pnn == -1) {
1888                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1889                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1890                         }
1891                 }
1892         }
1893 }
1894
1895 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1896  * to move IPs from, determines the best IP/destination node
1897  * combination to move from the source node.
1898  */
1899 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1900                                     struct public_ip_list *all_ips,
1901                                     int srcnode,
1902                                     uint32_t *lcp2_imbalances,
1903                                     bool *rebalance_candidates)
1904 {
1905         int dstnode, mindstnode, numnodes;
1906         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1907         uint32_t minsrcimbl, mindstimbl;
1908         struct public_ip_list *minip;
1909         struct public_ip_list *tmp_ip;
1910
1911         /* Find an IP and destination node that best reduces imbalance. */
1912         srcimbl = 0;
1913         minip = NULL;
1914         minsrcimbl = 0;
1915         mindstnode = -1;
1916         mindstimbl = 0;
1917
1918         numnodes = ipalloc_state->num;
1919
1920         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1921         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1922                            srcnode, lcp2_imbalances[srcnode]));
1923
1924         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1925                 /* Only consider addresses on srcnode. */
1926                 if (tmp_ip->pnn != srcnode) {
1927                         continue;
1928                 }
1929
1930                 /* What is this IP address costing the source node? */
1931                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1932                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1933
1934                 /* Consider this IP address would cost each potential
1935                  * destination node.  Destination nodes are limited to
1936                  * those that are newly healthy, since we don't want
1937                  * to do gratuitous failover of IPs just to make minor
1938                  * balance improvements.
1939                  */
1940                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1941                         if (!rebalance_candidates[dstnode]) {
1942                                 continue;
1943                         }
1944
1945                         /* only check nodes that can actually takeover this ip */
1946                         if (!can_node_takeover_ip(ipalloc_state, dstnode,
1947                                                   tmp_ip)) {
1948                                 /* no it couldnt   so skip to the next node */
1949                                 continue;
1950                         }
1951
1952                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1953                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1954                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1955                                            srcnode, -srcdsum,
1956                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1957                                            dstnode, dstdsum));
1958
1959                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1960                             (dstdsum < srcdsum) &&                      \
1961                             ((mindstnode == -1) ||                              \
1962                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1963
1964                                 minip = tmp_ip;
1965                                 minsrcimbl = srcimbl;
1966                                 mindstnode = dstnode;
1967                                 mindstimbl = dstimbl;
1968                         }
1969                 }
1970         }
1971         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1972
1973         if (mindstnode != -1) {
1974                 /* We found a move that makes things better... */
1975                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1976                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1977                                   ctdb_addr_to_str(&(minip->addr)),
1978                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1979
1980
1981                 lcp2_imbalances[srcnode] = minsrcimbl;
1982                 lcp2_imbalances[mindstnode] = mindstimbl;
1983                 minip->pnn = mindstnode;
1984
1985                 return true;
1986         }
1987
1988         return false;
1989         
1990 }
1991
/* A node number paired with its LCP2 imbalance, used for sorting. */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.
 * Orders entries by descending imbalance: returns -1 when a is more
 * imbalanced than b, 1 when less, 0 when equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2010
2011 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2012  * node with the highest LCP2 imbalance, and then determines the best
2013  * IP/destination node combination to move from the source node.
2014  */
2015 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2016                           struct public_ip_list *all_ips,
2017                           uint32_t *lcp2_imbalances,
2018                           bool *rebalance_candidates)
2019 {
2020         int i, numnodes;
2021         struct lcp2_imbalance_pnn * lips;
2022         bool again;
2023
2024         numnodes = ipalloc_state->num;
2025
2026 try_again:
2027         /* Put the imbalances and nodes into an array, sort them and
2028          * iterate through candidates.  Usually the 1st one will be
2029          * used, so this doesn't cost much...
2030          */
2031         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2032         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2033         lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2034         for (i=0; i<numnodes; i++) {
2035                 lips[i].imbalance = lcp2_imbalances[i];
2036                 lips[i].pnn = i;
2037                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2038         }
2039         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2040               lcp2_cmp_imbalance_pnn);
2041
2042         again = false;
2043         for (i=0; i<numnodes; i++) {
2044                 /* This means that all nodes had 0 or 1 addresses, so
2045                  * can't be imbalanced.
2046                  */
2047                 if (lips[i].imbalance == 0) {
2048                         break;
2049                 }
2050
2051                 if (lcp2_failback_candidate(ipalloc_state,
2052                                             all_ips,
2053                                             lips[i].pnn,
2054                                             lcp2_imbalances,
2055                                             rebalance_candidates)) {
2056                         again = true;
2057                         break;
2058                 }
2059         }
2060
2061         talloc_free(lips);
2062         if (again) {
2063                 goto try_again;
2064         }
2065 }
2066
2067 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
2068                                     struct public_ip_list *all_ips)
2069 {
2070         struct public_ip_list *tmp_ip;
2071
2072         /* verify that the assigned nodes can serve that public ip
2073            and set it to -1 if not
2074         */
2075         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2076                 if (tmp_ip->pnn == -1) {
2077                         continue;
2078                 }
2079                 if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
2080                                       tmp_ip) != 0) {
2081                         /* this node can not serve this ip. */
2082                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2083                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2084                                            tmp_ip->pnn));
2085                         tmp_ip->pnn = -1;
2086                 }
2087         }
2088 }
2089
2090 static bool ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
2091                                        struct public_ip_list *all_ips)
2092 {
2093         struct public_ip_list *tmp_ip;
2094         int i, numnodes;
2095
2096         numnodes = ipalloc_state->num;
2097
2098         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2099        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2100         *  always be allocated the same way for a specific set of
2101         *  available/unavailable nodes.
2102         */
2103
2104         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2105                 tmp_ip->pnn = i % numnodes;
2106         }
2107
2108         /* IP failback doesn't make sense with deterministic
2109          * IPs, since the modulo step above implicitly fails
2110          * back IPs to their "home" node.
2111          */
2112         if (1 == ipalloc_state->no_ip_failback) {
2113                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2114         }
2115
2116         unassign_unsuitable_ips(ipalloc_state, all_ips);
2117
2118         basic_allocate_unassigned(ipalloc_state, all_ips);
2119
2120         /* No failback here! */
2121
2122         return true;
2123 }
2124
2125 static bool ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
2126                                           struct public_ip_list *all_ips)
2127 {
2128         /* This should be pushed down into basic_failback. */
2129         struct public_ip_list *tmp_ip;
2130         int num_ips = 0;
2131         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2132                 num_ips++;
2133         }
2134
2135         unassign_unsuitable_ips(ipalloc_state, all_ips);
2136
2137         basic_allocate_unassigned(ipalloc_state, all_ips);
2138
2139         /* If we don't want IPs to fail back then don't rebalance IPs. */
2140         if (1 == ipalloc_state->no_ip_failback) {
2141                 return true;
2142         }
2143
2144         /* Now, try to make sure the ip adresses are evenly distributed
2145            across the nodes.
2146         */
2147         basic_failback(ipalloc_state, all_ips, num_ips);
2148
2149         return true;
2150 }
2151
2152 static bool ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
2153                           struct public_ip_list *all_ips,
2154                           uint32_t *force_rebalance_nodes)
2155 {
2156         uint32_t *lcp2_imbalances;
2157         bool *rebalance_candidates;
2158         int numnodes, num_rebalance_candidates, i;
2159         bool ret = true;
2160
2161         unassign_unsuitable_ips(ipalloc_state, all_ips);
2162
2163         if (!lcp2_init(ipalloc_state, all_ips,force_rebalance_nodes,
2164                        &lcp2_imbalances, &rebalance_candidates)) {
2165                 ret = false;
2166                 goto finished;
2167         }
2168
2169         lcp2_allocate_unassigned(ipalloc_state, all_ips, lcp2_imbalances);
2170
2171         /* If we don't want IPs to fail back then don't rebalance IPs. */
2172         if (1 == ipalloc_state->no_ip_failback) {
2173                 goto finished;
2174         }
2175
2176         /* It is only worth continuing if we have suitable target
2177          * nodes to transfer IPs to.  This check is much cheaper than
2178          * continuing on...
2179          */
2180         numnodes = ipalloc_state->num;
2181         num_rebalance_candidates = 0;
2182         for (i=0; i<numnodes; i++) {
2183                 if (rebalance_candidates[i]) {
2184                         num_rebalance_candidates++;
2185                 }
2186         }
2187         if (num_rebalance_candidates == 0) {
2188                 goto finished;
2189         }
2190
2191         /* Now, try to make sure the ip adresses are evenly distributed
2192            across the nodes.
2193         */
2194         lcp2_failback(ipalloc_state, all_ips,
2195                       lcp2_imbalances, rebalance_candidates);
2196
2197 finished:
2198         return ret;
2199 }
2200
2201 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2202 {
2203         int i;
2204
2205         for (i=0;i<nodemap->num;i++) {
2206                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2207                         /* Found one completely healthy node */
2208                         return false;
2209                 }
2210         }
2211
2212         return true;
2213 }
2214
2215 /* The calculation part of the IP allocation algorithm. */
2216 static bool ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
2217                                    struct public_ip_list *all_ips,
2218                                    uint32_t *force_rebalance_nodes)
2219 {
2220         bool ret;
2221
2222         switch (ipalloc_state->algorithm) {
2223         case IPALLOC_LCP2:
2224                 ret = ip_alloc_lcp2(ipalloc_state, all_ips,
2225                                     force_rebalance_nodes);
2226                 break;
2227         case IPALLOC_DETERMINISTIC:
2228                 ret = ip_alloc_deterministic_ips(ipalloc_state, all_ips);
2229                 break;
2230         case IPALLOC_NONDETERMINISTIC:
2231                 ret = ip_alloc_nondeterministic_ips(ipalloc_state, all_ips);
2232                break;
2233         }
2234
2235         /* at this point ->pnn is the node which will own each IP
2236            or -1 if there is no node that can cover this ip
2237         */
2238
2239         return ret;
2240 }
2241
2242 struct get_tunable_callback_data {
2243         const char *tunable;
2244         uint32_t *out;
2245         bool fatal;
2246 };
2247
2248 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2249                                  int32_t res, TDB_DATA outdata,
2250                                  void *callback)
2251 {
2252         struct get_tunable_callback_data *cd =
2253                 (struct get_tunable_callback_data *)callback;
2254         int size;
2255
2256         if (res != 0) {
2257                 /* Already handled in fail callback */
2258                 return;
2259         }
2260
2261         if (outdata.dsize != sizeof(uint32_t)) {
2262                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2263                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2264                                  (int)outdata.dsize));
2265                 cd->fatal = true;
2266                 return;
2267         }
2268
2269         size = talloc_array_length(cd->out);
2270         if (pnn >= size) {
2271                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2272                                  cd->tunable, pnn, size));
2273                 return;
2274         }
2275
2276                 
2277         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2278 }
2279
2280 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2281                                        int32_t res, TDB_DATA outdata,
2282                                        void *callback)
2283 {
2284         struct get_tunable_callback_data *cd =
2285                 (struct get_tunable_callback_data *)callback;
2286
2287         switch (res) {
2288         case -ETIME:
2289                 DEBUG(DEBUG_ERR,
2290                       ("Timed out getting tunable \"%s\" from node %d\n",
2291                        cd->tunable, pnn));
2292                 cd->fatal = true;
2293                 break;
2294         case -EINVAL:
2295         case -1:
2296                 DEBUG(DEBUG_WARNING,
2297                       ("Tunable \"%s\" not implemented on node %d\n",
2298                        cd->tunable, pnn));
2299                 break;
2300         default:
2301                 DEBUG(DEBUG_ERR,
2302                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2303                        cd->tunable, pnn));
2304                 cd->fatal = true;
2305         }
2306 }
2307
2308 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2309                                         TALLOC_CTX *tmp_ctx,
2310                                         struct ctdb_node_map_old *nodemap,
2311                                         const char *tunable,
2312                                         uint32_t default_value)
2313 {
2314         TDB_DATA data;
2315         struct ctdb_control_get_tunable *t;
2316         uint32_t *nodes;
2317         uint32_t *tvals;
2318         struct get_tunable_callback_data callback_data;
2319         int i;
2320
2321         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2322         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2323         for (i=0; i<nodemap->num; i++) {
2324                 tvals[i] = default_value;
2325         }
2326                 
2327         callback_data.out = tvals;
2328         callback_data.tunable = tunable;
2329         callback_data.fatal = false;
2330
2331         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2332         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2333         t = (struct ctdb_control_get_tunable *)data.dptr;
2334         t->length = strlen(tunable)+1;
2335         memcpy(t->name, tunable, t->length);
2336         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2337         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2338                                       nodes, 0, TAKEOVER_TIMEOUT(),
2339                                       false, data,
2340                                       get_tunable_callback,
2341                                       get_tunable_fail_callback,
2342                                       &callback_data) != 0) {
2343                 if (callback_data.fatal) {
2344                         talloc_free(tvals);
2345                         tvals = NULL;
2346                 }
2347         }
2348         talloc_free(nodes);
2349         talloc_free(data.dptr);
2350
2351         return tvals;
2352 }
2353
2354 /* Set internal flags for IP allocation:
2355  *   Clear ip flags
2356  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2357  *   Set NOIPHOST ip flag for each INACTIVE node
2358  *   if all nodes are disabled:
2359  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2360  *   else
2361  *     Set NOIPHOST ip flags for disabled nodes
2362  */
2363 static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
2364                                  struct ctdb_node_map_old *nodemap,
2365                                  uint32_t *tval_noiptakeover,
2366                                  uint32_t *tval_noiphostonalldisabled)
2367 {
2368         int i;
2369
2370         /* IP flags cleared at this point - implicit due to talloc_zero */
2371
2372         for (i=0;i<nodemap->num;i++) {
2373                 /* Can not take IPs on node with NoIPTakeover set */
2374                 if (tval_noiptakeover[i] != 0) {
2375                         ipalloc_state->ipflags[i].noiptakeover = true;
2376                 }
2377
2378                 /* Can not host IPs on INACTIVE node */
2379                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2380                         ipalloc_state->ipflags[i].noiphost = true;
2381                 }
2382         }
2383
2384         if (all_nodes_are_disabled(nodemap)) {
2385                 /* If all nodes are disabled, can not host IPs on node
2386                  * with NoIPHostOnAllDisabled set
2387                  */
2388                 for (i=0;i<nodemap->num;i++) {
2389                         if (tval_noiphostonalldisabled[i] != 0) {
2390                                 ipalloc_state->ipflags[i].noiphost = true;
2391                         }
2392                 }
2393         } else {
2394                 /* If some nodes are not disabled, then can not host
2395                  * IPs on DISABLED node
2396                  */
2397                 for (i=0;i<nodemap->num;i++) {
2398                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2399                                 ipalloc_state->ipflags[i].noiphost = true;
2400                         }
2401                 }
2402         }
2403 }
2404
2405 static bool set_ipflags(struct ctdb_context *ctdb,
2406                         struct ipalloc_state *ipalloc_state,
2407                         struct ctdb_node_map_old *nodemap)
2408 {
2409         uint32_t *tval_noiptakeover;
2410         uint32_t *tval_noiphostonalldisabled;
2411
2412         tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
2413                                                    "NoIPTakeover", 0);
2414         if (tval_noiptakeover == NULL) {
2415                 return false;
2416         }
2417
2418         tval_noiphostonalldisabled =
2419                 get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
2420                                        "NoIPHostOnAllDisabled", 0);
2421         if (tval_noiphostonalldisabled == NULL) {
2422                 /* Caller frees tmp_ctx */
2423                 return false;
2424         }
2425
2426         set_ipflags_internal(ipalloc_state, nodemap,
2427                              tval_noiptakeover,
2428                              tval_noiphostonalldisabled);
2429
2430         talloc_free(tval_noiptakeover);
2431         talloc_free(tval_noiphostonalldisabled);
2432
2433         return true;
2434 }
2435
2436 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2437                                                  TALLOC_CTX *mem_ctx)
2438 {
2439         struct ipalloc_state *ipalloc_state =
2440                 talloc_zero(mem_ctx, struct ipalloc_state);
2441         if (ipalloc_state == NULL) {
2442                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2443                 return NULL;
2444         }
2445
2446         ipalloc_state->num = ctdb->num_nodes;
2447         ipalloc_state->known_public_ips =
2448                 talloc_zero_array(ipalloc_state,
2449                                   struct ctdb_public_ip_list_old *,
2450                                   ipalloc_state->num);
2451         if (ipalloc_state->known_public_ips == NULL) {
2452                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2453                 talloc_free(ipalloc_state);
2454                 return NULL;
2455         }
2456         ipalloc_state->available_public_ips =
2457                 talloc_zero_array(ipalloc_state,
2458                                   struct ctdb_public_ip_list_old *,
2459                                   ipalloc_state->num);
2460         if (ipalloc_state->available_public_ips == NULL) {
2461                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2462                 talloc_free(ipalloc_state);
2463                 return NULL;
2464         }
2465         ipalloc_state->ipflags =
2466                 talloc_zero_array(ipalloc_state,
2467                                   struct ctdb_ipflags,
2468                                   ipalloc_state->num);
2469         if (ipalloc_state->ipflags == NULL) {
2470                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
2471                 talloc_free(ipalloc_state);
2472                 return NULL;
2473         }
2474
2475         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2476                 ipalloc_state->algorithm = IPALLOC_LCP2;
2477         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2478                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2479         } else {
2480                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2481         }
2482
2483         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2484
2485         return ipalloc_state;
2486 }
2487
2488 struct iprealloc_callback_data {
2489         bool *retry_nodes;
2490         int retry_count;
2491         client_async_callback fail_callback;
2492         void *fail_callback_data;
2493         struct ctdb_node_map_old *nodemap;
2494 };
2495
2496 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2497                                         int32_t res, TDB_DATA outdata,
2498                                         void *callback)
2499 {
2500         int numnodes;
2501         struct iprealloc_callback_data *cd =
2502                 (struct iprealloc_callback_data *)callback;
2503
2504         numnodes = talloc_array_length(cd->retry_nodes);
2505         if (pnn > numnodes) {
2506                 DEBUG(DEBUG_ERR,
2507                       ("ipreallocated failure from node %d, "
2508                        "but only %d nodes in nodemap\n",
2509                        pnn, numnodes));
2510                 return;
2511         }
2512
2513         /* Can't run the "ipreallocated" event on a INACTIVE node */
2514         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2515                 DEBUG(DEBUG_WARNING,
2516                       ("ipreallocated failed on inactive node %d, ignoring\n",
2517                        pnn));
2518                 return;
2519         }
2520
2521         switch (res) {
2522         case -ETIME:
2523                 /* If the control timed out then that's a real error,
2524                  * so call the real fail callback
2525                  */
2526                 if (cd->fail_callback) {
2527                         cd->fail_callback(ctdb, pnn, res, outdata,
2528                                           cd->fail_callback_data);
2529                 } else {
2530                         DEBUG(DEBUG_WARNING,
2531                               ("iprealloc timed out but no callback registered\n"));
2532                 }
2533                 break;
2534         default:
2535                 /* If not a timeout then either the ipreallocated
2536                  * eventscript (or some setup) failed.  This might
2537                  * have failed because the IPREALLOCATED control isn't
2538                  * implemented - right now there is no way of knowing
2539                  * because the error codes are all folded down to -1.
2540                  * Consider retrying using EVENTSCRIPT control...
2541                  */
2542                 DEBUG(DEBUG_WARNING,
2543                       ("ipreallocated failure from node %d, flagging retry\n",
2544                        pnn));
2545                 cd->retry_nodes[pnn] = true;
2546                 cd->retry_count++;
2547         }
2548 }
2549
2550 struct takeover_callback_data {
2551         bool *node_failed;
2552         client_async_callback fail_callback;
2553         void *fail_callback_data;
2554         struct ctdb_node_map_old *nodemap;
2555 };
2556
2557 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2558                                        uint32_t node_pnn, int32_t res,
2559                                        TDB_DATA outdata, void *callback_data)
2560 {
2561         struct takeover_callback_data *cd =
2562                 talloc_get_type_abort(callback_data,
2563                                       struct takeover_callback_data);
2564         int i;
2565
2566         for (i = 0; i < cd->nodemap->num; i++) {
2567                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2568                         break;
2569                 }
2570         }
2571
2572         if (i == cd->nodemap->num) {
2573                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2574                 return;
2575         }
2576
2577         if (!cd->node_failed[i]) {
2578                 cd->node_failed[i] = true;
2579                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2580                                   cd->fail_callback_data);
2581         }
2582 }
2583
2584 /*
2585   make any IP alias changes for public addresses that are necessary 
2586  */
2587 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2588                       uint32_t *force_rebalance_nodes,
2589                       client_async_callback fail_callback, void *callback_data)
2590 {
2591         int i, j, ret;
2592         struct ctdb_public_ip ip;
2593         uint32_t *nodes;
2594         struct public_ip_list *all_ips, *tmp_ip;
2595         TDB_DATA data;
2596         struct timeval timeout;
2597         struct client_async_data *async_data;
2598         struct ctdb_client_control_state *state;
2599         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2600         struct ipalloc_state *ipalloc_state;
2601         struct takeover_callback_data *takeover_data;
2602         struct iprealloc_callback_data iprealloc_data;
2603         bool *retry_data;
2604         bool can_host_ips;
2605
2606         /*
2607          * ip failover is completely disabled, just send out the 
2608          * ipreallocated event.
2609          */
2610         if (ctdb->tunable.disable_ip_failover != 0) {
2611                 goto ipreallocated;
2612         }
2613
2614         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2615         if (ipalloc_state == NULL) {
2616                 talloc_free(tmp_ctx);
2617                 return -1;
2618         }
2619
2620         if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
2621                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2622                 talloc_free(tmp_ctx);
2623                 return -1;
2624         }
2625
2626         /* Fetch known/available public IPs from each active node */
2627         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2628         if (ret != 0) {
2629                 talloc_free(tmp_ctx);
2630                 return -1;
2631         }
2632
2633         /* Short-circuit IP allocation if no node has available IPs */
2634         can_host_ips = false;
2635         for (i=0; i < ipalloc_state->num; i++) {
2636                 if (ipalloc_state->available_public_ips[i] != NULL) {
2637                         can_host_ips = true;
2638                 }
2639         }
2640         if (!can_host_ips) {
2641                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2642                 return 0;
2643         }
2644
2645         /* since nodes only know about those public addresses that
2646            can be served by that particular node, no single node has
2647            a full list of all public addresses that exist in the cluster.
2648            Walk over all node structures and create a merged list of
2649            all public addresses that exist in the cluster.
2650
2651            keep the tree of ips around as ctdb->ip_tree
2652         */
2653         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2654
2655         /* Do the IP reassignment calculations */
2656         ctdb_takeover_run_core(ipalloc_state,
2657                                all_ips, force_rebalance_nodes);
2658
2659         /* Now tell all nodes to release any public IPs should not
2660          * host.  This will be a NOOP on nodes that don't currently
2661          * hold the given IP.
2662          */
2663         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2664         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2665
2666         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2667                                                        bool, nodemap->num);
2668         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2669         takeover_data->fail_callback = fail_callback;
2670         takeover_data->fail_callback_data = callback_data;
2671         takeover_data->nodemap = nodemap;
2672
2673         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2674         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2675
2676         async_data->fail_callback = takeover_run_fail_callback;
2677         async_data->callback_data = takeover_data;
2678
2679         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2680
2681         /* Send a RELEASE_IP to all nodes that should not be hosting
2682          * each IP.  For each IP, all but one of these will be
2683          * redundant.  However, the redundant ones are used to tell
2684          * nodes which node should be hosting the IP so that commands
2685          * like "ctdb ip" can display a particular nodes idea of who
2686          * is hosting what. */
2687         for (i=0;i<nodemap->num;i++) {
2688                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2689                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2690                         continue;
2691                 }
2692
2693                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2694                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2695                                 /* This node should be serving this
2696                                    vnn so don't tell it to release the ip
2697                                 */
2698                                 continue;
2699                         }
2700                         ip.pnn  = tmp_ip->pnn;
2701                         ip.addr = tmp_ip->addr;
2702
2703                         timeout = TAKEOVER_TIMEOUT();
2704                         data.dsize = sizeof(ip);
2705                         data.dptr  = (uint8_t *)&ip;
2706                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2707                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2708                                                   data, async_data,
2709                                                   &timeout, NULL);
2710                         if (state == NULL) {
2711                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2712                                 talloc_free(tmp_ctx);
2713                                 return -1;
2714                         }
2715
2716                         ctdb_client_async_add(async_data, state);
2717                 }
2718         }
2719         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2720                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2721                 talloc_free(tmp_ctx);
2722                 return -1;
2723         }
2724         talloc_free(async_data);
2725
2726
2727         /* For each IP, send a TAKOVER_IP to the node that should be
2728          * hosting it.  Many of these will often be redundant (since
2729          * the allocation won't have changed) but they can be useful
2730          * to recover from inconsistencies. */
2731         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2732         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2733
2734         async_data->fail_callback = fail_callback;
2735         async_data->callback_data = callback_data;
2736
2737         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2738                 if (tmp_ip->pnn == -1) {
2739                         /* this IP won't be taken over */
2740                         continue;
2741                 }
2742
2743                 ip.pnn  = tmp_ip->pnn;
2744                 ip.addr = tmp_ip->addr;
2745
2746                 timeout = TAKEOVER_TIMEOUT();
2747                 data.dsize = sizeof(ip);
2748                 data.dptr  = (uint8_t *)&ip;
2749                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2750                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2751                                           data, async_data, &timeout, NULL);
2752                 if (state == NULL) {
2753                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2754                         talloc_free(tmp_ctx);
2755                         return -1;
2756                 }
2757
2758                 ctdb_client_async_add(async_data, state);
2759         }
2760         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2761                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2762                 talloc_free(tmp_ctx);
2763                 return -1;
2764         }
2765
2766 ipreallocated:
2767         /*
2768          * Tell all nodes to run eventscripts to process the
2769          * "ipreallocated" event.  This can do a lot of things,
2770          * including restarting services to reconfigure them if public
2771          * IPs have moved.  Once upon a time this event only used to
2772          * update natgw.
2773          */
2774         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2775         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2776         iprealloc_data.retry_nodes = retry_data;
2777         iprealloc_data.retry_count = 0;
2778         iprealloc_data.fail_callback = fail_callback;
2779         iprealloc_data.fail_callback_data = callback_data;
2780         iprealloc_data.nodemap = nodemap;
2781
2782         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2783         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2784                                         nodes, 0, TAKEOVER_TIMEOUT(),
2785                                         false, tdb_null,
2786                                         NULL, iprealloc_fail_callback,
2787                                         &iprealloc_data);
2788         if (ret != 0) {
2789                 /* If the control failed then we should retry to any
2790                  * nodes flagged by iprealloc_fail_callback using the
2791                  * EVENTSCRIPT control.  This is a best-effort at
2792                  * backward compatiblity when running a mixed cluster
2793                  * where some nodes have not yet been upgraded to
2794                  * support the IPREALLOCATED control.
2795                  */
2796                 DEBUG(DEBUG_WARNING,
2797                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2798
2799                 nodes = talloc_array(tmp_ctx, uint32_t,
2800                                      iprealloc_data.retry_count);
2801                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2802
2803                 j = 0;
2804                 for (i=0; i<nodemap->num; i++) {
2805                         if (iprealloc_data.retry_nodes[i]) {
2806                                 nodes[j] = i;
2807                                 j++;
2808                         }
2809                 }
2810
2811                 data.dptr  = discard_const("ipreallocated");
2812                 data.dsize = strlen((char *)data.dptr) + 1; 
2813                 ret = ctdb_client_async_control(ctdb,
2814                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2815                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2816                                                 false, data,
2817                                                 NULL, fail_callback,
2818                                                 callback_data);
2819                 if (ret != 0) {
2820                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2821                 }
2822         }
2823
2824         talloc_free(tmp_ctx);
2825         return ret;
2826 }
2827
2828
2829 /*
2830   destroy a ctdb_client_ip structure
2831  */
2832 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2833 {
2834         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2835                 ctdb_addr_to_str(&ip->addr),
2836                 ntohs(ip->addr.ip.sin_port),
2837                 ip->client_id));
2838
2839         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2840         return 0;
2841 }
2842
2843 /*
2844   called by a client to inform us of a TCP connection that it is managing
2845   that should tickled with an ACK when IP takeover is done
2846  */
2847 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2848                                 TDB_DATA indata)
2849 {
2850         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2851         struct ctdb_connection *tcp_sock = NULL;
2852         struct ctdb_tcp_list *tcp;
2853         struct ctdb_connection t;
2854         int ret;
2855         TDB_DATA data;
2856         struct ctdb_client_ip *ip;
2857         struct ctdb_vnn *vnn;
2858         ctdb_sock_addr addr;
2859
2860         /* If we don't have public IPs, tickles are useless */
2861         if (ctdb->vnn == NULL) {
2862                 return 0;
2863         }
2864
2865         tcp_sock = (struct ctdb_connection *)indata.dptr;
2866
2867         addr = tcp_sock->src;
2868         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2869         addr = tcp_sock->dst;
2870         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2871
2872         ZERO_STRUCT(addr);
2873         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2874         vnn = find_public_ip_vnn(ctdb, &addr);
2875         if (vnn == NULL) {
2876                 switch (addr.sa.sa_family) {
2877                 case AF_INET:
2878                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2879                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2880                                         ctdb_addr_to_str(&addr)));
2881                         }
2882                         break;
2883                 case AF_INET6:
2884                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2885                                 ctdb_addr_to_str(&addr)));
2886                         break;
2887                 default:
2888                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2889                 }
2890
2891                 return 0;
2892         }
2893
2894         if (vnn->pnn != ctdb->pnn) {
2895                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2896                         ctdb_addr_to_str(&addr),
2897                         client_id, client->pid));
2898                 /* failing this call will tell smbd to die */
2899                 return -1;
2900         }
2901
2902         ip = talloc(client, struct ctdb_client_ip);
2903         CTDB_NO_MEMORY(ctdb, ip);
2904
2905         ip->ctdb      = ctdb;
2906         ip->addr      = addr;
2907         ip->client_id = client_id;
2908         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2909         DLIST_ADD(ctdb->client_ip_list, ip);
2910
2911         tcp = talloc(client, struct ctdb_tcp_list);
2912         CTDB_NO_MEMORY(ctdb, tcp);
2913
2914         tcp->connection.src = tcp_sock->src;
2915         tcp->connection.dst = tcp_sock->dst;
2916
2917         DLIST_ADD(client->tcp_list, tcp);
2918
2919         t.src = tcp_sock->src;
2920         t.dst = tcp_sock->dst;
2921
2922         data.dptr = (uint8_t *)&t;
2923         data.dsize = sizeof(t);
2924
2925         switch (addr.sa.sa_family) {
2926         case AF_INET:
2927                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2928                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2929                         ctdb_addr_to_str(&tcp_sock->src),
2930                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2931                 break;
2932         case AF_INET6:
2933                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2934                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2935                         ctdb_addr_to_str(&tcp_sock->src),
2936                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2937                 break;
2938         default:
2939                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2940         }
2941
2942
2943         /* tell all nodes about this tcp connection */
2944         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2945                                        CTDB_CONTROL_TCP_ADD,
2946                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2947         if (ret != 0) {
2948                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2949                 return -1;
2950         }
2951
2952         return 0;
2953 }
2954
2955 /*
2956   find a tcp address on a list
2957  */
2958 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2959                                            struct ctdb_connection *tcp)
2960 {
2961         int i;
2962
2963         if (array == NULL) {
2964                 return NULL;
2965         }
2966
2967         for (i=0;i<array->num;i++) {
2968                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2969                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2970                         return &array->connections[i];
2971                 }
2972         }
2973         return NULL;
2974 }
2975
2976
2977
2978 /*
2979   called by a daemon to inform us of a TCP connection that one of its
2980   clients managing that should tickled with an ACK when IP takeover is
2981   done
2982  */
2983 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2984 {
2985         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2986         struct ctdb_tcp_array *tcparray;
2987         struct ctdb_connection tcp;
2988         struct ctdb_vnn *vnn;
2989
2990         /* If we don't have public IPs, tickles are useless */
2991         if (ctdb->vnn == NULL) {
2992                 return 0;
2993         }
2994
2995         vnn = find_public_ip_vnn(ctdb, &p->dst);
2996         if (vnn == NULL) {
2997                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2998                         ctdb_addr_to_str(&p->dst)));
2999
3000                 return -1;
3001         }
3002
3003
3004         tcparray = vnn->tcp_array;
3005
3006         /* If this is the first tickle */
3007         if (tcparray == NULL) {
3008                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3009                 CTDB_NO_MEMORY(ctdb, tcparray);
3010                 vnn->tcp_array = tcparray;
3011
3012                 tcparray->num = 0;
3013                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3014                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3015
3016                 tcparray->connections[tcparray->num].src = p->src;
3017                 tcparray->connections[tcparray->num].dst = p->dst;
3018                 tcparray->num++;
3019
3020                 if (tcp_update_needed) {
3021                         vnn->tcp_update_needed = true;
3022                 }
3023                 return 0;
3024         }
3025
3026
3027         /* Do we already have this tickle ?*/
3028         tcp.src = p->src;
3029         tcp.dst = p->dst;
3030         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3031                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3032                         ctdb_addr_to_str(&tcp.dst),
3033                         ntohs(tcp.dst.ip.sin_port),
3034                         vnn->pnn));
3035                 return 0;
3036         }
3037
3038         /* A new tickle, we must add it to the array */
3039         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3040                                         struct ctdb_connection,
3041                                         tcparray->num+1);
3042         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3043
3044         tcparray->connections[tcparray->num].src = p->src;
3045         tcparray->connections[tcparray->num].dst = p->dst;
3046         tcparray->num++;
3047
3048         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3049                 ctdb_addr_to_str(&tcp.dst),
3050                 ntohs(tcp.dst.ip.sin_port),
3051                 vnn->pnn));
3052
3053         if (tcp_update_needed) {
3054                 vnn->tcp_update_needed = true;
3055         }
3056
3057         return 0;
3058 }
3059
3060
3061 /*
3062   called by a daemon to inform us of a TCP connection that one of its
3063   clients managing that should tickled with an ACK when IP takeover is
3064   done
3065  */
3066 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3067 {
3068         struct ctdb_connection *tcpp;
3069         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3070
3071         if (vnn == NULL) {
3072                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3073                         ctdb_addr_to_str(&conn->dst)));
3074                 return;
3075         }
3076
3077         /* if the array is empty we cant remove it
3078            and we don't need to do anything
3079          */
3080         if (vnn->tcp_array == NULL) {
3081                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3082                         ctdb_addr_to_str(&conn->dst),
3083                         ntohs(conn->dst.ip.sin_port)));
3084                 return;
3085         }
3086
3087
3088         /* See if we know this connection
3089            if we don't know this connection  then we dont need to do anything
3090          */
3091         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3092         if (tcpp == NULL) {
3093                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3094                         ctdb_addr_to_str(&conn->dst),
3095                         ntohs(conn->dst.ip.sin_port)));
3096                 return;
3097         }
3098
3099
3100         /* We need to remove this entry from the array.
3101            Instead of allocating a new array and copying data to it
3102            we cheat and just copy the last entry in the existing array
3103            to the entry that is to be removed and just shring the 
3104            ->num field
3105          */
3106         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3107         vnn->tcp_array->num--;
3108
3109         /* If we deleted the last entry we also need to remove the entire array
3110          */
3111         if (vnn->tcp_array->num == 0) {
3112                 talloc_free(vnn->tcp_array);
3113                 vnn->tcp_array = NULL;
3114         }               
3115
3116         vnn->tcp_update_needed = true;
3117
3118         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3119                 ctdb_addr_to_str(&conn->src),
3120                 ntohs(conn->src.ip.sin_port)));
3121 }
3122
3123
3124 /*
3125   called by a daemon to inform us of a TCP connection that one of its
3126   clients used are no longer needed in the tickle database
3127  */
3128 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3129 {
3130         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3131
3132         /* If we don't have public IPs, tickles are useless */
3133         if (ctdb->vnn == NULL) {
3134                 return 0;
3135         }
3136
3137         ctdb_remove_connection(ctdb, conn);
3138
3139         return 0;
3140 }
3141
3142
3143 /*
3144   Called when another daemon starts - causes all tickles for all
3145   public addresses we are serving to be sent to the new node on the
3146   next check.  This actually causes the next scheduled call to
3147   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3148   doesn't require careful error handling.
3149  */
3150 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3151 {
3152         struct ctdb_vnn *vnn;
3153
3154         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3155                            (unsigned long) pnn));
3156
3157         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3158                 vnn->tcp_update_needed = true;
3159         }
3160
3161         return 0;
3162 }
3163
3164
3165 /*
3166   called when a client structure goes away - hook to remove
3167   elements from the tcp_list in all daemons
3168  */
3169 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3170 {
3171         while (client->tcp_list) {
3172                 struct ctdb_tcp_list *tcp = client->tcp_list;
3173                 DLIST_REMOVE(client->tcp_list, tcp);
3174                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3175         }
3176 }
3177
3178
3179 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3180 {
3181         struct ctdb_vnn *vnn;
3182         int count = 0;
3183
3184         if (ctdb->tunable.disable_ip_failover == 1) {
3185                 return;
3186         }
3187
3188         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3189                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3190                         ctdb_vnn_unassign_iface(ctdb, vnn);
3191                         continue;
3192                 }
3193                 if (!vnn->iface) {
3194                         continue;
3195                 }
3196
3197                 /* Don't allow multiple releases at once.  Some code,
3198                  * particularly ctdb_tickle_sentenced_connections() is
3199                  * not re-entrant */
3200                 if (vnn->update_in_flight) {
3201                         DEBUG(DEBUG_WARNING,
3202                               (__location__
3203                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3204                                     ctdb_addr_to_str(&vnn->public_address),
3205                                     vnn->public_netmask_bits,
3206                                     ctdb_vnn_iface_string(vnn)));
3207                         continue;
3208                 }
3209                 vnn->update_in_flight = true;
3210
3211                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3212                                     ctdb_addr_to_str(&vnn->public_address),
3213                                     vnn->public_netmask_bits,
3214                                     ctdb_vnn_iface_string(vnn)));
3215
3216                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3217                                   ctdb_vnn_iface_string(vnn),
3218                                   ctdb_addr_to_str(&vnn->public_address),
3219                                   vnn->public_netmask_bits);
3220                 release_kill_clients(ctdb, &vnn->public_address);
3221                 ctdb_vnn_unassign_iface(ctdb, vnn);
3222                 vnn->update_in_flight = false;
3223                 count++;
3224         }
3225
3226         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3227 }
3228
3229
3230 /*
3231   get list of public IPs
3232  */
3233 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3234                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3235 {
3236         int i, num, len;
3237         struct ctdb_public_ip_list_old *ips;
3238         struct ctdb_vnn *vnn;
3239         bool only_available = false;
3240
3241         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3242                 only_available = true;
3243         }
3244
3245         /* count how many public ip structures we have */
3246         num = 0;
3247         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3248                 num++;
3249         }
3250
3251         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3252                 num*sizeof(struct ctdb_public_ip);
3253         ips = talloc_zero_size(outdata, len);
3254         CTDB_NO_MEMORY(ctdb, ips);
3255
3256         i = 0;
3257         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3258                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3259                         continue;
3260                 }
3261                 ips->ips[i].pnn  = vnn->pnn;
3262                 ips->ips[i].addr = vnn->public_address;
3263                 i++;
3264         }
3265         ips->num = i;
3266         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3267                 i*sizeof(struct ctdb_public_ip);
3268
3269         outdata->dsize = len;
3270         outdata->dptr  = (uint8_t *)ips;
3271
3272         return 0;
3273 }
3274
3275
3276 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3277                                         struct ctdb_req_control_old *c,
3278                                         TDB_DATA indata,
3279                                         TDB_DATA *outdata)
3280 {
3281         int i, num, len;
3282         ctdb_sock_addr *addr;
3283         struct ctdb_public_ip_info_old *info;
3284         struct ctdb_vnn *vnn;
3285
3286         addr = (ctdb_sock_addr *)indata.dptr;
3287
3288         vnn = find_public_ip_vnn(ctdb, addr);
3289         if (vnn == NULL) {
3290                 /* if it is not a public ip   it could be our 'single ip' */
3291                 if (ctdb->single_ip_vnn) {
3292                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3293                                 vnn = ctdb->single_ip_vnn;
3294                         }
3295                 }
3296         }
3297         if (vnn == NULL) {
3298                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3299                                  "'%s'not a public address\n",
3300                                  ctdb_addr_to_str(addr)));
3301                 return -1;
3302         }
3303
3304         /* count how many public ip structures we have */
3305         num = 0;
3306         for (;vnn->ifaces[num];) {
3307                 num++;
3308         }
3309
3310         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3311                 num*sizeof(struct ctdb_iface);
3312         info = talloc_zero_size(outdata, len);
3313         CTDB_NO_MEMORY(ctdb, info);
3314
3315         info->ip.addr = vnn->public_address;
3316         info->ip.pnn = vnn->pnn;
3317         info->active_idx = 0xFFFFFFFF;
3318
3319         for (i=0; vnn->ifaces[i]; i++) {
3320                 struct ctdb_interface *cur;
3321
3322                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3323                 if (cur == NULL) {
3324                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3325                                            vnn->ifaces[i]));
3326                         return -1;
3327                 }
3328                 if (vnn->iface == cur) {
3329                         info->active_idx = i;
3330                 }
3331                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3332                 info->ifaces[i].link_state = cur->link_up;
3333                 info->ifaces[i].references = cur->references;
3334         }
3335         info->num = i;
3336         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3337                 i*sizeof(struct ctdb_iface);
3338
3339         outdata->dsize = len;
3340         outdata->dptr  = (uint8_t *)info;
3341
3342         return 0;
3343 }
3344
3345 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3346                                 struct ctdb_req_control_old *c,
3347                                 TDB_DATA *outdata)
3348 {
3349         int i, num, len;
3350         struct ctdb_iface_list_old *ifaces;
3351         struct ctdb_interface *cur;
3352
3353         /* count how many public ip structures we have */
3354         num = 0;
3355         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3356                 num++;
3357         }
3358
3359         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3360                 num*sizeof(struct ctdb_iface);
3361         ifaces = talloc_zero_size(outdata, len);
3362         CTDB_NO_MEMORY(ctdb, ifaces);
3363
3364         i = 0;
3365         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3366                 strcpy(ifaces->ifaces[i].name, cur->name);
3367                 ifaces->ifaces[i].link_state = cur->link_up;
3368                 ifaces->ifaces[i].references = cur->references;
3369                 i++;
3370         }
3371         ifaces->num = i;
3372         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3373                 i*sizeof(struct ctdb_iface);
3374
3375         outdata->dsize = len;
3376         outdata->dptr  = (uint8_t *)ifaces;
3377
3378         return 0;
3379 }
3380
3381 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3382                                     struct ctdb_req_control_old *c,
3383                                     TDB_DATA indata)
3384 {
3385         struct ctdb_iface *info;
3386         struct ctdb_interface *iface;
3387         bool link_up = false;
3388
3389         info = (struct ctdb_iface *)indata.dptr;
3390
3391         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3392                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3393                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3394                                   len, len, info->name));
3395                 return -1;
3396         }
3397
3398         switch (info->link_state) {
3399         case 0:
3400                 link_up = false;
3401                 break;
3402         case 1:
3403                 link_up = true;
3404                 break;
3405         default:
3406                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3407                                   (unsigned int)info->link_state));
3408                 return -1;
3409         }
3410
3411         if (info->references != 0) {
3412                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3413                                   (unsigned int)info->references));
3414                 return -1;
3415         }
3416
3417         iface = ctdb_find_iface(ctdb, info->name);
3418         if (iface == NULL) {
3419                 return -1;
3420         }
3421
3422         if (link_up == iface->link_up) {
3423                 return 0;
3424         }
3425
3426         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3427               ("iface[%s] has changed it's link status %s => %s\n",
3428                iface->name,
3429                iface->link_up?"up":"down",
3430                link_up?"up":"down"));
3431
3432         iface->link_up = link_up;
3433         return 0;
3434 }
3435
3436
3437 /* 
3438    structure containing the listening socket and the list of tcp connections
3439    that the ctdb daemon is to kill
3440 */
3441 struct ctdb_kill_tcp {
3442         struct ctdb_vnn *vnn;           /* public address this state belongs to; vnn->killtcp points back here */
3443         struct ctdb_context *ctdb;      /* daemon context; supplies the event loop and the vnn list */
3444         int capture_fd;                 /* capture socket, -1 until ctdb_sys_open_capture_socket() succeeds */
3445         struct tevent_fd *fde;          /* read event on capture_fd, serviced by capture_tcp_handler() */
3446         trbt_tree_t *connections;       /* struct ctdb_killtcp_con entries, keyed by killtcp_key() */
3447         void *private_data;             /* opaque value filled in by ctdb_sys_open_capture_socket()
                                              and passed on to ctdb_sys_read_tcp_packet() */
3448 };
3449
3450 /*
3451   a tcp connection that is to be killed
3452  */
3453 struct ctdb_killtcp_con {
3454         ctdb_sock_addr src_addr;        /* canonicalized source address of the connection */
3455         ctdb_sock_addr dst_addr;        /* canonicalized destination address of the connection */
3456         int count;                      /* tickles sent by the periodic traverse; given up once it reaches 5 */
3457         struct ctdb_kill_tcp *killtcp;  /* owning kill state (also the talloc parent at creation) */
3458 };
3459
3460 /* this function is used to create a key to represent this socketpair
3461    in the killtcp tree.
3462    this key is used to insert and lookup matching socketpairs that are
3463    to be tickled and RST
3464 */
3465 #define KILLTCP_KEYLEN  10
3466 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3467 {
3468         static uint32_t key[KILLTCP_KEYLEN];
3469
3470         bzero(key, sizeof(key));
3471
3472         if (src->sa.sa_family != dst->sa.sa_family) {
3473                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3474                 return key;
3475         }
3476         
3477         switch (src->sa.sa_family) {
3478         case AF_INET:
3479                 key[0]  = dst->ip.sin_addr.s_addr;
3480                 key[1]  = src->ip.sin_addr.s_addr;
3481                 key[2]  = dst->ip.sin_port;
3482                 key[3]  = src->ip.sin_port;
3483                 break;
3484         case AF_INET6: {
3485                 uint32_t *dst6_addr32 =
3486                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3487                 uint32_t *src6_addr32 =
3488                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3489                 key[0]  = dst6_addr32[3];
3490                 key[1]  = src6_addr32[3];
3491                 key[2]  = dst6_addr32[2];
3492                 key[3]  = src6_addr32[2];
3493                 key[4]  = dst6_addr32[1];
3494                 key[5]  = src6_addr32[1];
3495                 key[6]  = dst6_addr32[0];
3496                 key[7]  = src6_addr32[0];
3497                 key[8]  = dst->ip6.sin6_port;
3498                 key[9]  = src->ip6.sin6_port;
3499                 break;
3500         }
3501         default:
3502                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3503                 return key;
3504         }
3505
3506         return key;
3507 }
3508
3509 /*
3510   called when we get a read event on the raw socket
3511  */
3512 static void capture_tcp_handler(struct tevent_context *ev,
3513                                 struct tevent_fd *fde,
3514                                 uint16_t flags, void *private_data)
3515 {
3516         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3517         struct ctdb_killtcp_con *con;
3518         ctdb_sock_addr src, dst;
3519         uint32_t ack_seq, seq;
3520
3521         if (!(flags & TEVENT_FD_READ)) {
3522                 return;
3523         }
3524
3525         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3526                                 killtcp->private_data,
3527                                 &src, &dst,
3528                                 &ack_seq, &seq) != 0) {
3529                 /* probably a non-tcp ACK packet */
3530                 return;
3531         }
3532
3533         /* check if we have this guy in our list of connections
3534            to kill
3535         */
3536         con = trbt_lookuparray32(killtcp->connections, 
3537                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3538         if (con == NULL) {
3539                 /* no this was some other packet we can just ignore */
3540                 return;
3541         }
3542
3543         /* This one has been tickled !
3544            now reset him and remove him from the list.
3545          */
3546         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3547                 ntohs(con->dst_addr.ip.sin_port),
3548                 ctdb_addr_to_str(&con->src_addr),
3549                 ntohs(con->src_addr.ip.sin_port)));
3550
3551         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3552         talloc_free(con);
3553 }
3554
3555
3556 /* when traversing the list of all tcp connections to send tickle acks to
3557    (so that we can capture the ack coming back and kill the connection
3558     by a RST)
3559    this callback is called for each connection we are currently trying to kill
3560 */
3561 static int tickle_connection_traverse(void *param, void *data)
3562 {
3563         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3564
3565         /* have tried too many times, just give up */
3566         if (con->count >= 5) {
3567                 /* can't delete in traverse: reparent to delete_cons */
3568                 talloc_steal(param, con);
3569                 return 0;
3570         }
3571
3572         /* othervise, try tickling it again */
3573         con->count++;
3574         ctdb_sys_send_tcp(
3575                 (ctdb_sock_addr *)&con->dst_addr,
3576                 (ctdb_sock_addr *)&con->src_addr,
3577                 0, 0, 0);
3578         return 0;
3579 }
3580
3581
3582 /* 
3583    called every second until all sentenced connections have been reset
3584  */
3585 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3586                                               struct tevent_timer *te,
3587                                               struct timeval t, void *private_data)
3588 {
3589         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3590         void *delete_cons = talloc_new(NULL);
3591
3592         /* loop over all connections sending tickle ACKs */
3593         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3594
3595         /* now we've finished traverse, it's safe to do deletion. */
3596         talloc_free(delete_cons);
3597
3598         /* If there are no more connections to kill we can remove the
3599            entire killtcp structure
3600          */
3601         if ( (killtcp->connections == NULL) || 
3602              (killtcp->connections->root == NULL) ) {
3603                 talloc_free(killtcp);
3604                 return;
3605         }
3606
3607         /* try tickling them again in a seconds time
3608          */
3609         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3610                          timeval_current_ofs(1, 0),
3611                          ctdb_tickle_sentenced_connections, killtcp);
3612 }
3613
3614 /*
3615   destroy the killtcp structure
3616  */
3617 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3618 {
3619         struct ctdb_vnn *tmpvnn;
3620
3621         /* verify that this vnn is still active */
3622         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3623                 if (tmpvnn == killtcp->vnn) {
3624                         break;
3625                 }
3626         }
3627
3628         if (tmpvnn == NULL) {
3629                 return 0;
3630         }
3631
3632         if (killtcp->vnn->killtcp != killtcp) {
3633                 return 0;
3634         }
3635
3636         killtcp->vnn->killtcp = NULL;
3637
3638         return 0;
3639 }
3640
3641
/* Insert callback for the killtcp tree: the new connection structure
   (parm) unconditionally replaces whatever was stored before (data).

   The old structure is deliberately not freed here: it is
   talloc_stolen by the same tree node and goes away when the new data
   is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
3653
3654 /*
3655   add a tcp socket to the list of connections we want to RST
3656  */
3657 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3658                                        ctdb_sock_addr *s,
3659                                        ctdb_sock_addr *d)
3660 {
3661         ctdb_sock_addr src, dst;
3662         struct ctdb_kill_tcp *killtcp;
3663         struct ctdb_killtcp_con *con;
3664         struct ctdb_vnn *vnn;
3665
3666         ctdb_canonicalize_ip(s, &src);
3667         ctdb_canonicalize_ip(d, &dst);
3668
3669         vnn = find_public_ip_vnn(ctdb, &dst);
3670         if (vnn == NULL) {
3671                 vnn = find_public_ip_vnn(ctdb, &src);
3672         }
3673         if (vnn == NULL) {
3674                 /* if it is not a public ip   it could be our 'single ip' */
3675                 if (ctdb->single_ip_vnn) {
3676                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3677                                 vnn = ctdb->single_ip_vnn;
3678                         }
3679                 }
3680         }
3681         if (vnn == NULL) {
3682                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3683                 return -1;
3684         }
3685
3686         killtcp = vnn->killtcp;
3687         
3688         /* If this is the first connection to kill we must allocate
3689            a new structure
3690          */
3691         if (killtcp == NULL) {
3692                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3693                 CTDB_NO_MEMORY(ctdb, killtcp);
3694
3695                 killtcp->vnn         = vnn;
3696                 killtcp->ctdb        = ctdb;
3697                 killtcp->capture_fd  = -1;
3698                 killtcp->connections = trbt_create(killtcp, 0);
3699
3700                 vnn->killtcp         = killtcp;
3701                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3702         }
3703
3704
3705
3706         /* create a structure that describes this connection we want to
3707            RST and store it in killtcp->connections
3708         */
3709         con = talloc(killtcp, struct ctdb_killtcp_con);
3710         CTDB_NO_MEMORY(ctdb, con);
3711         con->src_addr = src;
3712         con->dst_addr = dst;
3713         con->count    = 0;
3714         con->killtcp  = killtcp;
3715
3716
3717         trbt_insertarray32_callback(killtcp->connections,
3718                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3719                         add_killtcp_callback, con);
3720
3721         /* 
3722            If we don't have a socket to listen on yet we must create it
3723          */
3724         if (killtcp->capture_fd == -1) {
3725                 const char *iface = ctdb_vnn_iface_string(vnn);
3726                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3727                 if (killtcp->capture_fd == -1) {
3728                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3729                                           "socket on iface '%s' for killtcp (%s)\n",
3730                                           iface, strerror(errno)));
3731                         goto failed;
3732                 }
3733         }
3734
3735
3736         if (killtcp->fde == NULL) {
3737                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3738                                              killtcp->capture_fd,
3739                                              TEVENT_FD_READ,
3740                                              capture_tcp_handler, killtcp);
3741                 tevent_fd_set_auto_close(killtcp->fde);
3742
3743                 /* We also need to set up some events to tickle all these connections
3744                    until they are all reset
3745                 */
3746                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3747                                  ctdb_tickle_sentenced_connections, killtcp);
3748         }
3749
3750         /* tickle him once now */
3751         ctdb_sys_send_tcp(
3752                 &con->dst_addr,
3753                 &con->src_addr,
3754                 0, 0, 0);
3755
3756         return 0;
3757
3758 failed:
3759         talloc_free(vnn->killtcp);
3760         vnn->killtcp = NULL;
3761         return -1;
3762 }
3763
3764 /*
3765   kill a TCP connection.
3766  */
3767 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3768 {
3769         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3770
3771         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3772 }
3773
3774 /*
3775   called by a daemon to inform us of the entire list of TCP tickles for
3776   a particular public address.
3777   this control should only be sent by the node that is currently serving
3778   that public address.
3779  */
3780 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3781 {
3782         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3783         struct ctdb_tcp_array *tcparray;
3784         struct ctdb_vnn *vnn;
3785
3786         /* We must at least have tickles.num or else we cant verify the size
3787            of the received data blob
3788          */
3789         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3790                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3791                 return -1;
3792         }
3793
3794         /* verify that the size of data matches what we expect */
3795         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3796                          + sizeof(struct ctdb_connection) * list->num) {
3797                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3798                 return -1;
3799         }
3800
3801         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3802                            ctdb_addr_to_str(&list->addr)));
3803
3804         vnn = find_public_ip_vnn(ctdb, &list->addr);
3805         if (vnn == NULL) {
3806                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3807                         ctdb_addr_to_str(&list->addr)));
3808
3809                 return 1;
3810         }
3811
3812         /* remove any old ticklelist we might have */
3813         talloc_free(vnn->tcp_array);
3814         vnn->tcp_array = NULL;
3815
3816         tcparray = talloc(vnn, struct ctdb_tcp_array);
3817         CTDB_NO_MEMORY(ctdb, tcparray);
3818
3819         tcparray->num = list->num;
3820
3821         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3822         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3823
3824         memcpy(tcparray->connections, &list->connections[0],
3825                sizeof(struct ctdb_connection)*tcparray->num);
3826
3827         /* We now have a new fresh tickle list array for this vnn */
3828         vnn->tcp_array = tcparray;
3829
3830         return 0;
3831 }
3832
3833 /*
3834   called to return the full list of tickles for the puclic address associated 
3835   with the provided vnn
3836  */
3837 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3838 {
3839         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3840         struct ctdb_tickle_list_old *list;
3841         struct ctdb_tcp_array *tcparray;
3842         int num;
3843         struct ctdb_vnn *vnn;
3844
3845         vnn = find_public_ip_vnn(ctdb, addr);
3846         if (vnn == NULL) {
3847                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3848                         ctdb_addr_to_str(addr)));
3849
3850                 return 1;
3851         }
3852
3853         tcparray = vnn->tcp_array;
3854         if (tcparray) {
3855                 num = tcparray->num;
3856         } else {
3857                 num = 0;
3858         }
3859
3860         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3861                         + sizeof(struct ctdb_connection) * num;
3862
3863         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3864         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3865         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3866
3867         list->addr = *addr;
3868         list->num = num;
3869         if (num) {
3870                 memcpy(&list->connections[0], tcparray->connections,
3871                         sizeof(struct ctdb_connection) * num);
3872         }
3873
3874         return 0;
3875 }
3876
3877
3878 /*
3879   set the list of all tcp tickles for a public address
3880  */
3881 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3882                                             ctdb_sock_addr *addr,
3883                                             struct ctdb_tcp_array *tcparray)
3884 {
3885         int ret, num;
3886         TDB_DATA data;
3887         struct ctdb_tickle_list_old *list;
3888
3889         if (tcparray) {
3890                 num = tcparray->num;
3891         } else {
3892                 num = 0;
3893         }
3894
3895         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3896                         sizeof(struct ctdb_connection) * num;
3897         data.dptr = talloc_size(ctdb, data.dsize);
3898         CTDB_NO_MEMORY(ctdb, data.dptr);
3899
3900         list = (struct ctdb_tickle_list_old *)data.dptr;
3901         list->addr = *addr;
3902         list->num = num;
3903         if (tcparray) {
3904                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3905         }
3906
3907         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3908                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3909                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3910         if (ret != 0) {
3911                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3912                 return -1;
3913         }
3914
3915         talloc_free(data.dptr);
3916
3917         return ret;
3918 }
3919
3920
3921 /*
3922   perform tickle updates if required
3923  */
3924 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3925                                     struct tevent_timer *te,
3926                                     struct timeval t, void *private_data)
3927 {
3928         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3929         int ret;
3930         struct ctdb_vnn *vnn;
3931
3932         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3933                 /* we only send out updates for public addresses that 
3934                    we have taken over
3935                  */
3936                 if (ctdb->pnn != vnn->pnn) {
3937                         continue;
3938                 }
3939                 /* We only send out the updates if we need to */
3940                 if (!vnn->tcp_update_needed) {
3941                         continue;
3942                 }
3943                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3944                                                        &vnn->public_address,
3945                                                        vnn->tcp_array);
3946                 if (ret != 0) {
3947                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3948                                 ctdb_addr_to_str(&vnn->public_address)));
3949                 } else {
3950                         DEBUG(DEBUG_INFO,
3951                               ("Sent tickle update for public address %s\n",
3952                                ctdb_addr_to_str(&vnn->public_address)));
3953                         vnn->tcp_update_needed = false;
3954                 }
3955         }
3956
3957         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3958                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3959                          ctdb_update_tcp_tickles, ctdb);
3960 }
3961
3962 /*
3963   start periodic update of tcp tickles
3964  */
3965 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3966 {
3967         ctdb->tickle_update_context = talloc_new(ctdb);
3968
3969         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3970                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3971                          ctdb_update_tcp_tickles, ctdb);
3972 }
3973
3974
3975
3976
3977 struct control_gratious_arp {
3978         struct ctdb_context *ctdb;
3979         ctdb_sock_addr addr;
3980         const char *iface;
3981         int count;
3982 };
3983
3984 /*
3985   send a control_gratuitous arp
3986  */
3987 static void send_gratious_arp(struct tevent_context *ev,
3988                               struct tevent_timer *te,
3989                               struct timeval t, void *private_data)
3990 {
3991         int ret;
3992         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3993                                                         struct control_gratious_arp);
3994
3995         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3996         if (ret != 0) {
3997                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3998                                  arp->iface, strerror(errno)));
3999         }
4000
4001
4002         arp->count++;
4003         if (arp->count == CTDB_ARP_REPEAT) {
4004                 talloc_free(arp);
4005                 return;
4006         }
4007
4008         tevent_add_timer(arp->ctdb->ev, arp,
4009                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4010                          send_gratious_arp, arp);
4011 }
4012
4013
4014 /*
4015   send a gratious arp 
4016  */
4017 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4018 {
4019         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4020         struct control_gratious_arp *arp;
4021
4022         /* verify the size of indata */
4023         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4024                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4025                                  (unsigned)indata.dsize, 
4026                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4027                 return -1;
4028         }
4029         if (indata.dsize != 
4030                 ( offsetof(struct ctdb_addr_info_old, iface)
4031                 + gratious_arp->len ) ){
4032
4033                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4034                         "but should be %u bytes\n", 
4035                          (unsigned)indata.dsize, 
4036                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4037                 return -1;
4038         }
4039
4040
4041         arp = talloc(ctdb, struct control_gratious_arp);
4042         CTDB_NO_MEMORY(ctdb, arp);
4043
4044         arp->ctdb  = ctdb;
4045         arp->addr   = gratious_arp->addr;
4046         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4047         CTDB_NO_MEMORY(ctdb, arp->iface);
4048         arp->count = 0;
4049
4050         tevent_add_timer(arp->ctdb->ev, arp,
4051                          timeval_zero(), send_gratious_arp, arp);
4052
4053         return 0;
4054 }
4055
4056 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4057 {
4058         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4059         int ret;
4060
4061         /* verify the size of indata */
4062         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4063                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4064                 return -1;
4065         }
4066         if (indata.dsize != 
4067                 ( offsetof(struct ctdb_addr_info_old, iface)
4068                 + pub->len ) ){
4069
4070                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4071                         "but should be %u bytes\n", 
4072                          (unsigned)indata.dsize, 
4073                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4074                 return -1;
4075         }
4076
4077         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4078
4079         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4080
4081         if (ret != 0) {
4082                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4083                 return -1;
4084         }
4085
4086         return 0;
4087 }
4088
/* state carried from ctdb_control_del_public_address() to delete_ip_callback() */
struct delete_ip_callback_state {
        /* the original control request, replied to from the callback */
        struct ctdb_req_control_old *c;
};
4092
4093 /*
4094   called when releaseip event finishes for del_public_address
4095  */
4096 static void delete_ip_callback(struct ctdb_context *ctdb,
4097                                int32_t status, TDB_DATA data,
4098                                const char *errormsg,
4099                                void *private_data)
4100 {
4101         struct delete_ip_callback_state *state =
4102                 talloc_get_type(private_data, struct delete_ip_callback_state);
4103
4104         /* If release failed then fail. */
4105         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4106         talloc_free(private_data);
4107 }
4108
4109 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4110                                         struct ctdb_req_control_old *c,
4111                                         TDB_DATA indata, bool *async_reply)
4112 {
4113         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4114         struct ctdb_vnn *vnn;
4115
4116         /* verify the size of indata */
4117         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4118                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4119                 return -1;
4120         }
4121         if (indata.dsize != 
4122                 ( offsetof(struct ctdb_addr_info_old, iface)
4123                 + pub->len ) ){
4124
4125                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4126                         "but should be %u bytes\n", 
4127                          (unsigned)indata.dsize, 
4128                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4129                 return -1;
4130         }
4131
4132         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4133
4134         /* walk over all public addresses until we find a match */
4135         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4136                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4137                         if (vnn->pnn == ctdb->pnn) {
4138                                 struct delete_ip_callback_state *state;
4139                                 struct ctdb_public_ip *ip;
4140                                 TDB_DATA data;
4141                                 int ret;
4142
4143                                 vnn->delete_pending = true;
4144
4145                                 state = talloc(ctdb,
4146                                                struct delete_ip_callback_state);
4147                                 CTDB_NO_MEMORY(ctdb, state);
4148                                 state->c = c;
4149
4150                                 ip = talloc(state, struct ctdb_public_ip);
4151                                 if (ip == NULL) {
4152                                         DEBUG(DEBUG_ERR,
4153                                               (__location__ " Out of memory\n"));
4154                                         talloc_free(state);
4155                                         return -1;
4156                                 }
4157                                 ip->pnn = -1;
4158                                 ip->addr = pub->addr;
4159
4160                                 data.dsize = sizeof(struct ctdb_public_ip);
4161                                 data.dptr = (unsigned char *)ip;
4162
4163                                 ret = ctdb_daemon_send_control(ctdb,
4164                                                                ctdb_get_pnn(ctdb),
4165                                                                0,
4166                                                                CTDB_CONTROL_RELEASE_IP,
4167                                                                0, 0,
4168                                                                data,
4169                                                                delete_ip_callback,
4170                                                                state);
4171                                 if (ret == -1) {
4172                                         DEBUG(DEBUG_ERR,
4173                                               (__location__ "Unable to send "
4174                                                "CTDB_CONTROL_RELEASE_IP\n"));
4175                                         talloc_free(state);
4176                                         return -1;
4177                                 }
4178
4179                                 state->c = talloc_steal(state, c);
4180                                 *async_reply = true;
4181                         } else {
4182                                 /* This IP is not hosted on the
4183                                  * current node so just delete it
4184                                  * now. */
4185                                 do_delete_ip(ctdb, vnn);
4186                         }
4187
4188                         return 0;
4189                 }
4190         }
4191
4192         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4193                          ctdb_addr_to_str(&pub->addr)));
4194         return -1;
4195 }
4196
4197
/* state carried from ctdb_control_ipreallocated() to its event callback */
struct ipreallocated_callback_state {
        /* the control request to reply to once the event has finished */
        struct ctdb_req_control_old *c;
};
4201
4202 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4203                                         int status, void *p)
4204 {
4205         struct ipreallocated_callback_state *state =
4206                 talloc_get_type(p, struct ipreallocated_callback_state);
4207
4208         if (status != 0) {
4209                 DEBUG(DEBUG_ERR,
4210                       (" \"ipreallocated\" event script failed (status %d)\n",
4211                        status));
4212                 if (status == -ETIME) {
4213                         ctdb_ban_self(ctdb);
4214                 }
4215         }
4216
4217         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4218         talloc_free(state);
4219 }
4220
4221 /* A control to run the ipreallocated event */
4222 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4223                                    struct ctdb_req_control_old *c,
4224                                    bool *async_reply)
4225 {
4226         int ret;
4227         struct ipreallocated_callback_state *state;
4228
4229         state = talloc(ctdb, struct ipreallocated_callback_state);
4230         CTDB_NO_MEMORY(ctdb, state);
4231
4232         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4233
4234         ret = ctdb_event_script_callback(ctdb, state,
4235                                          ctdb_ipreallocated_callback, state,
4236                                          CTDB_EVENT_IPREALLOCATED,
4237                                          "%s", "");
4238
4239         if (ret != 0) {
4240                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4241                 talloc_free(state);
4242                 return -1;
4243         }
4244
4245         /* tell the control that we will be reply asynchronously */
4246         state->c    = talloc_steal(state, c);
4247         *async_reply = true;
4248
4249         return 0;
4250 }
4251
4252
4253 /* This function is called from the recovery daemon to verify that a remote
4254    node has the expected ip allocation.
4255    This is verified against ctdb->ip_tree
4256 */
4257 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4258                                        struct ctdb_public_ip_list_old *ips,
4259                                        uint32_t pnn)
4260 {
4261         struct public_ip_list *tmp_ip;
4262         int i;
4263
4264         if (ctdb->ip_tree == NULL) {
4265                 /* don't know the expected allocation yet, assume remote node
4266                    is correct. */
4267                 return 0;
4268         }
4269
4270         if (ips == NULL) {
4271                 return 0;
4272         }
4273
4274         for (i=0; i<ips->num; i++) {
4275                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4276                 if (tmp_ip == NULL) {
4277                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4278                         return -1;
4279                 }
4280
4281                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4282                         continue;
4283                 }
4284
4285                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4286                         DEBUG(DEBUG_ERR,
4287                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4288                                pnn,
4289                                ctdb_addr_to_str(&ips->ips[i].addr),
4290                                ips->ips[i].pnn, tmp_ip->pnn));
4291                         return -1;
4292                 }
4293         }
4294
4295         return 0;
4296 }
4297
4298 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4299 {
4300         struct public_ip_list *tmp_ip;
4301
4302         /* IP tree is never built if DisableIPFailover is set */
4303         if (ctdb->tunable.disable_ip_failover != 0) {
4304                 return 0;
4305         }
4306
4307         if (ctdb->ip_tree == NULL) {
4308                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4309                 return -1;
4310         }
4311
4312         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4313         if (tmp_ip == NULL) {
4314                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4315                 return -1;
4316         }
4317
4318         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4319         tmp_ip->pnn = ip->pnn;
4320
4321         return 0;
4322 }
4323
4324 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4325 {
4326         TALLOC_FREE(ctdb->ip_tree);
4327 }
4328
/* state for one in-flight "reload public ips" request */
struct ctdb_reloadips_handle {
        /* daemon context */
        struct ctdb_context *ctdb;
        /* pending control request; replied to from the destructor */
        struct ctdb_req_control_old *c;
        /* status sent in the reply */
        int status;
        /* pipe between daemon and child: the daemon reads fd[0],
         * the child writes its result byte to fd[1] */
        int fd[2];
        /* pid of the forked reloadips child */
        pid_t child;
        /* fd event watching fd[0] for the child's result */
        struct tevent_fd *fde;
};
4337
4338 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4339 {
4340         if (h == h->ctdb->reload_ips) {
4341                 h->ctdb->reload_ips = NULL;
4342         }
4343         if (h->c != NULL) {
4344                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4345                 h->c = NULL;
4346         }
4347         ctdb_kill(h->ctdb, h->child, SIGKILL);
4348         return 0;
4349 }
4350
4351 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4352                                          struct tevent_timer *te,
4353                                          struct timeval t, void *private_data)
4354 {
4355         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4356
4357         talloc_free(h);
4358 }
4359
4360 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4361                                          struct tevent_fd *fde,
4362                                          uint16_t flags, void *private_data)
4363 {
4364         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4365
4366         char res;
4367         int ret;
4368
4369         ret = sys_read(h->fd[0], &res, 1);
4370         if (ret < 1 || res != 0) {
4371                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4372                 res = 1;
4373         }
4374         h->status = res;
4375
4376         talloc_free(h);
4377 }
4378
4379 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4380 {
4381         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4382         struct ctdb_public_ip_list_old *ips;
4383         struct ctdb_vnn *vnn;
4384         struct client_async_data *async_data;
4385         struct timeval timeout;
4386         TDB_DATA data;
4387         struct ctdb_client_control_state *state;
4388         bool first_add;
4389         int i, ret;
4390
4391         CTDB_NO_MEMORY(ctdb, mem_ctx);
4392
4393         /* Read IPs from local node */
4394         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4395                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4396         if (ret != 0) {
4397                 DEBUG(DEBUG_ERR,
4398                       ("Unable to fetch public IPs from local node\n"));
4399                 talloc_free(mem_ctx);
4400                 return -1;
4401         }
4402
4403         /* Read IPs file - this is safe since this is a child process */
4404         ctdb->vnn = NULL;
4405         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4406                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4407                 talloc_free(mem_ctx);
4408                 return -1;
4409         }
4410
4411         async_data = talloc_zero(mem_ctx, struct client_async_data);
4412         CTDB_NO_MEMORY(ctdb, async_data);
4413
4414         /* Compare IPs between node and file for IPs to be deleted */
4415         for (i = 0; i < ips->num; i++) {
4416                 /* */
4417                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4418                         if (ctdb_same_ip(&vnn->public_address,
4419                                          &ips->ips[i].addr)) {
4420                                 /* IP is still in file */
4421                                 break;
4422                         }
4423                 }
4424
4425                 if (vnn == NULL) {
4426                         /* Delete IP ips->ips[i] */
4427                         struct ctdb_addr_info_old *pub;
4428
4429                         DEBUG(DEBUG_NOTICE,
4430                               ("IP %s no longer configured, deleting it\n",
4431                                ctdb_addr_to_str(&ips->ips[i].addr)));
4432
4433                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4434                         CTDB_NO_MEMORY(ctdb, pub);
4435
4436                         pub->addr  = ips->ips[i].addr;
4437                         pub->mask  = 0;
4438                         pub->len   = 0;
4439
4440                         timeout = TAKEOVER_TIMEOUT();
4441
4442                         data.dsize = offsetof(struct ctdb_addr_info_old,
4443                                               iface) + pub->len;
4444                         data.dptr = (uint8_t *)pub;
4445
4446                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4447                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4448                                                   0, data, async_data,
4449                                                   &timeout, NULL);
4450                         if (state == NULL) {
4451                                 DEBUG(DEBUG_ERR,
4452                                       (__location__
4453                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4454                                 goto failed;
4455                         }
4456
4457                         ctdb_client_async_add(async_data, state);
4458                 }
4459         }
4460
4461         /* Compare IPs between node and file for IPs to be added */
4462         first_add = true;
4463         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4464                 for (i = 0; i < ips->num; i++) {
4465                         if (ctdb_same_ip(&vnn->public_address,
4466                                          &ips->ips[i].addr)) {
4467                                 /* IP already on node */
4468                                 break;
4469                         }
4470                 }
4471                 if (i == ips->num) {
4472                         /* Add IP ips->ips[i] */
4473                         struct ctdb_addr_info_old *pub;
4474                         const char *ifaces = NULL;
4475                         uint32_t len;
4476                         int iface = 0;
4477
4478                         DEBUG(DEBUG_NOTICE,
4479                               ("New IP %s configured, adding it\n",
4480                                ctdb_addr_to_str(&vnn->public_address)));
4481                         if (first_add) {
4482                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4483
4484                                 data.dsize = sizeof(pnn);
4485                                 data.dptr  = (uint8_t *)&pnn;
4486
4487                                 ret = ctdb_client_send_message(
4488                                         ctdb,
4489                                         CTDB_BROADCAST_CONNECTED,
4490                                         CTDB_SRVID_REBALANCE_NODE,
4491                                         data);
4492                                 if (ret != 0) {
4493                                         DEBUG(DEBUG_WARNING,
4494                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4495                                 }
4496
4497                                 first_add = false;
4498                         }
4499
4500                         ifaces = vnn->ifaces[0];
4501                         iface = 1;
4502                         while (vnn->ifaces[iface] != NULL) {
4503                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4504                                                          vnn->ifaces[iface]);
4505                                 iface++;
4506                         }
4507
4508                         len   = strlen(ifaces) + 1;
4509                         pub = talloc_zero_size(mem_ctx,
4510                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4511                         CTDB_NO_MEMORY(ctdb, pub);
4512
4513                         pub->addr  = vnn->public_address;
4514                         pub->mask  = vnn->public_netmask_bits;
4515                         pub->len   = len;
4516                         memcpy(&pub->iface[0], ifaces, pub->len);
4517
4518                         timeout = TAKEOVER_TIMEOUT();
4519
4520                         data.dsize = offsetof(struct ctdb_addr_info_old,
4521                                               iface) + pub->len;
4522                         data.dptr = (uint8_t *)pub;
4523
4524                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4525                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4526                                                   0, data, async_data,
4527                                                   &timeout, NULL);
4528                         if (state == NULL) {
4529                                 DEBUG(DEBUG_ERR,
4530                                       (__location__
4531                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4532                                 goto failed;
4533                         }
4534
4535                         ctdb_client_async_add(async_data, state);
4536                 }
4537         }
4538
4539         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4540                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4541                 goto failed;
4542         }
4543
4544         talloc_free(mem_ctx);
4545         return 0;
4546
4547 failed:
4548         talloc_free(mem_ctx);
4549         return -1;
4550 }
4551
4552 /* This control is sent to force the node to re-read the public addresses file
4553    and drop any addresses we should nnot longer host, and add new addresses
4554    that we are now able to host
4555 */
4556 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4557 {
4558         struct ctdb_reloadips_handle *h;
4559         pid_t parent = getpid();
4560
4561         if (ctdb->reload_ips != NULL) {
4562                 talloc_free(ctdb->reload_ips);
4563                 ctdb->reload_ips = NULL;
4564         }
4565
4566         h = talloc(ctdb, struct ctdb_reloadips_handle);
4567         CTDB_NO_MEMORY(ctdb, h);
4568         h->ctdb     = ctdb;
4569         h->c        = NULL;
4570         h->status   = -1;
4571         
4572         if (pipe(h->fd) == -1) {
4573                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4574                 talloc_free(h);
4575                 return -1;
4576         }
4577
4578         h->child = ctdb_fork(ctdb);
4579         if (h->child == (pid_t)-1) {
4580                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4581                 close(h->fd[0]);
4582                 close(h->fd[1]);
4583                 talloc_free(h);
4584                 return -1;
4585         }
4586
4587         /* child process */
4588         if (h->child == 0) {
4589                 signed char res = 0;
4590
4591                 close(h->fd[0]);
4592                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4593
4594                 prctl_set_comment("ctdb_reloadips");
4595                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4596                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4597                         res = -1;
4598                 } else {
4599                         res = ctdb_reloadips_child(ctdb);
4600                         if (res != 0) {
4601                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4602                         }
4603                 }
4604
4605                 sys_write(h->fd[1], &res, 1);
4606                 /* make sure we die when our parent dies */
4607                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4608                         sleep(5);
4609                 }
4610                 _exit(0);
4611         }
4612
4613         h->c             = talloc_steal(h, c);
4614
4615         close(h->fd[1]);
4616         set_close_on_exec(h->fd[0]);
4617
4618         talloc_set_destructor(h, ctdb_reloadips_destructor);
4619
4620
4621         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4622                                ctdb_reloadips_child_handler, (void *)h);
4623         tevent_fd_set_auto_close(h->fde);
4624
4625         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4626                          ctdb_reloadips_timeout_event, h);
4627
4628         /* we reply later */
4629         *async_reply = true;
4630         return 0;
4631 }