227bd162d441c8d89504e4c4e8b3ace3120c90d4
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/* Timeout value for takeover operations, built from the takeover_timeout
 * tunable.  The expansion reads a variable named `ctdb`, which must be in
 * scope wherever the macro is used. */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* Gratuitous ARP announcements: seconds component of the delay between
 * repeats, and the total number of rounds sent per announcement. */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
49
/* Identifies which IP allocation algorithm is selected
 * (stored in struct ipalloc_state). */
enum ipalloc_algorithm {
	IPALLOC_DETERMINISTIC    = 0,
	IPALLOC_NONDETERMINISTIC = 1,
	IPALLOC_LCP2             = 2,
};
56
57 struct ipalloc_state {
58         uint32_t num;
59
60         /* Arrays with data for each node */
61         struct ctdb_public_ip_list_old **known_public_ips;
62         struct ctdb_public_ip_list_old **available_public_ips;
63         bool *noiptakeover;
64         bool *noiphost;
65
66         struct public_ip_list *all_ips;
67         enum ipalloc_algorithm algorithm;
68         uint32_t no_ip_failback;
69         uint32_t *force_rebalance_nodes;
70 };
71
/* One local network interface known to the daemon; entries are linked
 * into the ctdb->ifaces list. */
struct ctdb_interface {
	/* list linkage */
	struct ctdb_interface *prev;
	struct ctdb_interface *next;

	/* interface name, matched with strcmp() against vnn->ifaces[] */
	const char *name;

	/* interfaces with link_up == false are skipped when choosing
	 * an interface for a public address */
	bool link_up;

	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
78
79 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
80 {
81         if (vnn->iface) {
82                 return vnn->iface->name;
83         }
84
85         return "__none__";
86 }
87
88 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
89 {
90         struct ctdb_interface *i;
91
92         /* Verify that we don't have an entry for this ip yet */
93         for (i=ctdb->ifaces;i;i=i->next) {
94                 if (strcmp(i->name, iface) == 0) {
95                         return 0;
96                 }
97         }
98
99         /* create a new structure for this interface */
100         i = talloc_zero(ctdb, struct ctdb_interface);
101         CTDB_NO_MEMORY_FATAL(ctdb, i);
102         i->name = talloc_strdup(i, iface);
103         CTDB_NO_MEMORY(ctdb, i->name);
104
105         i->link_up = true;
106
107         DLIST_ADD(ctdb->ifaces, i);
108
109         return 0;
110 }
111
112 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
113                                         const char *name)
114 {
115         int n;
116
117         for (n = 0; vnn->ifaces[n] != NULL; n++) {
118                 if (strcmp(name, vnn->ifaces[n]) == 0) {
119                         return true;
120                 }
121         }
122
123         return false;
124 }
125
126 /* If any interfaces now have no possible IPs then delete them.  This
127  * implementation is naive (i.e. simple) rather than clever
128  * (i.e. complex).  Given that this is run on delip and that operation
129  * is rare, this doesn't need to be efficient - it needs to be
130  * foolproof.  One alternative is reference counting, where the logic
131  * is distributed and can, therefore, be broken in multiple places.
132  * Another alternative is to build a red-black tree of interfaces that
133  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
134  * once) and then walking ctdb->ifaces once and deleting those not in
135  * the tree.  Let's go to one of those if the naive implementation
136  * causes problems...  :-)
137  */
138 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
139                                         struct ctdb_vnn *vnn)
140 {
141         struct ctdb_interface *i, *next;
142
143         /* For each interface, check if there's an IP using it. */
144         for (i = ctdb->ifaces; i != NULL; i = next) {
145                 struct ctdb_vnn *tv;
146                 bool found;
147                 next = i->next;
148
149                 /* Only consider interfaces named in the given VNN. */
150                 if (!vnn_has_interface_with_name(vnn, i->name)) {
151                         continue;
152                 }
153
154                 /* Is the "single IP" on this interface? */
155                 if ((ctdb->single_ip_vnn != NULL) &&
156                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
157                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
158                         /* Found, next interface please... */
159                         continue;
160                 }
161                 /* Search for a vnn with this interface. */
162                 found = false;
163                 for (tv=ctdb->vnn; tv; tv=tv->next) {
164                         if (vnn_has_interface_with_name(tv, i->name)) {
165                                 found = true;
166                                 break;
167                         }
168                 }
169
170                 if (!found) {
171                         /* None of the VNNs are using this interface. */
172                         DLIST_REMOVE(ctdb->ifaces, i);
173                         talloc_free(i);
174                 }
175         }
176 }
177
178
179 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
180                                               const char *iface)
181 {
182         struct ctdb_interface *i;
183
184         for (i=ctdb->ifaces;i;i=i->next) {
185                 if (strcmp(i->name, iface) == 0) {
186                         return i;
187                 }
188         }
189
190         return NULL;
191 }
192
193 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
194                                                   struct ctdb_vnn *vnn)
195 {
196         int i;
197         struct ctdb_interface *cur = NULL;
198         struct ctdb_interface *best = NULL;
199
200         for (i=0; vnn->ifaces[i]; i++) {
201
202                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
203                 if (cur == NULL) {
204                         continue;
205                 }
206
207                 if (!cur->link_up) {
208                         continue;
209                 }
210
211                 if (best == NULL) {
212                         best = cur;
213                         continue;
214                 }
215
216                 if (cur->references < best->references) {
217                         best = cur;
218                         continue;
219                 }
220         }
221
222         return best;
223 }
224
225 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
226                                      struct ctdb_vnn *vnn)
227 {
228         struct ctdb_interface *best = NULL;
229
230         if (vnn->iface) {
231                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
232                                    "still assigned to iface '%s'\n",
233                                    ctdb_addr_to_str(&vnn->public_address),
234                                    ctdb_vnn_iface_string(vnn)));
235                 return 0;
236         }
237
238         best = ctdb_vnn_best_iface(ctdb, vnn);
239         if (best == NULL) {
240                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
241                                   "cannot assign to iface any iface\n",
242                                   ctdb_addr_to_str(&vnn->public_address)));
243                 return -1;
244         }
245
246         vnn->iface = best;
247         best->references++;
248         vnn->pnn = ctdb->pnn;
249
250         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
251                            "now assigned to iface '%s' refs[%d]\n",
252                            ctdb_addr_to_str(&vnn->public_address),
253                            ctdb_vnn_iface_string(vnn),
254                            best->references));
255         return 0;
256 }
257
258 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
259                                     struct ctdb_vnn *vnn)
260 {
261         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
262                            "now unassigned (old iface '%s' refs[%d])\n",
263                            ctdb_addr_to_str(&vnn->public_address),
264                            ctdb_vnn_iface_string(vnn),
265                            vnn->iface?vnn->iface->references:0));
266         if (vnn->iface) {
267                 vnn->iface->references--;
268         }
269         vnn->iface = NULL;
270         if (vnn->pnn == ctdb->pnn) {
271                 vnn->pnn = -1;
272         }
273 }
274
275 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
276                                struct ctdb_vnn *vnn)
277 {
278         int i;
279
280         /* Nodes that are not RUNNING can not host IPs */
281         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
282                 return false;
283         }
284
285         if (vnn->delete_pending) {
286                 return false;
287         }
288
289         if (vnn->iface && vnn->iface->link_up) {
290                 return true;
291         }
292
293         for (i=0; vnn->ifaces[i]; i++) {
294                 struct ctdb_interface *cur;
295
296                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
297                 if (cur == NULL) {
298                         continue;
299                 }
300
301                 if (cur->link_up) {
302                         return true;
303                 }
304         }
305
306         return false;
307 }
308
309 struct ctdb_takeover_arp {
310         struct ctdb_context *ctdb;
311         uint32_t count;
312         ctdb_sock_addr addr;
313         struct ctdb_tcp_array *tcparray;
314         struct ctdb_vnn *vnn;
315 };
316
317
318 /*
319   lists of tcp endpoints
320  */
321 struct ctdb_tcp_list {
322         struct ctdb_tcp_list *prev, *next;
323         struct ctdb_connection connection;
324 };
325
326 /*
327   list of clients to kill on IP release
328  */
329 struct ctdb_client_ip {
330         struct ctdb_client_ip *prev, *next;
331         struct ctdb_context *ctdb;
332         ctdb_sock_addr addr;
333         uint32_t client_id;
334 };
335
336
337 /*
338   send a gratuitous arp
339  */
340 static void ctdb_control_send_arp(struct tevent_context *ev,
341                                   struct tevent_timer *te,
342                                   struct timeval t, void *private_data)
343 {
344         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
345                                                         struct ctdb_takeover_arp);
346         int i, ret;
347         struct ctdb_tcp_array *tcparray;
348         const char *iface = ctdb_vnn_iface_string(arp->vnn);
349
350         ret = ctdb_sys_send_arp(&arp->addr, iface);
351         if (ret != 0) {
352                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
353                                   iface, strerror(errno)));
354         }
355
356         tcparray = arp->tcparray;
357         if (tcparray) {
358                 for (i=0;i<tcparray->num;i++) {
359                         struct ctdb_connection *tcon;
360
361                         tcon = &tcparray->connections[i];
362                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
363                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
364                                 ctdb_addr_to_str(&tcon->src),
365                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
366                         ret = ctdb_sys_send_tcp(
367                                 &tcon->src,
368                                 &tcon->dst,
369                                 0, 0, 0);
370                         if (ret != 0) {
371                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
372                                         ctdb_addr_to_str(&tcon->src)));
373                         }
374                 }
375         }
376
377         arp->count++;
378
379         if (arp->count == CTDB_ARP_REPEAT) {
380                 talloc_free(arp);
381                 return;
382         }
383
384         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
385                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
386                          ctdb_control_send_arp, arp);
387 }
388
389 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
390                                        struct ctdb_vnn *vnn)
391 {
392         struct ctdb_takeover_arp *arp;
393         struct ctdb_tcp_array *tcparray;
394
395         if (!vnn->takeover_ctx) {
396                 vnn->takeover_ctx = talloc_new(vnn);
397                 if (!vnn->takeover_ctx) {
398                         return -1;
399                 }
400         }
401
402         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
403         if (!arp) {
404                 return -1;
405         }
406
407         arp->ctdb = ctdb;
408         arp->addr = vnn->public_address;
409         arp->vnn  = vnn;
410
411         tcparray = vnn->tcp_array;
412         if (tcparray) {
413                 /* add all of the known tcp connections for this IP to the
414                    list of tcp connections to send tickle acks for */
415                 arp->tcparray = talloc_steal(arp, tcparray);
416
417                 vnn->tcp_array = NULL;
418                 vnn->tcp_update_needed = true;
419         }
420
421         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
422                          timeval_zero(), ctdb_control_send_arp, arp);
423
424         return 0;
425 }
426
427 struct takeover_callback_state {
428         struct ctdb_req_control_old *c;
429         ctdb_sock_addr *addr;
430         struct ctdb_vnn *vnn;
431 };
432
/* Context carried from ctdb_do_takeip() to ctdb_do_takeip_callback() */
struct ctdb_do_takeip_state {
	/* control to reply to once the takeip event has finished */
	struct ctdb_req_control_old *c;
	/* public address being taken over */
	struct ctdb_vnn *vnn;
};
437
438 /*
439   called when takeip event finishes
440  */
441 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
442                                     void *private_data)
443 {
444         struct ctdb_do_takeip_state *state =
445                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
446         int32_t ret;
447         TDB_DATA data;
448
449         if (status != 0) {
450                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
451         
452                 if (status == -ETIME) {
453                         ctdb_ban_self(ctdb);
454                 }
455                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
456                                  ctdb_addr_to_str(&state->vnn->public_address),
457                                  ctdb_vnn_iface_string(state->vnn)));
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459
460                 node->flags |= NODE_FLAGS_UNHEALTHY;
461                 talloc_free(state);
462                 return;
463         }
464
465         if (ctdb->do_checkpublicip) {
466
467         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
468         if (ret != 0) {
469                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
470                 talloc_free(state);
471                 return;
472         }
473
474         }
475
476         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
477         data.dsize = strlen((char *)data.dptr) + 1;
478         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
479
480         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
481
482
483         /* the control succeeded */
484         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
485         talloc_free(state);
486         return;
487 }
488
489 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
490 {
491         state->vnn->update_in_flight = false;
492         return 0;
493 }
494
495 /*
496   take over an ip address
497  */
498 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
499                               struct ctdb_req_control_old *c,
500                               struct ctdb_vnn *vnn)
501 {
502         int ret;
503         struct ctdb_do_takeip_state *state;
504
505         if (vnn->update_in_flight) {
506                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
507                                     "update for this IP already in flight\n",
508                                     ctdb_addr_to_str(&vnn->public_address),
509                                     vnn->public_netmask_bits));
510                 return -1;
511         }
512
513         ret = ctdb_vnn_assign_iface(ctdb, vnn);
514         if (ret != 0) {
515                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
516                                  "assign a usable interface\n",
517                                  ctdb_addr_to_str(&vnn->public_address),
518                                  vnn->public_netmask_bits));
519                 return -1;
520         }
521
522         state = talloc(vnn, struct ctdb_do_takeip_state);
523         CTDB_NO_MEMORY(ctdb, state);
524
525         state->c = talloc_steal(ctdb, c);
526         state->vnn   = vnn;
527
528         vnn->update_in_flight = true;
529         talloc_set_destructor(state, ctdb_takeip_destructor);
530
531         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
532                             ctdb_addr_to_str(&vnn->public_address),
533                             vnn->public_netmask_bits,
534                             ctdb_vnn_iface_string(vnn)));
535
536         ret = ctdb_event_script_callback(ctdb,
537                                          state,
538                                          ctdb_do_takeip_callback,
539                                          state,
540                                          CTDB_EVENT_TAKE_IP,
541                                          "%s %s %u",
542                                          ctdb_vnn_iface_string(vnn),
543                                          ctdb_addr_to_str(&vnn->public_address),
544                                          vnn->public_netmask_bits);
545
546         if (ret != 0) {
547                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
548                         ctdb_addr_to_str(&vnn->public_address),
549                         ctdb_vnn_iface_string(vnn)));
550                 talloc_free(state);
551                 return -1;
552         }
553
554         return 0;
555 }
556
/* Context carried from ctdb_do_updateip() to ctdb_do_updateip_callback() */
struct ctdb_do_updateip_state {
	/* control to reply to once the updateip event has finished */
	struct ctdb_req_control_old *c;
	/* interface the address was on before the move; restored on failure */
	struct ctdb_interface *old;
	/* public address being moved */
	struct ctdb_vnn *vnn;
};
562
563 /*
564   called when updateip event finishes
565  */
566 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
567                                       void *private_data)
568 {
569         struct ctdb_do_updateip_state *state =
570                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
571         int32_t ret;
572
573         if (status != 0) {
574                 if (status == -ETIME) {
575                         ctdb_ban_self(ctdb);
576                 }
577                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
578                         ctdb_addr_to_str(&state->vnn->public_address),
579                         state->old->name,
580                         ctdb_vnn_iface_string(state->vnn)));
581
582                 /*
583                  * All we can do is reset the old interface
584                  * and let the next run fix it
585                  */
586                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
587                 state->vnn->iface = state->old;
588                 state->vnn->iface->references++;
589
590                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
591                 talloc_free(state);
592                 return;
593         }
594
595         if (ctdb->do_checkpublicip) {
596
597         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
598         if (ret != 0) {
599                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
600                 talloc_free(state);
601                 return;
602         }
603
604         }
605
606         /* the control succeeded */
607         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
608         talloc_free(state);
609         return;
610 }
611
612 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
613 {
614         state->vnn->update_in_flight = false;
615         return 0;
616 }
617
618 /*
619   update (move) an ip address
620  */
621 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
622                                 struct ctdb_req_control_old *c,
623                                 struct ctdb_vnn *vnn)
624 {
625         int ret;
626         struct ctdb_do_updateip_state *state;
627         struct ctdb_interface *old = vnn->iface;
628         const char *new_name;
629
630         if (vnn->update_in_flight) {
631                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
632                                     "update for this IP already in flight\n",
633                                     ctdb_addr_to_str(&vnn->public_address),
634                                     vnn->public_netmask_bits));
635                 return -1;
636         }
637
638         ctdb_vnn_unassign_iface(ctdb, vnn);
639         ret = ctdb_vnn_assign_iface(ctdb, vnn);
640         if (ret != 0) {
641                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
642                                  "assin a usable interface (old iface '%s')\n",
643                                  ctdb_addr_to_str(&vnn->public_address),
644                                  vnn->public_netmask_bits,
645                                  old->name));
646                 return -1;
647         }
648
649         new_name = ctdb_vnn_iface_string(vnn);
650         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
651                 /* A benign update from one interface onto itself.
652                  * no need to run the eventscripts in this case, just return
653                  * success.
654                  */
655                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
656                 return 0;
657         }
658
659         state = talloc(vnn, struct ctdb_do_updateip_state);
660         CTDB_NO_MEMORY(ctdb, state);
661
662         state->c = talloc_steal(ctdb, c);
663         state->old = old;
664         state->vnn = vnn;
665
666         vnn->update_in_flight = true;
667         talloc_set_destructor(state, ctdb_updateip_destructor);
668
669         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
670                             "interface %s to %s\n",
671                             ctdb_addr_to_str(&vnn->public_address),
672                             vnn->public_netmask_bits,
673                             old->name,
674                             new_name));
675
676         ret = ctdb_event_script_callback(ctdb,
677                                          state,
678                                          ctdb_do_updateip_callback,
679                                          state,
680                                          CTDB_EVENT_UPDATE_IP,
681                                          "%s %s %s %u",
682                                          state->old->name,
683                                          new_name,
684                                          ctdb_addr_to_str(&vnn->public_address),
685                                          vnn->public_netmask_bits);
686         if (ret != 0) {
687                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
688                                  ctdb_addr_to_str(&vnn->public_address),
689                                  old->name, new_name));
690                 talloc_free(state);
691                 return -1;
692         }
693
694         return 0;
695 }
696
697 /*
698   Find the vnn of the node that has a public ip address
699   returns -1 if the address is not known as a public address
700  */
701 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
702 {
703         struct ctdb_vnn *vnn;
704
705         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
706                 if (ctdb_same_ip(&vnn->public_address, addr)) {
707                         return vnn;
708                 }
709         }
710
711         return NULL;
712 }
713
714 /*
715   take over an ip address
716  */
717 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
718                                  struct ctdb_req_control_old *c,
719                                  TDB_DATA indata,
720                                  bool *async_reply)
721 {
722         int ret;
723         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
724         struct ctdb_vnn *vnn;
725         bool have_ip = false;
726         bool do_updateip = false;
727         bool do_takeip = false;
728         struct ctdb_interface *best_iface = NULL;
729
730         if (pip->pnn != ctdb->pnn) {
731                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
732                                  "with pnn %d, but we're node %d\n",
733                                  ctdb_addr_to_str(&pip->addr),
734                                  pip->pnn, ctdb->pnn));
735                 return -1;
736         }
737
738         /* update out vnn list */
739         vnn = find_public_ip_vnn(ctdb, &pip->addr);
740         if (vnn == NULL) {
741                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
742                         ctdb_addr_to_str(&pip->addr)));
743                 return 0;
744         }
745
746         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
747                 have_ip = ctdb_sys_have_ip(&pip->addr);
748         }
749         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
750         if (best_iface == NULL) {
751                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
752                                  "a usable interface (old %s, have_ip %d)\n",
753                                  ctdb_addr_to_str(&vnn->public_address),
754                                  vnn->public_netmask_bits,
755                                  ctdb_vnn_iface_string(vnn),
756                                  have_ip));
757                 return -1;
758         }
759
760         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
761                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
762                 have_ip = false;
763         }
764
765
766         if (vnn->iface == NULL && have_ip) {
767                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
768                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
769                                  ctdb_addr_to_str(&vnn->public_address)));
770                 return 0;
771         }
772
773         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
774                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
775                                   "and we have it on iface[%s], but it was assigned to node %d"
776                                   "and we are node %d, banning ourself\n",
777                                  ctdb_addr_to_str(&vnn->public_address),
778                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
779                 ctdb_ban_self(ctdb);
780                 return -1;
781         }
782
783         if (vnn->pnn == -1 && have_ip) {
784                 vnn->pnn = ctdb->pnn;
785                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
786                                   "and we already have it on iface[%s], update local daemon\n",
787                                  ctdb_addr_to_str(&vnn->public_address),
788                                   ctdb_vnn_iface_string(vnn)));
789                 return 0;
790         }
791
792         if (vnn->iface) {
793                 if (vnn->iface != best_iface) {
794                         if (!vnn->iface->link_up) {
795                                 do_updateip = true;
796                         } else if (vnn->iface->references > (best_iface->references + 1)) {
797                                 /* only move when the rebalance gains something */
798                                         do_updateip = true;
799                         }
800                 }
801         }
802
803         if (!have_ip) {
804                 if (do_updateip) {
805                         ctdb_vnn_unassign_iface(ctdb, vnn);
806                         do_updateip = false;
807                 }
808                 do_takeip = true;
809         }
810
811         if (do_takeip) {
812                 ret = ctdb_do_takeip(ctdb, c, vnn);
813                 if (ret != 0) {
814                         return -1;
815                 }
816         } else if (do_updateip) {
817                 ret = ctdb_do_updateip(ctdb, c, vnn);
818                 if (ret != 0) {
819                         return -1;
820                 }
821         } else {
822                 /*
823                  * The interface is up and the kernel known the ip
824                  * => do nothing
825                  */
826                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
827                         ctdb_addr_to_str(&pip->addr),
828                         vnn->public_netmask_bits,
829                         ctdb_vnn_iface_string(vnn)));
830                 return 0;
831         }
832
833         /* tell ctdb_control.c that we will be replying asynchronously */
834         *async_reply = true;
835
836         return 0;
837 }
838
839 /*
840   kill any clients that are registered with a IP that is being released
841  */
842 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
843 {
844         struct ctdb_client_ip *ip;
845
846         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
847                 ctdb_addr_to_str(addr)));
848
849         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
850                 ctdb_sock_addr tmp_addr;
851
852                 tmp_addr = ip->addr;
853                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
854                         ip->client_id,
855                         ctdb_addr_to_str(&ip->addr)));
856
857                 if (ctdb_same_ip(&tmp_addr, addr)) {
858                         struct ctdb_client *client = reqid_find(ctdb->idr,
859                                                                 ip->client_id,
860                                                                 struct ctdb_client);
861                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
862                                 ip->client_id,
863                                 ctdb_addr_to_str(&ip->addr),
864                                 client->pid));
865
866                         if (client->pid != 0) {
867                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
868                                         (unsigned)client->pid,
869                                         ctdb_addr_to_str(addr),
870                                         ip->client_id));
871                                 kill(client->pid, SIGKILL);
872                         }
873                 }
874         }
875 }
876
/*
  Remove a public address entry from this node and free it.

  The vnn is unlinked from the ctdb->vnn list, its interface assignment
  is dropped, ctdb_remove_orphaned_ifaces() is given the chance to act
  on the interfaces it listed, and finally the vnn itself is freed.
  The caller must not touch vnn after this returns.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        /* take the entry off the list before tearing it down */
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        /* presumably drops interfaces that only this vnn referenced --
         * going by the helper's name; confirm against its definition */
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        talloc_free(vnn);
}
884
885 /*
886   called when releaseip event finishes
887  */
888 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
889                                 void *private_data)
890 {
891         struct takeover_callback_state *state = 
892                 talloc_get_type(private_data, struct takeover_callback_state);
893         TDB_DATA data;
894
895         if (status == -ETIME) {
896                 ctdb_ban_self(ctdb);
897         }
898
899         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
900                 if  (ctdb_sys_have_ip(state->addr)) {
901                         DEBUG(DEBUG_ERR,
902                               ("IP %s still hosted during release IP callback, failing\n",
903                                ctdb_addr_to_str(state->addr)));
904                         ctdb_request_control_reply(ctdb, state->c,
905                                                    NULL, -1, NULL);
906                         talloc_free(state);
907                         return;
908                 }
909         }
910
911         /* send a message to all clients of this node telling them
912            that the cluster has been reconfigured and they should
913            release any sockets on this IP */
914         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
915         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
916         data.dsize = strlen((char *)data.dptr)+1;
917
918         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
919
920         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
921
922         /* kill clients that have registered with this IP */
923         release_kill_clients(ctdb, state->addr);
924
925         ctdb_vnn_unassign_iface(ctdb, state->vnn);
926
927         /* Process the IP if it has been marked for deletion */
928         if (state->vnn->delete_pending) {
929                 do_delete_ip(ctdb, state->vnn);
930                 state->vnn = NULL;
931         }
932
933         /* the control succeeded */
934         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
935         talloc_free(state);
936 }
937
938 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
939 {
940         if (state->vnn != NULL) {
941                 state->vnn->update_in_flight = false;
942         }
943         return 0;
944 }
945
946 /*
947   release an ip address
948  */
949 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
950                                 struct ctdb_req_control_old *c,
951                                 TDB_DATA indata, 
952                                 bool *async_reply)
953 {
954         int ret;
955         struct takeover_callback_state *state;
956         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
957         struct ctdb_vnn *vnn;
958         char *iface;
959
960         /* update our vnn list */
961         vnn = find_public_ip_vnn(ctdb, &pip->addr);
962         if (vnn == NULL) {
963                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
964                         ctdb_addr_to_str(&pip->addr)));
965                 return 0;
966         }
967         vnn->pnn = pip->pnn;
968
969         /* stop any previous arps */
970         talloc_free(vnn->takeover_ctx);
971         vnn->takeover_ctx = NULL;
972
973         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
974          * lazy multicast to drop an IP from any node that isn't the
975          * intended new node.  The following causes makes ctdbd ignore
976          * a release for any address it doesn't host.
977          */
978         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
979                 if (!ctdb_sys_have_ip(&pip->addr)) {
980                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
981                                 ctdb_addr_to_str(&pip->addr),
982                                 vnn->public_netmask_bits,
983                                 ctdb_vnn_iface_string(vnn)));
984                         ctdb_vnn_unassign_iface(ctdb, vnn);
985                         return 0;
986                 }
987         } else {
988                 if (vnn->iface == NULL) {
989                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
990                                            ctdb_addr_to_str(&pip->addr),
991                                            vnn->public_netmask_bits));
992                         return 0;
993                 }
994         }
995
996         /* There is a potential race between take_ip and us because we
997          * update the VNN via a callback that run when the
998          * eventscripts have been run.  Avoid the race by allowing one
999          * update to be in flight at a time.
1000          */
1001         if (vnn->update_in_flight) {
1002                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1003                                     "update for this IP already in flight\n",
1004                                     ctdb_addr_to_str(&vnn->public_address),
1005                                     vnn->public_netmask_bits));
1006                 return -1;
1007         }
1008
1009         iface = strdup(ctdb_vnn_iface_string(vnn));
1010
1011         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1012                 ctdb_addr_to_str(&pip->addr),
1013                 vnn->public_netmask_bits,
1014                 iface,
1015                 pip->pnn));
1016
1017         state = talloc(ctdb, struct takeover_callback_state);
1018         if (state == NULL) {
1019                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1020                                __FILE__, __LINE__);
1021                 free(iface);
1022                 return -1;
1023         }
1024
1025         state->c = talloc_steal(state, c);
1026         state->addr = talloc(state, ctdb_sock_addr);       
1027         if (state->addr == NULL) {
1028                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1029                                __FILE__, __LINE__);
1030                 free(iface);
1031                 talloc_free(state);
1032                 return -1;
1033         }
1034         *state->addr = pip->addr;
1035         state->vnn   = vnn;
1036
1037         vnn->update_in_flight = true;
1038         talloc_set_destructor(state, ctdb_releaseip_destructor);
1039
1040         ret = ctdb_event_script_callback(ctdb, 
1041                                          state, release_ip_callback, state,
1042                                          CTDB_EVENT_RELEASE_IP,
1043                                          "%s %s %u",
1044                                          iface,
1045                                          ctdb_addr_to_str(&pip->addr),
1046                                          vnn->public_netmask_bits);
1047         free(iface);
1048         if (ret != 0) {
1049                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1050                         ctdb_addr_to_str(&pip->addr),
1051                         ctdb_vnn_iface_string(vnn)));
1052                 talloc_free(state);
1053                 return -1;
1054         }
1055
1056         /* tell the control that we will be reply asynchronously */
1057         *async_reply = true;
1058         return 0;
1059 }
1060
1061 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1062                                    ctdb_sock_addr *addr,
1063                                    unsigned mask, const char *ifaces,
1064                                    bool check_address)
1065 {
1066         struct ctdb_vnn      *vnn;
1067         uint32_t num = 0;
1068         char *tmp;
1069         const char *iface;
1070         int i;
1071         int ret;
1072
1073         tmp = strdup(ifaces);
1074         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1075                 if (!ctdb_sys_check_iface_exists(iface)) {
1076                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1077                         free(tmp);
1078                         return -1;
1079                 }
1080         }
1081         free(tmp);
1082
1083         /* Verify that we don't have an entry for this ip yet */
1084         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1085                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1086                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1087                                 ctdb_addr_to_str(addr)));
1088                         return -1;
1089                 }               
1090         }
1091
1092         /* create a new vnn structure for this ip address */
1093         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1094         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1095         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1096         tmp = talloc_strdup(vnn, ifaces);
1097         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1098         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1099                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1100                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1101                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1102                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1103                 num++;
1104         }
1105         talloc_free(tmp);
1106         vnn->ifaces[num] = NULL;
1107         vnn->public_address      = *addr;
1108         vnn->public_netmask_bits = mask;
1109         vnn->pnn                 = -1;
1110         if (check_address) {
1111                 if (ctdb_sys_have_ip(addr)) {
1112                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1113                         vnn->pnn = ctdb->pnn;
1114                 }
1115         }
1116
1117         for (i=0; vnn->ifaces[i]; i++) {
1118                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1119                 if (ret != 0) {
1120                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1121                                            "for public_address[%s]\n",
1122                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1123                         talloc_free(vnn);
1124                         return -1;
1125                 }
1126         }
1127
1128         DLIST_ADD(ctdb->vnn, vnn);
1129
1130         return 0;
1131 }
1132
1133 /*
1134   setup the public address lists from a file
1135 */
1136 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1137 {
1138         char **lines;
1139         int nlines;
1140         int i;
1141
1142         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1143         if (lines == NULL) {
1144                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1145                 return -1;
1146         }
1147         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1148                 nlines--;
1149         }
1150
1151         for (i=0;i<nlines;i++) {
1152                 unsigned mask;
1153                 ctdb_sock_addr addr;
1154                 const char *addrstr;
1155                 const char *ifaces;
1156                 char *tok, *line;
1157
1158                 line = lines[i];
1159                 while ((*line == ' ') || (*line == '\t')) {
1160                         line++;
1161                 }
1162                 if (*line == '#') {
1163                         continue;
1164                 }
1165                 if (strcmp(line, "") == 0) {
1166                         continue;
1167                 }
1168                 tok = strtok(line, " \t");
1169                 addrstr = tok;
1170                 tok = strtok(NULL, " \t");
1171                 if (tok == NULL) {
1172                         if (NULL == ctdb->default_public_interface) {
1173                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1174                                          i+1));
1175                                 talloc_free(lines);
1176                                 return -1;
1177                         }
1178                         ifaces = ctdb->default_public_interface;
1179                 } else {
1180                         ifaces = tok;
1181                 }
1182
1183                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1184                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1185                         talloc_free(lines);
1186                         return -1;
1187                 }
1188                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1189                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1190                         talloc_free(lines);
1191                         return -1;
1192                 }
1193         }
1194
1195
1196         talloc_free(lines);
1197         return 0;
1198 }
1199
1200 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1201                               const char *iface,
1202                               const char *ip)
1203 {
1204         struct ctdb_vnn *svnn;
1205         struct ctdb_interface *cur = NULL;
1206         bool ok;
1207         int ret;
1208
1209         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1210         CTDB_NO_MEMORY(ctdb, svnn);
1211
1212         svnn->ifaces = talloc_array(svnn, const char *, 2);
1213         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1214         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1215         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1216         svnn->ifaces[1] = NULL;
1217
1218         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1219         if (!ok) {
1220                 talloc_free(svnn);
1221                 return -1;
1222         }
1223
1224         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1225         if (ret != 0) {
1226                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1227                                    "for single_ip[%s]\n",
1228                                    svnn->ifaces[0],
1229                                    ctdb_addr_to_str(&svnn->public_address)));
1230                 talloc_free(svnn);
1231                 return -1;
1232         }
1233
1234         /* assume the single public ip interface is initially "good" */
1235         cur = ctdb_find_iface(ctdb, iface);
1236         if (cur == NULL) {
1237                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1238                 return -1;
1239         }
1240         cur->link_up = true;
1241
1242         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1243         if (ret != 0) {
1244                 talloc_free(svnn);
1245                 return -1;
1246         }
1247
1248         ctdb->single_ip_vnn = svnn;
1249         return 0;
1250 }
1251
/* One public IP address as seen by the IP allocation code; entries are
 * chained into a singly linked list. */
struct public_ip_list {
        /* next entry, NULL at the end of the list */
        struct public_ip_list *next;
        /* node the address is assigned to; the allocation code stores
         * -1 here for an address that has no node */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1257
1258 /* Given a physical node, return the number of
1259    public addresses that is currently assigned to this node.
1260 */
1261 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1262 {
1263         int num=0;
1264
1265         for (;ips;ips=ips->next) {
1266                 if (ips->pnn == pnn) {
1267                         num++;
1268                 }
1269         }
1270         return num;
1271 }
1272
1273
1274 /* Can the given node host the given IP: is the public IP known to the
1275  * node and is NOIPHOST unset?
1276 */
1277 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1278                              int32_t pnn,
1279                              struct public_ip_list *ip)
1280 {
1281         struct ctdb_public_ip_list_old *public_ips;
1282         int i;
1283
1284         if (ipalloc_state->noiphost[pnn]) {
1285                 return false;
1286         }
1287
1288         public_ips = ipalloc_state->available_public_ips[pnn];
1289
1290         if (public_ips == NULL) {
1291                 return false;
1292         }
1293
1294         for (i=0; i<public_ips->num; i++) {
1295                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1296                         /* yes, this node can serve this public ip */
1297                         return true;
1298                 }
1299         }
1300
1301         return false;
1302 }
1303
1304 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1305                                  int32_t pnn,
1306                                  struct public_ip_list *ip)
1307 {
1308         if (ipalloc_state->noiptakeover[pnn]) {
1309                 return false;
1310         }
1311
1312         return can_node_host_ip(ipalloc_state, pnn, ip);
1313 }
1314
1315 /* search the node lists list for a node to takeover this ip.
1316    pick the node that currently are serving the least number of ips
1317    so that the ips get spread out evenly.
1318 */
1319 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1320                               struct public_ip_list *ip)
1321 {
1322         int pnn, min=0, num;
1323         int i, numnodes;
1324
1325         numnodes = ipalloc_state->num;
1326         pnn    = -1;
1327         for (i=0; i<numnodes; i++) {
1328                 /* verify that this node can serve this ip */
1329                 if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
1330                         /* no it couldnt   so skip to the next node */
1331                         continue;
1332                 }
1333
1334                 num = node_ip_coverage(i, ipalloc_state->all_ips);
1335                 /* was this the first node we checked ? */
1336                 if (pnn == -1) {
1337                         pnn = i;
1338                         min  = num;
1339                 } else {
1340                         if (num < min) {
1341                                 pnn = i;
1342                                 min  = num;
1343                         }
1344                 }
1345         }
1346         if (pnn == -1) {
1347                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1348                         ctdb_addr_to_str(&ip->addr)));
1349
1350                 return -1;
1351         }
1352
1353         ip->pnn = pnn;
1354         return 0;
1355 }
1356
1357 #define IP_KEYLEN       4
1358 static uint32_t *ip_key(ctdb_sock_addr *ip)
1359 {
1360         static uint32_t key[IP_KEYLEN];
1361
1362         bzero(key, sizeof(key));
1363
1364         switch (ip->sa.sa_family) {
1365         case AF_INET:
1366                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1367                 break;
1368         case AF_INET6: {
1369                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1370                 key[0]  = htonl(s6_a32[0]);
1371                 key[1]  = htonl(s6_a32[1]);
1372                 key[2]  = htonl(s6_a32[2]);
1373                 key[3]  = htonl(s6_a32[3]);
1374                 break;
1375         }
1376         default:
1377                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1378                 return key;
1379         }
1380
1381         return key;
1382 }
1383
1384 static void *add_ip_callback(void *parm, void *data)
1385 {
1386         struct public_ip_list *this_ip = parm;
1387         struct public_ip_list *prev_ip = data;
1388
1389         if (prev_ip == NULL) {
1390                 return parm;
1391         }
1392         if (this_ip->pnn == -1) {
1393                 this_ip->pnn = prev_ip->pnn;
1394         }
1395
1396         return parm;
1397 }
1398
1399 static int getips_count_callback(void *param, void *data)
1400 {
1401         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1402         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1403
1404         new_ip->next = *ip_list;
1405         *ip_list     = new_ip;
1406         return 0;
1407 }
1408
1409 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1410                                        struct ctdb_public_ip_list_old *ips,
1411                                        uint32_t pnn);
1412
1413 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1414                                          struct ipalloc_state *ipalloc_state,
1415                                          struct ctdb_node_map_old *nodemap)
1416 {
1417         int j;
1418         int ret;
1419
1420         if (ipalloc_state->num != nodemap->num) {
1421                 DEBUG(DEBUG_ERR,
1422                       (__location__
1423                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1424                        ipalloc_state->num, nodemap->num));
1425                 return -1;
1426         }
1427
1428         for (j=0; j<nodemap->num; j++) {
1429                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1430                         continue;
1431                 }
1432
1433                 /* Retrieve the list of known public IPs from the node */
1434                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1435                                         TAKEOVER_TIMEOUT(),
1436                                         j,
1437                                         ipalloc_state->known_public_ips,
1438                                         0,
1439                                         &ipalloc_state->known_public_ips[j]);
1440                 if (ret != 0) {
1441                         DEBUG(DEBUG_ERR,
1442                               ("Failed to read known public IPs from node: %u\n",
1443                                j));
1444                         return -1;
1445                 }
1446
1447                 if (ctdb->do_checkpublicip) {
1448                         verify_remote_ip_allocation(ctdb,
1449                                                     ipalloc_state->known_public_ips[j],
1450                                                     j);
1451                 }
1452
1453                 /* Retrieve the list of available public IPs from the node */
1454                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1455                                         TAKEOVER_TIMEOUT(),
1456                                         j,
1457                                         ipalloc_state->available_public_ips,
1458                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1459                                         &ipalloc_state->available_public_ips[j]);
1460                 if (ret != 0) {
1461                         DEBUG(DEBUG_ERR,
1462                               ("Failed to read available public IPs from node: %u\n",
1463                                j));
1464                         return -1;
1465                 }
1466         }
1467
1468         return 0;
1469 }
1470
1471 static struct public_ip_list *
1472 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1473 {
1474         int i, j;
1475         struct public_ip_list *ip_list;
1476         struct ctdb_public_ip_list_old *public_ips;
1477
1478         TALLOC_FREE(ctdb->ip_tree);
1479         ctdb->ip_tree = trbt_create(ctdb, 0);
1480
1481         for (i=0; i < ctdb->num_nodes; i++) {
1482                 public_ips = ipalloc_state->known_public_ips[i];
1483
1484                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1485                         continue;
1486                 }
1487
1488                 /* there were no public ips for this node */
1489                 if (public_ips == NULL) {
1490                         continue;
1491                 }
1492
1493                 for (j=0; j < public_ips->num; j++) {
1494                         struct public_ip_list *tmp_ip;
1495
1496                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1497                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1498                         /* Do not use information about IP addresses hosted
1499                          * on other nodes, it may not be accurate */
1500                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1501                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1502                         } else {
1503                                 tmp_ip->pnn = -1;
1504                         }
1505                         tmp_ip->addr = public_ips->ips[j].addr;
1506                         tmp_ip->next = NULL;
1507
1508                         trbt_insertarray32_callback(ctdb->ip_tree,
1509                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1510                                 add_ip_callback,
1511                                 tmp_ip);
1512                 }
1513         }
1514
1515         ip_list = NULL;
1516         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1517
1518         return ip_list;
1519 }
1520
1521 /* 
1522  * This is the length of the longtest common prefix between the IPs.
1523  * It is calculated by XOR-ing the 2 IPs together and counting the
1524  * number of leading zeroes.  The implementation means that all
1525  * addresses end up being 128 bits long.
1526  *
1527  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1528  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1529  * lots of nodes and IP addresses?
1530  */
1531 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1532 {
1533         uint32_t ip1_k[IP_KEYLEN];
1534         uint32_t *t;
1535         int i;
1536         uint32_t x;
1537
1538         uint32_t distance = 0;
1539
1540         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1541         t = ip_key(ip2);
1542         for (i=0; i<IP_KEYLEN; i++) {
1543                 x = ip1_k[i] ^ t[i];
1544                 if (x == 0) {
1545                         distance += 32;
1546                 } else {
1547                         /* Count number of leading zeroes. 
1548                          * FIXME? This could be optimised...
1549                          */
1550                         while ((x & (1 << 31)) == 0) {
1551                                 x <<= 1;
1552                                 distance += 1;
1553                         }
1554                 }
1555         }
1556
1557         return distance;
1558 }
1559
1560 /* Calculate the IP distance for the given IP relative to IPs on the
1561    given node.  The ips argument is generally the all_ips variable
1562    used in the main part of the algorithm.
1563  */
1564 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1565                                   struct public_ip_list *ips,
1566                                   int pnn)
1567 {
1568         struct public_ip_list *t;
1569         uint32_t d;
1570
1571         uint32_t sum = 0;
1572
1573         for (t = ips; t != NULL; t = t->next) {
1574                 if (t->pnn != pnn) {
1575                         continue;
1576                 }
1577
1578                 /* Optimisation: We never calculate the distance
1579                  * between an address and itself.  This allows us to
1580                  * calculate the effect of removing an address from a
1581                  * node by simply calculating the distance between
1582                  * that address and all of the exitsing addresses.
1583                  * Moreover, we assume that we're only ever dealing
1584                  * with addresses from all_ips so we can identify an
1585                  * address via a pointer rather than doing a more
1586                  * expensive address comparison. */
1587                 if (&(t->addr) == ip) {
1588                         continue;
1589                 }
1590
1591                 d = ip_distance(ip, &(t->addr));
1592                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1593         }
1594
1595         return sum;
1596 }
1597
1598 /* Return the LCP2 imbalance metric for addresses currently assigned
1599    to the given node.
1600  */
1601 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1602 {
1603         struct public_ip_list *t;
1604
1605         uint32_t imbalance = 0;
1606
1607         for (t = all_ips; t != NULL; t = t->next) {
1608                 if (t->pnn != pnn) {
1609                         continue;
1610                 }
1611                 /* Pass the rest of the IPs rather than the whole
1612                    all_ips input list.
1613                 */
1614                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1615         }
1616
1617         return imbalance;
1618 }
1619
1620 /* Allocate any unassigned IPs just by looping through the IPs and
1621  * finding the best node for each.
1622  */
1623 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
1624 {
1625         struct public_ip_list *t;
1626
1627         /* loop over all ip's and find a physical node to cover for
1628            each unassigned ip.
1629         */
1630         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1631                 if (t->pnn == -1) {
1632                         if (find_takeover_node(ipalloc_state, t)) {
1633                                 DEBUG(DEBUG_WARNING,
1634                                       ("Failed to find node to cover ip %s\n",
1635                                        ctdb_addr_to_str(&t->addr)));
1636                         }
1637                 }
1638         }
1639 }
1640
1641 /* Basic non-deterministic rebalancing algorithm.
1642  */
1643 static void basic_failback(struct ipalloc_state *ipalloc_state,
1644                            int num_ips)
1645 {
1646         int i, numnodes;
1647         int maxnode, maxnum, minnode, minnum, num, retries;
1648         struct public_ip_list *t;
1649
1650         numnodes = ipalloc_state->num;
1651         retries = 0;
1652
1653 try_again:
1654         maxnum=0;
1655         minnum=0;
1656
1657         /* for each ip address, loop over all nodes that can serve
1658            this ip and make sure that the difference between the node
1659            serving the most and the node serving the least ip's are
1660            not greater than 1.
1661         */
1662         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1663                 if (t->pnn == -1) {
1664                         continue;
1665                 }
1666
1667                 /* Get the highest and lowest number of ips's served by any 
1668                    valid node which can serve this ip.
1669                 */
1670                 maxnode = -1;
1671                 minnode = -1;
1672                 for (i=0; i<numnodes; i++) {
1673                         /* only check nodes that can actually serve this ip */
1674                         if (!can_node_takeover_ip(ipalloc_state, i,
1675                                                   t)) {
1676                                 /* no it couldnt   so skip to the next node */
1677                                 continue;
1678                         }
1679
1680                         num = node_ip_coverage(i, ipalloc_state->all_ips);
1681                         if (maxnode == -1) {
1682                                 maxnode = i;
1683                                 maxnum  = num;
1684                         } else {
1685                                 if (num > maxnum) {
1686                                         maxnode = i;
1687                                         maxnum  = num;
1688                                 }
1689                         }
1690                         if (minnode == -1) {
1691                                 minnode = i;
1692                                 minnum  = num;
1693                         } else {
1694                                 if (num < minnum) {
1695                                         minnode = i;
1696                                         minnum  = num;
1697                                 }
1698                         }
1699                 }
1700                 if (maxnode == -1) {
1701                         DEBUG(DEBUG_WARNING,
1702                               (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1703                                ctdb_addr_to_str(&t->addr)));
1704
1705                         continue;
1706                 }
1707
1708                 /* if the spread between the smallest and largest coverage by
1709                    a node is >=2 we steal one of the ips from the node with
1710                    most coverage to even things out a bit.
1711                    try to do this a limited number of times since we dont
1712                    want to spend too much time balancing the ip coverage.
1713                 */
1714                 if ((maxnum > minnum+1) &&
1715                     (retries < (num_ips + 5))){
1716                         struct public_ip_list *tt;
1717
1718                         /* Reassign one of maxnode's VNNs */
1719                         for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) {
1720                                 if (tt->pnn == maxnode) {
1721                                         (void)find_takeover_node(ipalloc_state,
1722                                                                  tt);
1723                                         retries++;
1724                                         goto try_again;;
1725                                 }
1726                         }
1727                 }
1728         }
1729 }
1730
1731 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
1732                       uint32_t **lcp2_imbalances,
1733                       bool **rebalance_candidates)
1734 {
1735         int i, numnodes;
1736         struct public_ip_list *t;
1737
1738         numnodes = ipalloc_state->num;
1739
1740         *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
1741         if (*rebalance_candidates == NULL) {
1742                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1743                 return false;
1744         }
1745         *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
1746         if (*lcp2_imbalances == NULL) {
1747                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1748                 return false;
1749         }
1750
1751         for (i=0; i<numnodes; i++) {
1752                 (*lcp2_imbalances)[i] =
1753                         lcp2_imbalance(ipalloc_state->all_ips, i);
1754                 /* First step: assume all nodes are candidates */
1755                 (*rebalance_candidates)[i] = true;
1756         }
1757
1758         /* 2nd step: if a node has IPs assigned then it must have been
1759          * healthy before, so we remove it from consideration.  This
1760          * is overkill but is all we have because we don't maintain
1761          * state between takeover runs.  An alternative would be to
1762          * keep state and invalidate it every time the recovery master
1763          * changes.
1764          */
1765         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1766                 if (t->pnn != -1) {
1767                         (*rebalance_candidates)[t->pnn] = false;
1768                 }
1769         }
1770
1771         /* 3rd step: if a node is forced to re-balance then
1772            we allow failback onto the node */
1773         if (ipalloc_state->force_rebalance_nodes == NULL) {
1774                 return true;
1775         }
1776         for (i = 0;
1777              i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
1778              i++) {
1779                 uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
1780                 if (pnn >= numnodes) {
1781                         DEBUG(DEBUG_ERR,
1782                               (__location__ "unknown node %u\n", pnn));
1783                         continue;
1784                 }
1785
1786                 DEBUG(DEBUG_NOTICE,
1787                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1788                 (*rebalance_candidates)[pnn] = true;
1789         }
1790
1791         return true;
1792 }
1793
1794 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1795  * the IP/node combination that will cost the least.
1796  */
1797 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1798                                      uint32_t *lcp2_imbalances)
1799 {
1800         struct public_ip_list *t;
1801         int dstnode, numnodes;
1802
1803         int minnode;
1804         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1805         struct public_ip_list *minip;
1806
1807         bool should_loop = true;
1808         bool have_unassigned = true;
1809
1810         numnodes = ipalloc_state->num;
1811
1812         while (have_unassigned && should_loop) {
1813                 should_loop = false;
1814
1815                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1816                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1817
1818                 minnode = -1;
1819                 mindsum = 0;
1820                 minip = NULL;
1821
1822                 /* loop over each unassigned ip. */
1823                 for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
1824                         if (t->pnn != -1) {
1825                                 continue;
1826                         }
1827
1828                         for (dstnode = 0; dstnode < numnodes; dstnode++) {
1829                                 /* only check nodes that can actually takeover this ip */
1830                                 if (!can_node_takeover_ip(ipalloc_state,
1831                                                           dstnode,
1832                                                           t)) {
1833                                         /* no it couldnt   so skip to the next node */
1834                                         continue;
1835                                 }
1836
1837                                 dstdsum = ip_distance_2_sum(&(t->addr),
1838                                                             ipalloc_state->all_ips,
1839                                                             dstnode);
1840                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1841                                 DEBUG(DEBUG_DEBUG,
1842                                       (" %s -> %d [+%d]\n",
1843                                        ctdb_addr_to_str(&(t->addr)),
1844                                        dstnode,
1845                                        dstimbl - lcp2_imbalances[dstnode]));
1846
1847
1848                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1849                                         minnode = dstnode;
1850                                         minimbl = dstimbl;
1851                                         mindsum = dstdsum;
1852                                         minip = t;
1853                                         should_loop = true;
1854                                 }
1855                         }
1856                 }
1857
1858                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1859
1860                 /* If we found one then assign it to the given node. */
1861                 if (minnode != -1) {
1862                         minip->pnn = minnode;
1863                         lcp2_imbalances[minnode] = minimbl;
1864                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1865                                           ctdb_addr_to_str(&(minip->addr)),
1866                                           minnode,
1867                                           mindsum));
1868                 }
1869
1870                 /* There might be a better way but at least this is clear. */
1871                 have_unassigned = false;
1872                 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1873                         if (t->pnn == -1) {
1874                                 have_unassigned = true;
1875                         }
1876                 }
1877         }
1878
1879         /* We know if we have an unassigned addresses so we might as
1880          * well optimise.
1881          */
1882         if (have_unassigned) {
1883                 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1884                         if (t->pnn == -1) {
1885                                 DEBUG(DEBUG_WARNING,
1886                                       ("Failed to find node to cover ip %s\n",
1887                                        ctdb_addr_to_str(&t->addr)));
1888                         }
1889                 }
1890         }
1891 }
1892
1893 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1894  * to move IPs from, determines the best IP/destination node
1895  * combination to move from the source node.
1896  */
1897 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1898                                     int srcnode,
1899                                     uint32_t *lcp2_imbalances,
1900                                     bool *rebalance_candidates)
1901 {
1902         int dstnode, mindstnode, numnodes;
1903         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1904         uint32_t minsrcimbl, mindstimbl;
1905         struct public_ip_list *minip;
1906         struct public_ip_list *t;
1907
1908         /* Find an IP and destination node that best reduces imbalance. */
1909         srcimbl = 0;
1910         minip = NULL;
1911         minsrcimbl = 0;
1912         mindstnode = -1;
1913         mindstimbl = 0;
1914
1915         numnodes = ipalloc_state->num;
1916
1917         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1918         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1919                            srcnode, lcp2_imbalances[srcnode]));
1920
1921         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1922                 /* Only consider addresses on srcnode. */
1923                 if (t->pnn != srcnode) {
1924                         continue;
1925                 }
1926
1927                 /* What is this IP address costing the source node? */
1928                 srcdsum = ip_distance_2_sum(&(t->addr),
1929                                             ipalloc_state->all_ips,
1930                                             srcnode);
1931                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1932
1933                 /* Consider this IP address would cost each potential
1934                  * destination node.  Destination nodes are limited to
1935                  * those that are newly healthy, since we don't want
1936                  * to do gratuitous failover of IPs just to make minor
1937                  * balance improvements.
1938                  */
1939                 for (dstnode = 0; dstnode < numnodes; dstnode++) {
1940                         if (!rebalance_candidates[dstnode]) {
1941                                 continue;
1942                         }
1943
1944                         /* only check nodes that can actually takeover this ip */
1945                         if (!can_node_takeover_ip(ipalloc_state, dstnode,
1946                                                   t)) {
1947                                 /* no it couldnt   so skip to the next node */
1948                                 continue;
1949                         }
1950
1951                         dstdsum = ip_distance_2_sum(&(t->addr),
1952                                                     ipalloc_state->all_ips,
1953                                                     dstnode);
1954                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1955                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1956                                            srcnode, -srcdsum,
1957                                            ctdb_addr_to_str(&(t->addr)),
1958                                            dstnode, dstdsum));
1959
1960                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1961                             (dstdsum < srcdsum) &&                      \
1962                             ((mindstnode == -1) ||                              \
1963                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1964
1965                                 minip = t;
1966                                 minsrcimbl = srcimbl;
1967                                 mindstnode = dstnode;
1968                                 mindstimbl = dstimbl;
1969                         }
1970                 }
1971         }
1972         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1973
1974         if (mindstnode != -1) {
1975                 /* We found a move that makes things better... */
1976                 DEBUG(DEBUG_INFO,
1977                       ("%d [%d] -> %s -> %d [+%d]\n",
1978                        srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1979                        ctdb_addr_to_str(&(minip->addr)),
1980                        mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1981
1982
1983                 lcp2_imbalances[srcnode] = minsrcimbl;
1984                 lcp2_imbalances[mindstnode] = mindstimbl;
1985                 minip->pnn = mindstnode;
1986
1987                 return true;
1988         }
1989
1990         return false;
1991         
1992 }
1993
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.  Orders
 * entries by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *left = a;
        const struct lcp2_imbalance_pnn *right = b;

        return (left->imbalance < right->imbalance) -
               (left->imbalance > right->imbalance);
}
2012
2013 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2014  * node with the highest LCP2 imbalance, and then determines the best
2015  * IP/destination node combination to move from the source node.
2016  */
2017 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2018                           uint32_t *lcp2_imbalances,
2019                           bool *rebalance_candidates)
2020 {
2021         int i, numnodes;
2022         struct lcp2_imbalance_pnn * lips;
2023         bool again;
2024
2025         numnodes = ipalloc_state->num;
2026
2027 try_again:
2028         /* Put the imbalances and nodes into an array, sort them and
2029          * iterate through candidates.  Usually the 1st one will be
2030          * used, so this doesn't cost much...
2031          */
2032         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2033         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2034         lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2035         for (i = 0; i < numnodes; i++) {
2036                 lips[i].imbalance = lcp2_imbalances[i];
2037                 lips[i].pnn = i;
2038                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2039         }
2040         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2041               lcp2_cmp_imbalance_pnn);
2042
2043         again = false;
2044         for (i = 0; i < numnodes; i++) {
2045                 /* This means that all nodes had 0 or 1 addresses, so
2046                  * can't be imbalanced.
2047                  */
2048                 if (lips[i].imbalance == 0) {
2049                         break;
2050                 }
2051
2052                 if (lcp2_failback_candidate(ipalloc_state,
2053                                             lips[i].pnn,
2054                                             lcp2_imbalances,
2055                                             rebalance_candidates)) {
2056                         again = true;
2057                         break;
2058                 }
2059         }
2060
2061         talloc_free(lips);
2062         if (again) {
2063                 goto try_again;
2064         }
2065 }
2066
2067 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
2068 {
2069         struct public_ip_list *t;
2070
2071         /* verify that the assigned nodes can serve that public ip
2072            and set it to -1 if not
2073         */
2074         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
2075                 if (t->pnn == -1) {
2076                         continue;
2077                 }
2078                 if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) {
2079                         /* this node can not serve this ip. */
2080                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2081                                            ctdb_addr_to_str(&(t->addr)),
2082                                            t->pnn));
2083                         t->pnn = -1;
2084                 }
2085         }
2086 }
2087
2088 static bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
2089 {
2090         struct public_ip_list *t;
2091         int i, numnodes;
2092
2093         numnodes = ipalloc_state->num;
2094
2095         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2096        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2097         *  always be allocated the same way for a specific set of
2098         *  available/unavailable nodes.
2099         */
2100
2101         for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
2102                 t->pnn = i % numnodes;
2103         }
2104
2105         /* IP failback doesn't make sense with deterministic
2106          * IPs, since the modulo step above implicitly fails
2107          * back IPs to their "home" node.
2108          */
2109         if (1 == ipalloc_state->no_ip_failback) {
2110                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2111         }
2112
2113         unassign_unsuitable_ips(ipalloc_state);
2114
2115         basic_allocate_unassigned(ipalloc_state);
2116
2117         /* No failback here! */
2118
2119         return true;
2120 }
2121
2122 static bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
2123 {
2124         /* This should be pushed down into basic_failback. */
2125         struct public_ip_list *t;
2126         int num_ips = 0;
2127         for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
2128                 num_ips++;
2129         }
2130
2131         unassign_unsuitable_ips(ipalloc_state);
2132
2133         basic_allocate_unassigned(ipalloc_state);
2134
2135         /* If we don't want IPs to fail back then don't rebalance IPs. */
2136         if (1 == ipalloc_state->no_ip_failback) {
2137                 return true;
2138         }
2139
2140         /* Now, try to make sure the ip adresses are evenly distributed
2141            across the nodes.
2142         */
2143         basic_failback(ipalloc_state, num_ips);
2144
2145         return true;
2146 }
2147
2148 static bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
2149 {
2150         uint32_t *lcp2_imbalances;
2151         bool *rebalance_candidates;
2152         int numnodes, num_rebalance_candidates, i;
2153         bool ret = true;
2154
2155         unassign_unsuitable_ips(ipalloc_state);
2156
2157         if (!lcp2_init(ipalloc_state,
2158                        &lcp2_imbalances, &rebalance_candidates)) {
2159                 ret = false;
2160                 goto finished;
2161         }
2162
2163         lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
2164
2165         /* If we don't want IPs to fail back then don't rebalance IPs. */
2166         if (1 == ipalloc_state->no_ip_failback) {
2167                 goto finished;
2168         }
2169
2170         /* It is only worth continuing if we have suitable target
2171          * nodes to transfer IPs to.  This check is much cheaper than
2172          * continuing on...
2173          */
2174         numnodes = ipalloc_state->num;
2175         num_rebalance_candidates = 0;
2176         for (i=0; i<numnodes; i++) {
2177                 if (rebalance_candidates[i]) {
2178                         num_rebalance_candidates++;
2179                 }
2180         }
2181         if (num_rebalance_candidates == 0) {
2182                 goto finished;
2183         }
2184
2185         /* Now, try to make sure the ip adresses are evenly distributed
2186            across the nodes.
2187         */
2188         lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
2189
2190 finished:
2191         return ret;
2192 }
2193
2194 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2195 {
2196         int i;
2197
2198         for (i=0;i<nodemap->num;i++) {
2199                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2200                         /* Found one completely healthy node */
2201                         return false;
2202                 }
2203         }
2204
2205         return true;
2206 }
2207
2208 /* The calculation part of the IP allocation algorithm. */
2209 static bool ipalloc(struct ipalloc_state *ipalloc_state)
2210 {
2211         bool ret;
2212
2213         switch (ipalloc_state->algorithm) {
2214         case IPALLOC_LCP2:
2215                 ret = ipalloc_lcp2(ipalloc_state);
2216                 break;
2217         case IPALLOC_DETERMINISTIC:
2218                 ret = ipalloc_deterministic(ipalloc_state);
2219                 break;
2220         case IPALLOC_NONDETERMINISTIC:
2221                 ret = ipalloc_nondeterministic(ipalloc_state);
2222                break;
2223         }
2224
2225         /* at this point ->pnn is the node which will own each IP
2226            or -1 if there is no node that can cover this ip
2227         */
2228
2229         return ret;
2230 }
2231
2232 struct get_tunable_callback_data {
2233         const char *tunable;
2234         uint32_t *out;
2235         bool fatal;
2236 };
2237
2238 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2239                                  int32_t res, TDB_DATA outdata,
2240                                  void *callback)
2241 {
2242         struct get_tunable_callback_data *cd =
2243                 (struct get_tunable_callback_data *)callback;
2244         int size;
2245
2246         if (res != 0) {
2247                 /* Already handled in fail callback */
2248                 return;
2249         }
2250
2251         if (outdata.dsize != sizeof(uint32_t)) {
2252                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2253                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2254                                  (int)outdata.dsize));
2255                 cd->fatal = true;
2256                 return;
2257         }
2258
2259         size = talloc_array_length(cd->out);
2260         if (pnn >= size) {
2261                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2262                                  cd->tunable, pnn, size));
2263                 return;
2264         }
2265
2266                 
2267         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2268 }
2269
2270 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2271                                        int32_t res, TDB_DATA outdata,
2272                                        void *callback)
2273 {
2274         struct get_tunable_callback_data *cd =
2275                 (struct get_tunable_callback_data *)callback;
2276
2277         switch (res) {
2278         case -ETIME:
2279                 DEBUG(DEBUG_ERR,
2280                       ("Timed out getting tunable \"%s\" from node %d\n",
2281                        cd->tunable, pnn));
2282                 cd->fatal = true;
2283                 break;
2284         case -EINVAL:
2285         case -1:
2286                 DEBUG(DEBUG_WARNING,
2287                       ("Tunable \"%s\" not implemented on node %d\n",
2288                        cd->tunable, pnn));
2289                 break;
2290         default:
2291                 DEBUG(DEBUG_ERR,
2292                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2293                        cd->tunable, pnn));
2294                 cd->fatal = true;
2295         }
2296 }
2297
2298 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2299                                         TALLOC_CTX *tmp_ctx,
2300                                         struct ctdb_node_map_old *nodemap,
2301                                         const char *tunable,
2302                                         uint32_t default_value)
2303 {
2304         TDB_DATA data;
2305         struct ctdb_control_get_tunable *t;
2306         uint32_t *nodes;
2307         uint32_t *tvals;
2308         struct get_tunable_callback_data callback_data;
2309         int i;
2310
2311         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2312         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2313         for (i=0; i<nodemap->num; i++) {
2314                 tvals[i] = default_value;
2315         }
2316                 
2317         callback_data.out = tvals;
2318         callback_data.tunable = tunable;
2319         callback_data.fatal = false;
2320
2321         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2322         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2323         t = (struct ctdb_control_get_tunable *)data.dptr;
2324         t->length = strlen(tunable)+1;
2325         memcpy(t->name, tunable, t->length);
2326         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2327         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2328                                       nodes, 0, TAKEOVER_TIMEOUT(),
2329                                       false, data,
2330                                       get_tunable_callback,
2331                                       get_tunable_fail_callback,
2332                                       &callback_data) != 0) {
2333                 if (callback_data.fatal) {
2334                         talloc_free(tvals);
2335                         tvals = NULL;
2336                 }
2337         }
2338         talloc_free(nodes);
2339         talloc_free(data.dptr);
2340
2341         return tvals;
2342 }
2343
2344 /* Set internal flags for IP allocation:
2345  *   Clear ip flags
2346  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2347  *   Set NOIPHOST ip flag for each INACTIVE node
2348  *   if all nodes are disabled:
2349  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2350  *   else
2351  *     Set NOIPHOST ip flags for disabled nodes
2352  */
2353 static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
2354                                  struct ctdb_node_map_old *nodemap,
2355                                  uint32_t *tval_noiptakeover,
2356                                  uint32_t *tval_noiphostonalldisabled)
2357 {
2358         int i;
2359
2360         for (i=0;i<nodemap->num;i++) {
2361                 /* Can not take IPs on node with NoIPTakeover set */
2362                 if (tval_noiptakeover[i] != 0) {
2363                         ipalloc_state->noiptakeover[i] = true;
2364                 }
2365
2366                 /* Can not host IPs on INACTIVE node */
2367                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2368                         ipalloc_state->noiphost[i] = true;
2369                 }
2370         }
2371
2372         if (all_nodes_are_disabled(nodemap)) {
2373                 /* If all nodes are disabled, can not host IPs on node
2374                  * with NoIPHostOnAllDisabled set
2375                  */
2376                 for (i=0;i<nodemap->num;i++) {
2377                         if (tval_noiphostonalldisabled[i] != 0) {
2378                                 ipalloc_state->noiphost[i] = true;
2379                         }
2380                 }
2381         } else {
2382                 /* If some nodes are not disabled, then can not host
2383                  * IPs on DISABLED node
2384                  */
2385                 for (i=0;i<nodemap->num;i++) {
2386                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2387                                 ipalloc_state->noiphost[i] = true;
2388                         }
2389                 }
2390         }
2391 }
2392
/* Fetch the NoIPTakeover and NoIPHostOnAllDisabled tunables (default
 * 0) via get_tunable_from_nodes() and apply them, together with the
 * node flags in nodemap, through set_ipflags_internal().
 *
 * Returns false if either tunable could not be fetched, true
 * otherwise.  The temporary arrays are allocated on ipalloc_state;
 * they are freed here on success and left to the caller on failure.
 */
static bool set_ipflags(struct ctdb_context *ctdb,
                        struct ipalloc_state *ipalloc_state,
                        struct ctdb_node_map_old *nodemap)
{
        uint32_t *noiptakeover;
        uint32_t *noiphost_all_disabled;
        bool ok = false;

        noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
                                              "NoIPTakeover", 0);
        if (noiptakeover != NULL) {
                noiphost_all_disabled =
                        get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
                                               "NoIPHostOnAllDisabled", 0);
                /* On failure the caller frees the talloc context */
                if (noiphost_all_disabled != NULL) {
                        set_ipflags_internal(ipalloc_state, nodemap,
                                             noiptakeover,
                                             noiphost_all_disabled);

                        talloc_free(noiptakeover);
                        talloc_free(noiphost_all_disabled);

                        ok = true;
                }
        }

        return ok;
}
2423
2424 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2425                                                  TALLOC_CTX *mem_ctx)
2426 {
2427         struct ipalloc_state *ipalloc_state =
2428                 talloc_zero(mem_ctx, struct ipalloc_state);
2429         if (ipalloc_state == NULL) {
2430                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2431                 return NULL;
2432         }
2433
2434         ipalloc_state->num = ctdb->num_nodes;
2435         ipalloc_state->known_public_ips =
2436                 talloc_zero_array(ipalloc_state,
2437                                   struct ctdb_public_ip_list_old *,
2438                                   ipalloc_state->num);
2439         if (ipalloc_state->known_public_ips == NULL) {
2440                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2441                 talloc_free(ipalloc_state);
2442                 return NULL;
2443         }
2444         ipalloc_state->available_public_ips =
2445                 talloc_zero_array(ipalloc_state,
2446                                   struct ctdb_public_ip_list_old *,
2447                                   ipalloc_state->num);
2448         if (ipalloc_state->available_public_ips == NULL) {
2449                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2450                 talloc_free(ipalloc_state);
2451                 return NULL;
2452         }
2453         ipalloc_state->noiptakeover =
2454                 talloc_zero_array(ipalloc_state,
2455                                   bool,
2456                                   ipalloc_state->num);
2457         if (ipalloc_state->noiptakeover == NULL) {
2458                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2459                 talloc_free(ipalloc_state);
2460                 return NULL;
2461         }
2462         ipalloc_state->noiphost =
2463                 talloc_zero_array(ipalloc_state,
2464                                   bool,
2465                                   ipalloc_state->num);
2466         if (ipalloc_state->noiphost == NULL) {
2467                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2468                 talloc_free(ipalloc_state);
2469                 return NULL;
2470         }
2471
2472         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2473                 ipalloc_state->algorithm = IPALLOC_LCP2;
2474         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2475                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2476         } else {
2477                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2478         }
2479
2480         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2481
2482         return ipalloc_state;
2483 }
2484
/* State shared between ctdb_takeover_run() and iprealloc_fail_callback()
 * while the IPREALLOCATED control is outstanding.
 */
struct iprealloc_callback_data {
        /* Indexed by PNN; set to true for nodes whose failure was not
         * a timeout and which should therefore be retried
         */
        bool *retry_nodes;
        /* Number of times a node has been flagged in retry_nodes */
        int retry_count;
        /* Caller's failure callback, invoked for timeouts; may be NULL */
        client_async_callback fail_callback;
        /* Opaque data passed through to fail_callback */
        void *fail_callback_data;
        /* Node map used to look up the flags of the failing node */
        struct ctdb_node_map_old *nodemap;
};
2492
2493 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2494                                         int32_t res, TDB_DATA outdata,
2495                                         void *callback)
2496 {
2497         int numnodes;
2498         struct iprealloc_callback_data *cd =
2499                 (struct iprealloc_callback_data *)callback;
2500
2501         numnodes = talloc_array_length(cd->retry_nodes);
2502         if (pnn > numnodes) {
2503                 DEBUG(DEBUG_ERR,
2504                       ("ipreallocated failure from node %d, "
2505                        "but only %d nodes in nodemap\n",
2506                        pnn, numnodes));
2507                 return;
2508         }
2509
2510         /* Can't run the "ipreallocated" event on a INACTIVE node */
2511         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2512                 DEBUG(DEBUG_WARNING,
2513                       ("ipreallocated failed on inactive node %d, ignoring\n",
2514                        pnn));
2515                 return;
2516         }
2517
2518         switch (res) {
2519         case -ETIME:
2520                 /* If the control timed out then that's a real error,
2521                  * so call the real fail callback
2522                  */
2523                 if (cd->fail_callback) {
2524                         cd->fail_callback(ctdb, pnn, res, outdata,
2525                                           cd->fail_callback_data);
2526                 } else {
2527                         DEBUG(DEBUG_WARNING,
2528                               ("iprealloc timed out but no callback registered\n"));
2529                 }
2530                 break;
2531         default:
2532                 /* If not a timeout then either the ipreallocated
2533                  * eventscript (or some setup) failed.  This might
2534                  * have failed because the IPREALLOCATED control isn't
2535                  * implemented - right now there is no way of knowing
2536                  * because the error codes are all folded down to -1.
2537                  * Consider retrying using EVENTSCRIPT control...
2538                  */
2539                 DEBUG(DEBUG_WARNING,
2540                       ("ipreallocated failure from node %d, flagging retry\n",
2541                        pnn));
2542                 cd->retry_nodes[pnn] = true;
2543                 cd->retry_count++;
2544         }
2545 }
2546
/* State used by takeover_run_fail_callback() to report each failing
 * node to the caller's fail callback at most once.
 */
struct takeover_callback_data {
        /* Indexed by position in nodemap->nodes[]; true once a failure
         * has been reported for that node
         */
        bool *node_failed;
        /* Caller's failure callback */
        client_async_callback fail_callback;
        /* Opaque data passed through to fail_callback */
        void *fail_callback_data;
        /* Node map used to translate a PNN into an index */
        struct ctdb_node_map_old *nodemap;
};
2553
2554 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2555                                        uint32_t node_pnn, int32_t res,
2556                                        TDB_DATA outdata, void *callback_data)
2557 {
2558         struct takeover_callback_data *cd =
2559                 talloc_get_type_abort(callback_data,
2560                                       struct takeover_callback_data);
2561         int i;
2562
2563         for (i = 0; i < cd->nodemap->num; i++) {
2564                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2565                         break;
2566                 }
2567         }
2568
2569         if (i == cd->nodemap->num) {
2570                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2571                 return;
2572         }
2573
2574         if (!cd->node_failed[i]) {
2575                 cd->node_failed[i] = true;
2576                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2577                                   cd->fail_callback_data);
2578         }
2579 }
2580
2581 /*
2582   make any IP alias changes for public addresses that are necessary 
2583  */
2584 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2585                       uint32_t *force_rebalance_nodes,
2586                       client_async_callback fail_callback, void *callback_data)
2587 {
2588         int i, j, ret;
2589         struct ctdb_public_ip ip;
2590         uint32_t *nodes;
2591         struct public_ip_list *all_ips, *tmp_ip;
2592         TDB_DATA data;
2593         struct timeval timeout;
2594         struct client_async_data *async_data;
2595         struct ctdb_client_control_state *state;
2596         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2597         struct ipalloc_state *ipalloc_state;
2598         struct takeover_callback_data *takeover_data;
2599         struct iprealloc_callback_data iprealloc_data;
2600         bool *retry_data;
2601         bool can_host_ips;
2602
2603         /*
2604          * ip failover is completely disabled, just send out the 
2605          * ipreallocated event.
2606          */
2607         if (ctdb->tunable.disable_ip_failover != 0) {
2608                 goto ipreallocated;
2609         }
2610
2611         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2612         if (ipalloc_state == NULL) {
2613                 talloc_free(tmp_ctx);
2614                 return -1;
2615         }
2616
2617         if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
2618                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2619                 talloc_free(tmp_ctx);
2620                 return -1;
2621         }
2622
2623         /* Fetch known/available public IPs from each active node */
2624         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2625         if (ret != 0) {
2626                 talloc_free(tmp_ctx);
2627                 return -1;
2628         }
2629
2630         /* Short-circuit IP allocation if no node has available IPs */
2631         can_host_ips = false;
2632         for (i=0; i < ipalloc_state->num; i++) {
2633                 if (ipalloc_state->available_public_ips[i] != NULL) {
2634                         can_host_ips = true;
2635                 }
2636         }
2637         if (!can_host_ips) {
2638                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2639                 return 0;
2640         }
2641
2642         /* since nodes only know about those public addresses that
2643            can be served by that particular node, no single node has
2644            a full list of all public addresses that exist in the cluster.
2645            Walk over all node structures and create a merged list of
2646            all public addresses that exist in the cluster.
2647
2648            keep the tree of ips around as ctdb->ip_tree
2649         */
2650         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2651         ipalloc_state->all_ips = all_ips;
2652
2653         ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
2654
2655         /* Do the IP reassignment calculations */
2656         ipalloc(ipalloc_state);
2657
2658         /* Now tell all nodes to release any public IPs should not
2659          * host.  This will be a NOOP on nodes that don't currently
2660          * hold the given IP.
2661          */
2662         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2663         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2664
2665         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2666                                                        bool, nodemap->num);
2667         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2668         takeover_data->fail_callback = fail_callback;
2669         takeover_data->fail_callback_data = callback_data;
2670         takeover_data->nodemap = nodemap;
2671
2672         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2673         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2674
2675         async_data->fail_callback = takeover_run_fail_callback;
2676         async_data->callback_data = takeover_data;
2677
2678         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2679
2680         /* Send a RELEASE_IP to all nodes that should not be hosting
2681          * each IP.  For each IP, all but one of these will be
2682          * redundant.  However, the redundant ones are used to tell
2683          * nodes which node should be hosting the IP so that commands
2684          * like "ctdb ip" can display a particular nodes idea of who
2685          * is hosting what. */
2686         for (i=0;i<nodemap->num;i++) {
2687                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2688                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2689                         continue;
2690                 }
2691
2692                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2693                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2694                                 /* This node should be serving this
2695                                    vnn so don't tell it to release the ip
2696                                 */
2697                                 continue;
2698                         }
2699                         ip.pnn  = tmp_ip->pnn;
2700                         ip.addr = tmp_ip->addr;
2701
2702                         timeout = TAKEOVER_TIMEOUT();
2703                         data.dsize = sizeof(ip);
2704                         data.dptr  = (uint8_t *)&ip;
2705                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2706                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2707                                                   data, async_data,
2708                                                   &timeout, NULL);
2709                         if (state == NULL) {
2710                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2711                                 talloc_free(tmp_ctx);
2712                                 return -1;
2713                         }
2714
2715                         ctdb_client_async_add(async_data, state);
2716                 }
2717         }
2718         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2719                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2720                 talloc_free(tmp_ctx);
2721                 return -1;
2722         }
2723         talloc_free(async_data);
2724
2725
2726         /* For each IP, send a TAKOVER_IP to the node that should be
2727          * hosting it.  Many of these will often be redundant (since
2728          * the allocation won't have changed) but they can be useful
2729          * to recover from inconsistencies. */
2730         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2731         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2732
2733         async_data->fail_callback = fail_callback;
2734         async_data->callback_data = callback_data;
2735
2736         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2737                 if (tmp_ip->pnn == -1) {
2738                         /* this IP won't be taken over */
2739                         continue;
2740                 }
2741
2742                 ip.pnn  = tmp_ip->pnn;
2743                 ip.addr = tmp_ip->addr;
2744
2745                 timeout = TAKEOVER_TIMEOUT();
2746                 data.dsize = sizeof(ip);
2747                 data.dptr  = (uint8_t *)&ip;
2748                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2749                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2750                                           data, async_data, &timeout, NULL);
2751                 if (state == NULL) {
2752                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2753                         talloc_free(tmp_ctx);
2754                         return -1;
2755                 }
2756
2757                 ctdb_client_async_add(async_data, state);
2758         }
2759         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2760                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2761                 talloc_free(tmp_ctx);
2762                 return -1;
2763         }
2764
2765 ipreallocated:
2766         /*
2767          * Tell all nodes to run eventscripts to process the
2768          * "ipreallocated" event.  This can do a lot of things,
2769          * including restarting services to reconfigure them if public
2770          * IPs have moved.  Once upon a time this event only used to
2771          * update natgw.
2772          */
2773         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2774         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2775         iprealloc_data.retry_nodes = retry_data;
2776         iprealloc_data.retry_count = 0;
2777         iprealloc_data.fail_callback = fail_callback;
2778         iprealloc_data.fail_callback_data = callback_data;
2779         iprealloc_data.nodemap = nodemap;
2780
2781         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2782         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2783                                         nodes, 0, TAKEOVER_TIMEOUT(),
2784                                         false, tdb_null,
2785                                         NULL, iprealloc_fail_callback,
2786                                         &iprealloc_data);
2787         if (ret != 0) {
2788                 /* If the control failed then we should retry to any
2789                  * nodes flagged by iprealloc_fail_callback using the
2790                  * EVENTSCRIPT control.  This is a best-effort at
2791                  * backward compatiblity when running a mixed cluster
2792                  * where some nodes have not yet been upgraded to
2793                  * support the IPREALLOCATED control.
2794                  */
2795                 DEBUG(DEBUG_WARNING,
2796                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2797
2798                 nodes = talloc_array(tmp_ctx, uint32_t,
2799                                      iprealloc_data.retry_count);
2800                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2801
2802                 j = 0;
2803                 for (i=0; i<nodemap->num; i++) {
2804                         if (iprealloc_data.retry_nodes[i]) {
2805                                 nodes[j] = i;
2806                                 j++;
2807                         }
2808                 }
2809
2810                 data.dptr  = discard_const("ipreallocated");
2811                 data.dsize = strlen((char *)data.dptr) + 1; 
2812                 ret = ctdb_client_async_control(ctdb,
2813                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2814                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2815                                                 false, data,
2816                                                 NULL, fail_callback,
2817                                                 callback_data);
2818                 if (ret != 0) {
2819                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2820                 }
2821         }
2822
2823         talloc_free(tmp_ctx);
2824         return ret;
2825 }
2826
2827
2828 /*
2829   destroy a ctdb_client_ip structure
2830  */
2831 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2832 {
2833         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2834                 ctdb_addr_to_str(&ip->addr),
2835                 ntohs(ip->addr.ip.sin_port),
2836                 ip->client_id));
2837
2838         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2839         return 0;
2840 }
2841
2842 /*
2843   called by a client to inform us of a TCP connection that it is managing
2844   that should tickled with an ACK when IP takeover is done
2845  */
2846 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2847                                 TDB_DATA indata)
2848 {
2849         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2850         struct ctdb_connection *tcp_sock = NULL;
2851         struct ctdb_tcp_list *tcp;
2852         struct ctdb_connection t;
2853         int ret;
2854         TDB_DATA data;
2855         struct ctdb_client_ip *ip;
2856         struct ctdb_vnn *vnn;
2857         ctdb_sock_addr addr;
2858
2859         /* If we don't have public IPs, tickles are useless */
2860         if (ctdb->vnn == NULL) {
2861                 return 0;
2862         }
2863
2864         tcp_sock = (struct ctdb_connection *)indata.dptr;
2865
2866         addr = tcp_sock->src;
2867         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2868         addr = tcp_sock->dst;
2869         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2870
2871         ZERO_STRUCT(addr);
2872         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2873         vnn = find_public_ip_vnn(ctdb, &addr);
2874         if (vnn == NULL) {
2875                 switch (addr.sa.sa_family) {
2876                 case AF_INET:
2877                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2878                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2879                                         ctdb_addr_to_str(&addr)));
2880                         }
2881                         break;
2882                 case AF_INET6:
2883                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2884                                 ctdb_addr_to_str(&addr)));
2885                         break;
2886                 default:
2887                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2888                 }
2889
2890                 return 0;
2891         }
2892
2893         if (vnn->pnn != ctdb->pnn) {
2894                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2895                         ctdb_addr_to_str(&addr),
2896                         client_id, client->pid));
2897                 /* failing this call will tell smbd to die */
2898                 return -1;
2899         }
2900
2901         ip = talloc(client, struct ctdb_client_ip);
2902         CTDB_NO_MEMORY(ctdb, ip);
2903
2904         ip->ctdb      = ctdb;
2905         ip->addr      = addr;
2906         ip->client_id = client_id;
2907         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2908         DLIST_ADD(ctdb->client_ip_list, ip);
2909
2910         tcp = talloc(client, struct ctdb_tcp_list);
2911         CTDB_NO_MEMORY(ctdb, tcp);
2912
2913         tcp->connection.src = tcp_sock->src;
2914         tcp->connection.dst = tcp_sock->dst;
2915
2916         DLIST_ADD(client->tcp_list, tcp);
2917
2918         t.src = tcp_sock->src;
2919         t.dst = tcp_sock->dst;
2920
2921         data.dptr = (uint8_t *)&t;
2922         data.dsize = sizeof(t);
2923
2924         switch (addr.sa.sa_family) {
2925         case AF_INET:
2926                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2927                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2928                         ctdb_addr_to_str(&tcp_sock->src),
2929                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2930                 break;
2931         case AF_INET6:
2932                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2933                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2934                         ctdb_addr_to_str(&tcp_sock->src),
2935                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2936                 break;
2937         default:
2938                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2939         }
2940
2941
2942         /* tell all nodes about this tcp connection */
2943         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2944                                        CTDB_CONTROL_TCP_ADD,
2945                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2946         if (ret != 0) {
2947                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2948                 return -1;
2949         }
2950
2951         return 0;
2952 }
2953
2954 /*
2955   find a tcp address on a list
2956  */
2957 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2958                                            struct ctdb_connection *tcp)
2959 {
2960         int i;
2961
2962         if (array == NULL) {
2963                 return NULL;
2964         }
2965
2966         for (i=0;i<array->num;i++) {
2967                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2968                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2969                         return &array->connections[i];
2970                 }
2971         }
2972         return NULL;
2973 }
2974
2975
2976
2977 /*
2978   called by a daemon to inform us of a TCP connection that one of its
2979   clients managing that should tickled with an ACK when IP takeover is
2980   done
2981  */
2982 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2983 {
2984         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2985         struct ctdb_tcp_array *tcparray;
2986         struct ctdb_connection tcp;
2987         struct ctdb_vnn *vnn;
2988
2989         /* If we don't have public IPs, tickles are useless */
2990         if (ctdb->vnn == NULL) {
2991                 return 0;
2992         }
2993
2994         vnn = find_public_ip_vnn(ctdb, &p->dst);
2995         if (vnn == NULL) {
2996                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2997                         ctdb_addr_to_str(&p->dst)));
2998
2999                 return -1;
3000         }
3001
3002
3003         tcparray = vnn->tcp_array;
3004
3005         /* If this is the first tickle */
3006         if (tcparray == NULL) {
3007                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3008                 CTDB_NO_MEMORY(ctdb, tcparray);
3009                 vnn->tcp_array = tcparray;
3010
3011                 tcparray->num = 0;
3012                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3013                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3014
3015                 tcparray->connections[tcparray->num].src = p->src;
3016                 tcparray->connections[tcparray->num].dst = p->dst;
3017                 tcparray->num++;
3018
3019                 if (tcp_update_needed) {
3020                         vnn->tcp_update_needed = true;
3021                 }
3022                 return 0;
3023         }
3024
3025
3026         /* Do we already have this tickle ?*/
3027         tcp.src = p->src;
3028         tcp.dst = p->dst;
3029         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3030                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3031                         ctdb_addr_to_str(&tcp.dst),
3032                         ntohs(tcp.dst.ip.sin_port),
3033                         vnn->pnn));
3034                 return 0;
3035         }
3036
3037         /* A new tickle, we must add it to the array */
3038         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3039                                         struct ctdb_connection,
3040                                         tcparray->num+1);
3041         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3042
3043         tcparray->connections[tcparray->num].src = p->src;
3044         tcparray->connections[tcparray->num].dst = p->dst;
3045         tcparray->num++;
3046
3047         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3048                 ctdb_addr_to_str(&tcp.dst),
3049                 ntohs(tcp.dst.ip.sin_port),
3050                 vnn->pnn));
3051
3052         if (tcp_update_needed) {
3053                 vnn->tcp_update_needed = true;
3054         }
3055
3056         return 0;
3057 }
3058
3059
3060 static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
3061 {
3062         struct ctdb_connection *tcpp;
3063
3064         if (vnn == NULL) {
3065                 return;
3066         }
3067
3068         /* if the array is empty we cant remove it
3069            and we don't need to do anything
3070          */
3071         if (vnn->tcp_array == NULL) {
3072                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3073                         ctdb_addr_to_str(&conn->dst),
3074                         ntohs(conn->dst.ip.sin_port)));
3075                 return;
3076         }
3077
3078
3079         /* See if we know this connection
3080            if we don't know this connection  then we dont need to do anything
3081          */
3082         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3083         if (tcpp == NULL) {
3084                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3085                         ctdb_addr_to_str(&conn->dst),
3086                         ntohs(conn->dst.ip.sin_port)));
3087                 return;
3088         }
3089
3090
3091         /* We need to remove this entry from the array.
3092            Instead of allocating a new array and copying data to it
3093            we cheat and just copy the last entry in the existing array
3094            to the entry that is to be removed and just shring the 
3095            ->num field
3096          */
3097         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3098         vnn->tcp_array->num--;
3099
3100         /* If we deleted the last entry we also need to remove the entire array
3101          */
3102         if (vnn->tcp_array->num == 0) {
3103                 talloc_free(vnn->tcp_array);
3104                 vnn->tcp_array = NULL;
3105         }               
3106
3107         vnn->tcp_update_needed = true;
3108
3109         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3110                 ctdb_addr_to_str(&conn->src),
3111                 ntohs(conn->src.ip.sin_port)));
3112 }
3113
3114
3115 /*
3116   called by a daemon to inform us of a TCP connection that one of its
3117   clients used are no longer needed in the tickle database
3118  */
3119 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3120 {
3121         struct ctdb_vnn *vnn;
3122         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3123
3124         /* If we don't have public IPs, tickles are useless */
3125         if (ctdb->vnn == NULL) {
3126                 return 0;
3127         }
3128
3129         vnn = find_public_ip_vnn(ctdb, &conn->dst);
3130         if (vnn == NULL) {
3131                 DEBUG(DEBUG_ERR,
3132                       (__location__ " unable to find public address %s\n",
3133                        ctdb_addr_to_str(&conn->dst)));
3134                 return 0;
3135         }
3136
3137         ctdb_remove_connection(vnn, conn);
3138
3139         return 0;
3140 }
3141
3142
3143 /*
3144   Called when another daemon starts - causes all tickles for all
3145   public addresses we are serving to be sent to the new node on the
3146   next check.  This actually causes the next scheduled call to
3147   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3148   doesn't require careful error handling.
3149  */
3150 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3151 {
3152         struct ctdb_vnn *vnn;
3153
3154         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3155                            (unsigned long) pnn));
3156
3157         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3158                 vnn->tcp_update_needed = true;
3159         }
3160
3161         return 0;
3162 }
3163
3164
3165 /*
3166   called when a client structure goes away - hook to remove
3167   elements from the tcp_list in all daemons
3168  */
3169 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3170 {
3171         while (client->tcp_list) {
3172                 struct ctdb_vnn *vnn;
3173                 struct ctdb_tcp_list *tcp = client->tcp_list;
3174                 struct ctdb_connection *conn = &tcp->connection;
3175
3176                 DLIST_REMOVE(client->tcp_list, tcp);
3177
3178                 vnn = find_public_ip_vnn(client->ctdb,
3179                                          &conn->dst);
3180                 if (vnn == NULL) {
3181                         DEBUG(DEBUG_ERR,
3182                               (__location__ " unable to find public address %s\n",
3183                                ctdb_addr_to_str(&conn->dst)));
3184                         continue;
3185                 }
3186
3187                 /* If the IP address is hosted on this node then
3188                  * remove the connection. */
3189                 if (vnn->pnn == client->ctdb->pnn) {
3190                         ctdb_remove_connection(vnn, conn);
3191                 }
3192
3193                 /* Otherwise this function has been called because the
3194                  * server IP address has been released to another node
3195                  * and the client has exited.  This means that we
3196                  * should not delete the connection information.  The
3197                  * takeover node processes connections too. */
3198         }
3199 }
3200
3201
3202 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3203 {
3204         struct ctdb_vnn *vnn;
3205         int count = 0;
3206
3207         if (ctdb->tunable.disable_ip_failover == 1) {
3208                 return;
3209         }
3210
3211         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3212                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3213                         ctdb_vnn_unassign_iface(ctdb, vnn);
3214                         continue;
3215                 }
3216                 if (!vnn->iface) {
3217                         continue;
3218                 }
3219
3220                 /* Don't allow multiple releases at once.  Some code,
3221                  * particularly ctdb_tickle_sentenced_connections() is
3222                  * not re-entrant */
3223                 if (vnn->update_in_flight) {
3224                         DEBUG(DEBUG_WARNING,
3225                               (__location__
3226                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3227                                     ctdb_addr_to_str(&vnn->public_address),
3228                                     vnn->public_netmask_bits,
3229                                     ctdb_vnn_iface_string(vnn)));
3230                         continue;
3231                 }
3232                 vnn->update_in_flight = true;
3233
3234                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3235                                     ctdb_addr_to_str(&vnn->public_address),
3236                                     vnn->public_netmask_bits,
3237                                     ctdb_vnn_iface_string(vnn)));
3238
3239                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3240                                   ctdb_vnn_iface_string(vnn),
3241                                   ctdb_addr_to_str(&vnn->public_address),
3242                                   vnn->public_netmask_bits);
3243                 release_kill_clients(ctdb, &vnn->public_address);
3244                 ctdb_vnn_unassign_iface(ctdb, vnn);
3245                 vnn->update_in_flight = false;
3246                 count++;
3247         }
3248
3249         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3250 }
3251
3252
3253 /*
3254   get list of public IPs
3255  */
3256 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3257                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3258 {
3259         int i, num, len;
3260         struct ctdb_public_ip_list_old *ips;
3261         struct ctdb_vnn *vnn;
3262         bool only_available = false;
3263
3264         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3265                 only_available = true;
3266         }
3267
3268         /* count how many public ip structures we have */
3269         num = 0;
3270         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3271                 num++;
3272         }
3273
3274         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3275                 num*sizeof(struct ctdb_public_ip);
3276         ips = talloc_zero_size(outdata, len);
3277         CTDB_NO_MEMORY(ctdb, ips);
3278
3279         i = 0;
3280         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3281                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3282                         continue;
3283                 }
3284                 ips->ips[i].pnn  = vnn->pnn;
3285                 ips->ips[i].addr = vnn->public_address;
3286                 i++;
3287         }
3288         ips->num = i;
3289         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3290                 i*sizeof(struct ctdb_public_ip);
3291
3292         outdata->dsize = len;
3293         outdata->dptr  = (uint8_t *)ips;
3294
3295         return 0;
3296 }
3297
3298
3299 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3300                                         struct ctdb_req_control_old *c,
3301                                         TDB_DATA indata,
3302                                         TDB_DATA *outdata)
3303 {
3304         int i, num, len;
3305         ctdb_sock_addr *addr;
3306         struct ctdb_public_ip_info_old *info;
3307         struct ctdb_vnn *vnn;
3308
3309         addr = (ctdb_sock_addr *)indata.dptr;
3310
3311         vnn = find_public_ip_vnn(ctdb, addr);
3312         if (vnn == NULL) {
3313                 /* if it is not a public ip   it could be our 'single ip' */
3314                 if (ctdb->single_ip_vnn) {
3315                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3316                                 vnn = ctdb->single_ip_vnn;
3317                         }
3318                 }
3319         }
3320         if (vnn == NULL) {
3321                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3322                                  "'%s'not a public address\n",
3323                                  ctdb_addr_to_str(addr)));
3324                 return -1;
3325         }
3326
3327         /* count how many public ip structures we have */
3328         num = 0;
3329         for (;vnn->ifaces[num];) {
3330                 num++;
3331         }
3332
3333         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3334                 num*sizeof(struct ctdb_iface);
3335         info = talloc_zero_size(outdata, len);
3336         CTDB_NO_MEMORY(ctdb, info);
3337
3338         info->ip.addr = vnn->public_address;
3339         info->ip.pnn = vnn->pnn;
3340         info->active_idx = 0xFFFFFFFF;
3341
3342         for (i=0; vnn->ifaces[i]; i++) {
3343                 struct ctdb_interface *cur;
3344
3345                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3346                 if (cur == NULL) {
3347                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3348                                            vnn->ifaces[i]));
3349                         return -1;
3350                 }
3351                 if (vnn->iface == cur) {
3352                         info->active_idx = i;
3353                 }
3354                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3355                 info->ifaces[i].link_state = cur->link_up;
3356                 info->ifaces[i].references = cur->references;
3357         }
3358         info->num = i;
3359         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3360                 i*sizeof(struct ctdb_iface);
3361
3362         outdata->dsize = len;
3363         outdata->dptr  = (uint8_t *)info;
3364
3365         return 0;
3366 }
3367
3368 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3369                                 struct ctdb_req_control_old *c,
3370                                 TDB_DATA *outdata)
3371 {
3372         int i, num, len;
3373         struct ctdb_iface_list_old *ifaces;
3374         struct ctdb_interface *cur;
3375
3376         /* count how many public ip structures we have */
3377         num = 0;
3378         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3379                 num++;
3380         }
3381
3382         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3383                 num*sizeof(struct ctdb_iface);
3384         ifaces = talloc_zero_size(outdata, len);
3385         CTDB_NO_MEMORY(ctdb, ifaces);
3386
3387         i = 0;
3388         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3389                 strcpy(ifaces->ifaces[i].name, cur->name);
3390                 ifaces->ifaces[i].link_state = cur->link_up;
3391                 ifaces->ifaces[i].references = cur->references;
3392                 i++;
3393         }
3394         ifaces->num = i;
3395         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3396                 i*sizeof(struct ctdb_iface);
3397
3398         outdata->dsize = len;
3399         outdata->dptr  = (uint8_t *)ifaces;
3400
3401         return 0;
3402 }
3403
3404 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3405                                     struct ctdb_req_control_old *c,
3406                                     TDB_DATA indata)
3407 {
3408         struct ctdb_iface *info;
3409         struct ctdb_interface *iface;
3410         bool link_up = false;
3411
3412         info = (struct ctdb_iface *)indata.dptr;
3413
3414         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3415                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3416                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3417                                   len, len, info->name));
3418                 return -1;
3419         }
3420
3421         switch (info->link_state) {
3422         case 0:
3423                 link_up = false;
3424                 break;
3425         case 1:
3426                 link_up = true;
3427                 break;
3428         default:
3429                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3430                                   (unsigned int)info->link_state));
3431                 return -1;
3432         }
3433
3434         if (info->references != 0) {
3435                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3436                                   (unsigned int)info->references));
3437                 return -1;
3438         }
3439
3440         iface = ctdb_find_iface(ctdb, info->name);
3441         if (iface == NULL) {
3442                 return -1;
3443         }
3444
3445         if (link_up == iface->link_up) {
3446                 return 0;
3447         }
3448
3449         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3450               ("iface[%s] has changed it's link status %s => %s\n",
3451                iface->name,
3452                iface->link_up?"up":"down",
3453                link_up?"up":"down"));
3454
3455         iface->link_up = link_up;
3456         return 0;
3457 }
3458
3459
3460 /* 
3461    structure containing the listening socket and the list of tcp connections
3462    that the ctdb daemon is to kill
3463 */
3464 struct ctdb_kill_tcp {
3465         struct ctdb_vnn *vnn;
3466         struct ctdb_context *ctdb;
3467         int capture_fd;
3468         struct tevent_fd *fde;
3469         trbt_tree_t *connections;
3470         void *private_data;
3471 };
3472
3473 /*
3474   a tcp connection that is to be killed
3475  */
3476 struct ctdb_killtcp_con {
3477         ctdb_sock_addr src_addr;
3478         ctdb_sock_addr dst_addr;
3479         int count;
3480         struct ctdb_kill_tcp *killtcp;
3481 };
3482
3483 /* this function is used to create a key to represent this socketpair
3484    in the killtcp tree.
3485    this key is used to insert and lookup matching socketpairs that are
3486    to be tickled and RST
3487 */
3488 #define KILLTCP_KEYLEN  10
3489 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3490 {
3491         static uint32_t key[KILLTCP_KEYLEN];
3492
3493         bzero(key, sizeof(key));
3494
3495         if (src->sa.sa_family != dst->sa.sa_family) {
3496                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3497                 return key;
3498         }
3499         
3500         switch (src->sa.sa_family) {
3501         case AF_INET:
3502                 key[0]  = dst->ip.sin_addr.s_addr;
3503                 key[1]  = src->ip.sin_addr.s_addr;
3504                 key[2]  = dst->ip.sin_port;
3505                 key[3]  = src->ip.sin_port;
3506                 break;
3507         case AF_INET6: {
3508                 uint32_t *dst6_addr32 =
3509                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3510                 uint32_t *src6_addr32 =
3511                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3512                 key[0]  = dst6_addr32[3];
3513                 key[1]  = src6_addr32[3];
3514                 key[2]  = dst6_addr32[2];
3515                 key[3]  = src6_addr32[2];
3516                 key[4]  = dst6_addr32[1];
3517                 key[5]  = src6_addr32[1];
3518                 key[6]  = dst6_addr32[0];
3519                 key[7]  = src6_addr32[0];
3520                 key[8]  = dst->ip6.sin6_port;
3521                 key[9]  = src->ip6.sin6_port;
3522                 break;
3523         }
3524         default:
3525                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3526                 return key;
3527         }
3528
3529         return key;
3530 }
3531
3532 /*
3533   called when we get a read event on the raw socket
3534  */
3535 static void capture_tcp_handler(struct tevent_context *ev,
3536                                 struct tevent_fd *fde,
3537                                 uint16_t flags, void *private_data)
3538 {
3539         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3540         struct ctdb_killtcp_con *con;
3541         ctdb_sock_addr src, dst;
3542         uint32_t ack_seq, seq;
3543
3544         if (!(flags & TEVENT_FD_READ)) {
3545                 return;
3546         }
3547
3548         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3549                                 killtcp->private_data,
3550                                 &src, &dst,
3551                                 &ack_seq, &seq) != 0) {
3552                 /* probably a non-tcp ACK packet */
3553                 return;
3554         }
3555
3556         /* check if we have this guy in our list of connections
3557            to kill
3558         */
3559         con = trbt_lookuparray32(killtcp->connections, 
3560                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3561         if (con == NULL) {
3562                 /* no this was some other packet we can just ignore */
3563                 return;
3564         }
3565
3566         /* This one has been tickled !
3567            now reset him and remove him from the list.
3568          */
3569         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3570                 ntohs(con->dst_addr.ip.sin_port),
3571                 ctdb_addr_to_str(&con->src_addr),
3572                 ntohs(con->src_addr.ip.sin_port)));
3573
3574         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3575         talloc_free(con);
3576 }
3577
3578
3579 /* when traversing the list of all tcp connections to send tickle acks to
3580    (so that we can capture the ack coming back and kill the connection
3581     by a RST)
3582    this callback is called for each connection we are currently trying to kill
3583 */
3584 static int tickle_connection_traverse(void *param, void *data)
3585 {
3586         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3587
3588         /* have tried too many times, just give up */
3589         if (con->count >= 5) {
3590                 /* can't delete in traverse: reparent to delete_cons */
3591                 talloc_steal(param, con);
3592                 return 0;
3593         }
3594
3595         /* othervise, try tickling it again */
3596         con->count++;
3597         ctdb_sys_send_tcp(
3598                 (ctdb_sock_addr *)&con->dst_addr,
3599                 (ctdb_sock_addr *)&con->src_addr,
3600                 0, 0, 0);
3601         return 0;
3602 }
3603
3604
3605 /* 
3606    called every second until all sentenced connections have been reset
3607  */
3608 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3609                                               struct tevent_timer *te,
3610                                               struct timeval t, void *private_data)
3611 {
3612         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3613         void *delete_cons = talloc_new(NULL);
3614
3615         /* loop over all connections sending tickle ACKs */
3616         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3617
3618         /* now we've finished traverse, it's safe to do deletion. */
3619         talloc_free(delete_cons);
3620
3621         /* If there are no more connections to kill we can remove the
3622            entire killtcp structure
3623          */
3624         if ( (killtcp->connections == NULL) || 
3625              (killtcp->connections->root == NULL) ) {
3626                 talloc_free(killtcp);
3627                 return;
3628         }
3629
3630         /* try tickling them again in a seconds time
3631          */
3632         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3633                          timeval_current_ofs(1, 0),
3634                          ctdb_tickle_sentenced_connections, killtcp);
3635 }
3636
3637 /*
3638   destroy the killtcp structure
3639  */
3640 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3641 {
3642         struct ctdb_vnn *tmpvnn;
3643
3644         /* verify that this vnn is still active */
3645         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3646                 if (tmpvnn == killtcp->vnn) {
3647                         break;
3648                 }
3649         }
3650
3651         if (tmpvnn == NULL) {
3652                 return 0;
3653         }
3654
3655         if (killtcp->vnn->killtcp != killtcp) {
3656                 return 0;
3657         }
3658
3659         killtcp->vnn->killtcp = NULL;
3660
3661         return 0;
3662 }
3663
3664
/*
   Insert callback for the killtcp tree: unconditionally replace any
   existing connection structure with the new one.

   parm is the new entry and is returned as the value to store; data
   is the entry previously stored under the same key (if any) and is
   ignored.  The old entry is deliberately not freed here: according
   to the original note it is talloc_stolen by the same node in the
   tree and goes away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
3676
3677 /*
3678   add a tcp socket to the list of connections we want to RST
3679  */
3680 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3681                                        ctdb_sock_addr *s,
3682                                        ctdb_sock_addr *d)
3683 {
3684         ctdb_sock_addr src, dst;
3685         struct ctdb_kill_tcp *killtcp;
3686         struct ctdb_killtcp_con *con;
3687         struct ctdb_vnn *vnn;
3688
3689         ctdb_canonicalize_ip(s, &src);
3690         ctdb_canonicalize_ip(d, &dst);
3691
3692         vnn = find_public_ip_vnn(ctdb, &dst);
3693         if (vnn == NULL) {
3694                 vnn = find_public_ip_vnn(ctdb, &src);
3695         }
3696         if (vnn == NULL) {
3697                 /* if it is not a public ip   it could be our 'single ip' */
3698                 if (ctdb->single_ip_vnn) {
3699                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3700                                 vnn = ctdb->single_ip_vnn;
3701                         }
3702                 }
3703         }
3704         if (vnn == NULL) {
3705                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3706                 return -1;
3707         }
3708
3709         killtcp = vnn->killtcp;
3710         
3711         /* If this is the first connection to kill we must allocate
3712            a new structure
3713          */
3714         if (killtcp == NULL) {
3715                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3716                 CTDB_NO_MEMORY(ctdb, killtcp);
3717
3718                 killtcp->vnn         = vnn;
3719                 killtcp->ctdb        = ctdb;
3720                 killtcp->capture_fd  = -1;
3721                 killtcp->connections = trbt_create(killtcp, 0);
3722
3723                 vnn->killtcp         = killtcp;
3724                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3725         }
3726
3727
3728
3729         /* create a structure that describes this connection we want to
3730            RST and store it in killtcp->connections
3731         */
3732         con = talloc(killtcp, struct ctdb_killtcp_con);
3733         CTDB_NO_MEMORY(ctdb, con);
3734         con->src_addr = src;
3735         con->dst_addr = dst;
3736         con->count    = 0;
3737         con->killtcp  = killtcp;
3738
3739
3740         trbt_insertarray32_callback(killtcp->connections,
3741                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3742                         add_killtcp_callback, con);
3743
3744         /* 
3745            If we don't have a socket to listen on yet we must create it
3746          */
3747         if (killtcp->capture_fd == -1) {
3748                 const char *iface = ctdb_vnn_iface_string(vnn);
3749                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3750                 if (killtcp->capture_fd == -1) {
3751                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3752                                           "socket on iface '%s' for killtcp (%s)\n",
3753                                           iface, strerror(errno)));
3754                         goto failed;
3755                 }
3756         }
3757
3758
3759         if (killtcp->fde == NULL) {
3760                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3761                                              killtcp->capture_fd,
3762                                              TEVENT_FD_READ,
3763                                              capture_tcp_handler, killtcp);
3764                 tevent_fd_set_auto_close(killtcp->fde);
3765
3766                 /* We also need to set up some events to tickle all these connections
3767                    until they are all reset
3768                 */
3769                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3770                                  ctdb_tickle_sentenced_connections, killtcp);
3771         }
3772
3773         /* tickle him once now */
3774         ctdb_sys_send_tcp(
3775                 &con->dst_addr,
3776                 &con->src_addr,
3777                 0, 0, 0);
3778
3779         return 0;
3780
3781 failed:
3782         talloc_free(vnn->killtcp);
3783         vnn->killtcp = NULL;
3784         return -1;
3785 }
3786
3787 /*
3788   kill a TCP connection.
3789  */
3790 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3791 {
3792         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3793
3794         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3795 }
3796
3797 /*
3798   called by a daemon to inform us of the entire list of TCP tickles for
3799   a particular public address.
3800   this control should only be sent by the node that is currently serving
3801   that public address.
3802  */
3803 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3804 {
3805         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3806         struct ctdb_tcp_array *tcparray;
3807         struct ctdb_vnn *vnn;
3808
3809         /* We must at least have tickles.num or else we cant verify the size
3810            of the received data blob
3811          */
3812         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3813                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3814                 return -1;
3815         }
3816
3817         /* verify that the size of data matches what we expect */
3818         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3819                          + sizeof(struct ctdb_connection) * list->num) {
3820                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3821                 return -1;
3822         }
3823
3824         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3825                            ctdb_addr_to_str(&list->addr)));
3826
3827         vnn = find_public_ip_vnn(ctdb, &list->addr);
3828         if (vnn == NULL) {
3829                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3830                         ctdb_addr_to_str(&list->addr)));
3831
3832                 return 1;
3833         }
3834
3835         if (vnn->pnn == ctdb->pnn) {
3836                 DEBUG(DEBUG_INFO,
3837                       ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
3838                        ctdb_addr_to_str(&list->addr)));
3839                 return 0;
3840         }
3841
3842         /* remove any old ticklelist we might have */
3843         talloc_free(vnn->tcp_array);
3844         vnn->tcp_array = NULL;
3845
3846         tcparray = talloc(vnn, struct ctdb_tcp_array);
3847         CTDB_NO_MEMORY(ctdb, tcparray);
3848
3849         tcparray->num = list->num;
3850
3851         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3852         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3853
3854         memcpy(tcparray->connections, &list->connections[0],
3855                sizeof(struct ctdb_connection)*tcparray->num);
3856
3857         /* We now have a new fresh tickle list array for this vnn */
3858         vnn->tcp_array = tcparray;
3859
3860         return 0;
3861 }
3862
3863 /*
3864   called to return the full list of tickles for the puclic address associated 
3865   with the provided vnn
3866  */
3867 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3868 {
3869         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3870         struct ctdb_tickle_list_old *list;
3871         struct ctdb_tcp_array *tcparray;
3872         int num;
3873         struct ctdb_vnn *vnn;
3874
3875         vnn = find_public_ip_vnn(ctdb, addr);
3876         if (vnn == NULL) {
3877                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3878                         ctdb_addr_to_str(addr)));
3879
3880                 return 1;
3881         }
3882
3883         tcparray = vnn->tcp_array;
3884         if (tcparray) {
3885                 num = tcparray->num;
3886         } else {
3887                 num = 0;
3888         }
3889
3890         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3891                         + sizeof(struct ctdb_connection) * num;
3892
3893         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3894         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3895         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3896
3897         list->addr = *addr;
3898         list->num = num;
3899         if (num) {
3900                 memcpy(&list->connections[0], tcparray->connections,
3901                         sizeof(struct ctdb_connection) * num);
3902         }
3903
3904         return 0;
3905 }
3906
3907
3908 /*
3909   set the list of all tcp tickles for a public address
3910  */
3911 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3912                                             ctdb_sock_addr *addr,
3913                                             struct ctdb_tcp_array *tcparray)
3914 {
3915         int ret, num;
3916         TDB_DATA data;
3917         struct ctdb_tickle_list_old *list;
3918
3919         if (tcparray) {
3920                 num = tcparray->num;
3921         } else {
3922                 num = 0;
3923         }
3924
3925         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3926                         sizeof(struct ctdb_connection) * num;
3927         data.dptr = talloc_size(ctdb, data.dsize);
3928         CTDB_NO_MEMORY(ctdb, data.dptr);
3929
3930         list = (struct ctdb_tickle_list_old *)data.dptr;
3931         list->addr = *addr;
3932         list->num = num;
3933         if (tcparray) {
3934                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3935         }
3936
3937         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3938                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3939                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3940         if (ret != 0) {
3941                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3942                 return -1;
3943         }
3944
3945         talloc_free(data.dptr);
3946
3947         return ret;
3948 }
3949
3950
3951 /*
3952   perform tickle updates if required
3953  */
3954 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3955                                     struct tevent_timer *te,
3956                                     struct timeval t, void *private_data)
3957 {
3958         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3959         int ret;
3960         struct ctdb_vnn *vnn;
3961
3962         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3963                 /* we only send out updates for public addresses that 
3964                    we have taken over
3965                  */
3966                 if (ctdb->pnn != vnn->pnn) {
3967                         continue;
3968                 }
3969                 /* We only send out the updates if we need to */
3970                 if (!vnn->tcp_update_needed) {
3971                         continue;
3972                 }
3973                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3974                                                        &vnn->public_address,
3975                                                        vnn->tcp_array);
3976                 if (ret != 0) {
3977                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3978                                 ctdb_addr_to_str(&vnn->public_address)));
3979                 } else {
3980                         DEBUG(DEBUG_INFO,
3981                               ("Sent tickle update for public address %s\n",
3982                                ctdb_addr_to_str(&vnn->public_address)));
3983                         vnn->tcp_update_needed = false;
3984                 }
3985         }
3986
3987         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3988                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3989                          ctdb_update_tcp_tickles, ctdb);
3990 }
3991
3992 /*
3993   start periodic update of tcp tickles
3994  */
3995 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3996 {
3997         ctdb->tickle_update_context = talloc_new(ctdb);
3998
3999         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
4000                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
4001                          ctdb_update_tcp_tickles, ctdb);
4002 }
4003
4004
4005
4006
4007 struct control_gratious_arp {
4008         struct ctdb_context *ctdb;
4009         ctdb_sock_addr addr;
4010         const char *iface;
4011         int count;
4012 };
4013
4014 /*
4015   send a control_gratuitous arp
4016  */
4017 static void send_gratious_arp(struct tevent_context *ev,
4018                               struct tevent_timer *te,
4019                               struct timeval t, void *private_data)
4020 {
4021         int ret;
4022         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4023                                                         struct control_gratious_arp);
4024
4025         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4026         if (ret != 0) {
4027                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4028                                  arp->iface, strerror(errno)));
4029         }
4030
4031
4032         arp->count++;
4033         if (arp->count == CTDB_ARP_REPEAT) {
4034                 talloc_free(arp);
4035                 return;
4036         }
4037
4038         tevent_add_timer(arp->ctdb->ev, arp,
4039                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4040                          send_gratious_arp, arp);
4041 }
4042
4043
4044 /*
4045   send a gratious arp 
4046  */
4047 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4048 {
4049         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4050         struct control_gratious_arp *arp;
4051
4052         /* verify the size of indata */
4053         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4054                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4055                                  (unsigned)indata.dsize, 
4056                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4057                 return -1;
4058         }
4059         if (indata.dsize != 
4060                 ( offsetof(struct ctdb_addr_info_old, iface)
4061                 + gratious_arp->len ) ){
4062
4063                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4064                         "but should be %u bytes\n", 
4065                          (unsigned)indata.dsize, 
4066                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4067                 return -1;
4068         }
4069
4070
4071         arp = talloc(ctdb, struct control_gratious_arp);
4072         CTDB_NO_MEMORY(ctdb, arp);
4073
4074         arp->ctdb  = ctdb;
4075         arp->addr   = gratious_arp->addr;
4076         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4077         CTDB_NO_MEMORY(ctdb, arp->iface);
4078         arp->count = 0;
4079
4080         tevent_add_timer(arp->ctdb->ev, arp,
4081                          timeval_zero(), send_gratious_arp, arp);
4082
4083         return 0;
4084 }
4085
4086 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4087 {
4088         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4089         int ret;
4090
4091         /* verify the size of indata */
4092         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4093                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4094                 return -1;
4095         }
4096         if (indata.dsize != 
4097                 ( offsetof(struct ctdb_addr_info_old, iface)
4098                 + pub->len ) ){
4099
4100                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4101                         "but should be %u bytes\n", 
4102                          (unsigned)indata.dsize, 
4103                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4104                 return -1;
4105         }
4106
4107         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4108
4109         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4110
4111         if (ret != 0) {
4112                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4113                 return -1;
4114         }
4115
4116         return 0;
4117 }
4118
/* State handed to delete_ip_callback() */
struct delete_ip_callback_state {
        /* the delete-address control request that is answered from
           the callback */
        struct ctdb_req_control_old *c;
};
4122
4123 /*
4124   called when releaseip event finishes for del_public_address
4125  */
4126 static void delete_ip_callback(struct ctdb_context *ctdb,
4127                                int32_t status, TDB_DATA data,
4128                                const char *errormsg,
4129                                void *private_data)
4130 {
4131         struct delete_ip_callback_state *state =
4132                 talloc_get_type(private_data, struct delete_ip_callback_state);
4133
4134         /* If release failed then fail. */
4135         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4136         talloc_free(private_data);
4137 }
4138
4139 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4140                                         struct ctdb_req_control_old *c,
4141                                         TDB_DATA indata, bool *async_reply)
4142 {
4143         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4144         struct ctdb_vnn *vnn;
4145
4146         /* verify the size of indata */
4147         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4148                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4149                 return -1;
4150         }
4151         if (indata.dsize != 
4152                 ( offsetof(struct ctdb_addr_info_old, iface)
4153                 + pub->len ) ){
4154
4155                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4156                         "but should be %u bytes\n", 
4157                          (unsigned)indata.dsize, 
4158                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4159                 return -1;
4160         }
4161
4162         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4163
4164         /* walk over all public addresses until we find a match */
4165         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4166                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4167                         if (vnn->pnn == ctdb->pnn) {
4168                                 struct delete_ip_callback_state *state;
4169                                 struct ctdb_public_ip *ip;
4170                                 TDB_DATA data;
4171                                 int ret;
4172
4173                                 vnn->delete_pending = true;
4174
4175                                 state = talloc(ctdb,
4176                                                struct delete_ip_callback_state);
4177                                 CTDB_NO_MEMORY(ctdb, state);
4178                                 state->c = c;
4179
4180                                 ip = talloc(state, struct ctdb_public_ip);
4181                                 if (ip == NULL) {
4182                                         DEBUG(DEBUG_ERR,
4183                                               (__location__ " Out of memory\n"));
4184                                         talloc_free(state);
4185                                         return -1;
4186                                 }
4187                                 ip->pnn = -1;
4188                                 ip->addr = pub->addr;
4189
4190                                 data.dsize = sizeof(struct ctdb_public_ip);
4191                                 data.dptr = (unsigned char *)ip;
4192
4193                                 ret = ctdb_daemon_send_control(ctdb,
4194                                                                ctdb_get_pnn(ctdb),
4195                                                                0,
4196                                                                CTDB_CONTROL_RELEASE_IP,
4197                                                                0, 0,
4198                                                                data,
4199                                                                delete_ip_callback,
4200                                                                state);
4201                                 if (ret == -1) {
4202                                         DEBUG(DEBUG_ERR,
4203                                               (__location__ "Unable to send "
4204                                                "CTDB_CONTROL_RELEASE_IP\n"));
4205                                         talloc_free(state);
4206                                         return -1;
4207                                 }
4208
4209                                 state->c = talloc_steal(state, c);
4210                                 *async_reply = true;
4211                         } else {
4212                                 /* This IP is not hosted on the
4213                                  * current node so just delete it
4214                                  * now. */
4215                                 do_delete_ip(ctdb, vnn);
4216                         }
4217
4218                         return 0;
4219                 }
4220         }
4221
4222         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4223                          ctdb_addr_to_str(&pub->addr)));
4224         return -1;
4225 }
4226
4227
/* State handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
        /* the control request that is answered once the event script
           has finished */
        struct ctdb_req_control_old *c;
};
4231
4232 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4233                                         int status, void *p)
4234 {
4235         struct ipreallocated_callback_state *state =
4236                 talloc_get_type(p, struct ipreallocated_callback_state);
4237
4238         if (status != 0) {
4239                 DEBUG(DEBUG_ERR,
4240                       (" \"ipreallocated\" event script failed (status %d)\n",
4241                        status));
4242                 if (status == -ETIME) {
4243                         ctdb_ban_self(ctdb);
4244                 }
4245         }
4246
4247         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4248         talloc_free(state);
4249 }
4250
4251 /* A control to run the ipreallocated event */
4252 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4253                                    struct ctdb_req_control_old *c,
4254                                    bool *async_reply)
4255 {
4256         int ret;
4257         struct ipreallocated_callback_state *state;
4258
4259         state = talloc(ctdb, struct ipreallocated_callback_state);
4260         CTDB_NO_MEMORY(ctdb, state);
4261
4262         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4263
4264         ret = ctdb_event_script_callback(ctdb, state,
4265                                          ctdb_ipreallocated_callback, state,
4266                                          CTDB_EVENT_IPREALLOCATED,
4267                                          "%s", "");
4268
4269         if (ret != 0) {
4270                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4271                 talloc_free(state);
4272                 return -1;
4273         }
4274
4275         /* tell the control that we will be reply asynchronously */
4276         state->c    = talloc_steal(state, c);
4277         *async_reply = true;
4278
4279         return 0;
4280 }
4281
4282
4283 /* This function is called from the recovery daemon to verify that a remote
4284    node has the expected ip allocation.
4285    This is verified against ctdb->ip_tree
4286 */
4287 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4288                                        struct ctdb_public_ip_list_old *ips,
4289                                        uint32_t pnn)
4290 {
4291         struct public_ip_list *tmp_ip;
4292         int i;
4293
4294         if (ctdb->ip_tree == NULL) {
4295                 /* don't know the expected allocation yet, assume remote node
4296                    is correct. */
4297                 return 0;
4298         }
4299
4300         if (ips == NULL) {
4301                 return 0;
4302         }
4303
4304         for (i=0; i<ips->num; i++) {
4305                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4306                 if (tmp_ip == NULL) {
4307                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4308                         return -1;
4309                 }
4310
4311                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4312                         continue;
4313                 }
4314
4315                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4316                         DEBUG(DEBUG_ERR,
4317                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4318                                pnn,
4319                                ctdb_addr_to_str(&ips->ips[i].addr),
4320                                ips->ips[i].pnn, tmp_ip->pnn));
4321                         return -1;
4322                 }
4323         }
4324
4325         return 0;
4326 }
4327
4328 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4329 {
4330         struct public_ip_list *tmp_ip;
4331
4332         /* IP tree is never built if DisableIPFailover is set */
4333         if (ctdb->tunable.disable_ip_failover != 0) {
4334                 return 0;
4335         }
4336
4337         if (ctdb->ip_tree == NULL) {
4338                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4339                 return -1;
4340         }
4341
4342         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4343         if (tmp_ip == NULL) {
4344                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4345                 return -1;
4346         }
4347
4348         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4349         tmp_ip->pnn = ip->pnn;
4350
4351         return 0;
4352 }
4353
4354 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4355 {
4356         TALLOC_FREE(ctdb->ip_tree);
4357 }
4358
/* State of one running "reload public IPs" operation */
struct ctdb_reloadips_handle {
        /* daemon context */
        struct ctdb_context *ctdb;
        /* pending control request; answered by the destructor */
        struct ctdb_req_control_old *c;
        /* status sent in the reply */
        int status;
        /* pipe to the child: the child writes its result to fd[1],
           the daemon reads it from fd[0] */
        int fd[2];
        /* pid of the child process doing the reload */
        pid_t child;
        /* fd event watching fd[0] */
        struct tevent_fd *fde;
};
4367
4368 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4369 {
4370         if (h == h->ctdb->reload_ips) {
4371                 h->ctdb->reload_ips = NULL;
4372         }
4373         if (h->c != NULL) {
4374                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4375                 h->c = NULL;
4376         }
4377         ctdb_kill(h->ctdb, h->child, SIGKILL);
4378         return 0;
4379 }
4380
4381 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4382                                          struct tevent_timer *te,
4383                                          struct timeval t, void *private_data)
4384 {
4385         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4386
4387         talloc_free(h);
4388 }
4389
4390 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4391                                          struct tevent_fd *fde,
4392                                          uint16_t flags, void *private_data)
4393 {
4394         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4395
4396         char res;
4397         int ret;
4398
4399         ret = sys_read(h->fd[0], &res, 1);
4400         if (ret < 1 || res != 0) {
4401                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4402                 res = 1;
4403         }
4404         h->status = res;
4405
4406         talloc_free(h);
4407 }
4408
4409 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4410 {
4411         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4412         struct ctdb_public_ip_list_old *ips;
4413         struct ctdb_vnn *vnn;
4414         struct client_async_data *async_data;
4415         struct timeval timeout;
4416         TDB_DATA data;
4417         struct ctdb_client_control_state *state;
4418         bool first_add;
4419         int i, ret;
4420
4421         CTDB_NO_MEMORY(ctdb, mem_ctx);
4422
4423         /* Read IPs from local node */
4424         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4425                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4426         if (ret != 0) {
4427                 DEBUG(DEBUG_ERR,
4428                       ("Unable to fetch public IPs from local node\n"));
4429                 talloc_free(mem_ctx);
4430                 return -1;
4431         }
4432
4433         /* Read IPs file - this is safe since this is a child process */
4434         ctdb->vnn = NULL;
4435         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4436                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4437                 talloc_free(mem_ctx);
4438                 return -1;
4439         }
4440
4441         async_data = talloc_zero(mem_ctx, struct client_async_data);
4442         CTDB_NO_MEMORY(ctdb, async_data);
4443
4444         /* Compare IPs between node and file for IPs to be deleted */
4445         for (i = 0; i < ips->num; i++) {
4446                 /* */
4447                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4448                         if (ctdb_same_ip(&vnn->public_address,
4449                                          &ips->ips[i].addr)) {
4450                                 /* IP is still in file */
4451                                 break;
4452                         }
4453                 }
4454
4455                 if (vnn == NULL) {
4456                         /* Delete IP ips->ips[i] */
4457                         struct ctdb_addr_info_old *pub;
4458
4459                         DEBUG(DEBUG_NOTICE,
4460                               ("IP %s no longer configured, deleting it\n",
4461                                ctdb_addr_to_str(&ips->ips[i].addr)));
4462
4463                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4464                         CTDB_NO_MEMORY(ctdb, pub);
4465
4466                         pub->addr  = ips->ips[i].addr;
4467                         pub->mask  = 0;
4468                         pub->len   = 0;
4469
4470                         timeout = TAKEOVER_TIMEOUT();
4471
4472                         data.dsize = offsetof(struct ctdb_addr_info_old,
4473                                               iface) + pub->len;
4474                         data.dptr = (uint8_t *)pub;
4475
4476                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4477                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4478                                                   0, data, async_data,
4479                                                   &timeout, NULL);
4480                         if (state == NULL) {
4481                                 DEBUG(DEBUG_ERR,
4482                                       (__location__
4483                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4484                                 goto failed;
4485                         }
4486
4487                         ctdb_client_async_add(async_data, state);
4488                 }
4489         }
4490
4491         /* Compare IPs between node and file for IPs to be added */
4492         first_add = true;
4493         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4494                 for (i = 0; i < ips->num; i++) {
4495                         if (ctdb_same_ip(&vnn->public_address,
4496                                          &ips->ips[i].addr)) {
4497                                 /* IP already on node */
4498                                 break;
4499                         }
4500                 }
4501                 if (i == ips->num) {
4502                         /* Add IP ips->ips[i] */
4503                         struct ctdb_addr_info_old *pub;
4504                         const char *ifaces = NULL;
4505                         uint32_t len;
4506                         int iface = 0;
4507
4508                         DEBUG(DEBUG_NOTICE,
4509                               ("New IP %s configured, adding it\n",
4510                                ctdb_addr_to_str(&vnn->public_address)));
4511                         if (first_add) {
4512                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4513
4514                                 data.dsize = sizeof(pnn);
4515                                 data.dptr  = (uint8_t *)&pnn;
4516
4517                                 ret = ctdb_client_send_message(
4518                                         ctdb,
4519                                         CTDB_BROADCAST_CONNECTED,
4520                                         CTDB_SRVID_REBALANCE_NODE,
4521                                         data);
4522                                 if (ret != 0) {
4523                                         DEBUG(DEBUG_WARNING,
4524                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4525                                 }
4526
4527                                 first_add = false;
4528                         }
4529
4530                         ifaces = vnn->ifaces[0];
4531                         iface = 1;
4532                         while (vnn->ifaces[iface] != NULL) {
4533                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4534                                                          vnn->ifaces[iface]);
4535                                 iface++;
4536                         }
4537
4538                         len   = strlen(ifaces) + 1;
4539                         pub = talloc_zero_size(mem_ctx,
4540                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4541                         CTDB_NO_MEMORY(ctdb, pub);
4542
4543                         pub->addr  = vnn->public_address;
4544                         pub->mask  = vnn->public_netmask_bits;
4545                         pub->len   = len;
4546                         memcpy(&pub->iface[0], ifaces, pub->len);
4547
4548                         timeout = TAKEOVER_TIMEOUT();
4549
4550                         data.dsize = offsetof(struct ctdb_addr_info_old,
4551                                               iface) + pub->len;
4552                         data.dptr = (uint8_t *)pub;
4553
4554                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4555                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4556                                                   0, data, async_data,
4557                                                   &timeout, NULL);
4558                         if (state == NULL) {
4559                                 DEBUG(DEBUG_ERR,
4560                                       (__location__
4561                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4562                                 goto failed;
4563                         }
4564
4565                         ctdb_client_async_add(async_data, state);
4566                 }
4567         }
4568
4569         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4570                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4571                 goto failed;
4572         }
4573
4574         talloc_free(mem_ctx);
4575         return 0;
4576
4577 failed:
4578         talloc_free(mem_ctx);
4579         return -1;
4580 }
4581
4582 /* This control is sent to force the node to re-read the public addresses file
4583    and drop any addresses we should nnot longer host, and add new addresses
4584    that we are now able to host
4585 */
4586 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4587 {
4588         struct ctdb_reloadips_handle *h;
4589         pid_t parent = getpid();
4590
4591         if (ctdb->reload_ips != NULL) {
4592                 talloc_free(ctdb->reload_ips);
4593                 ctdb->reload_ips = NULL;
4594         }
4595
4596         h = talloc(ctdb, struct ctdb_reloadips_handle);
4597         CTDB_NO_MEMORY(ctdb, h);
4598         h->ctdb     = ctdb;
4599         h->c        = NULL;
4600         h->status   = -1;
4601         
4602         if (pipe(h->fd) == -1) {
4603                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4604                 talloc_free(h);
4605                 return -1;
4606         }
4607
4608         h->child = ctdb_fork(ctdb);
4609         if (h->child == (pid_t)-1) {
4610                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4611                 close(h->fd[0]);
4612                 close(h->fd[1]);
4613                 talloc_free(h);
4614                 return -1;
4615         }
4616
4617         /* child process */
4618         if (h->child == 0) {
4619                 signed char res = 0;
4620
4621                 close(h->fd[0]);
4622                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4623
4624                 prctl_set_comment("ctdb_reloadips");
4625                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4626                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4627                         res = -1;
4628                 } else {
4629                         res = ctdb_reloadips_child(ctdb);
4630                         if (res != 0) {
4631                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4632                         }
4633                 }
4634
4635                 sys_write(h->fd[1], &res, 1);
4636                 /* make sure we die when our parent dies */
4637                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4638                         sleep(5);
4639                 }
4640                 _exit(0);
4641         }
4642
4643         h->c             = talloc_steal(h, c);
4644
4645         close(h->fd[1]);
4646         set_close_on_exec(h->fd[0]);
4647
4648         talloc_set_destructor(h, ctdb_reloadips_destructor);
4649
4650
4651         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4652                                ctdb_reloadips_child_handler, (void *)h);
4653         tevent_fd_set_auto_close(h->fde);
4654
4655         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4656                          ctdb_reloadips_timeout_event, h);
4657
4658         /* we reply later */
4659         *async_reply = true;
4660         return 0;
4661 }