ctdb-ipalloc: New enum ipalloc_algorithm in ipalloc_state
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/*
  Absolute timeout built from the takeover_timeout tunable.  Expands to
  an expression that needs a variable named 'ctdb' in scope at the
  point of use.
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
  CTDB_ARP_INTERVAL is the first argument given to timeval_current_ofs()
  when the gratuitous ARP timer is re-armed (presumably seconds);
  CTDB_ARP_REPEAT is the number of rounds after which the ARP state is
  freed.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
49
/*
  Flags used in IP allocation algorithms.
 */
struct ctdb_ipflags {
	/* NOTE(review): presumably "must not take over IPs" - confirm */
	bool noiptakeover;
	/* NOTE(review): presumably "must not host any IPs" - confirm */
	bool noiphost;
};
55
/*
  Selects which IP allocation algorithm is recorded in
  struct ipalloc_state.
 */
enum ipalloc_algorithm {
	IPALLOC_DETERMINISTIC = 0,
	IPALLOC_NONDETERMINISTIC = 1,
	IPALLOC_LCP2 = 2,
};
61
62 struct ipalloc_state {
63         uint32_t num;
64
65         /* Arrays with data for each node */
66         struct ctdb_public_ip_list_old **known_public_ips;
67         struct ctdb_public_ip_list_old **available_public_ips;
68
69         enum ipalloc_algorithm algorithm;
70 };
71
/*
  One local network interface that may carry public addresses.
  Entries are linked into the ctdb->ifaces list.
 */
struct ctdb_interface {
	/* list linkage */
	struct ctdb_interface *prev;
	struct ctdb_interface *next;
	/* interface name; lookups compare against it with strcmp() */
	const char *name;
	/* interfaces with link_up == false are skipped when picking one */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
78
79 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
80 {
81         if (vnn->iface) {
82                 return vnn->iface->name;
83         }
84
85         return "__none__";
86 }
87
88 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
89 {
90         struct ctdb_interface *i;
91
92         /* Verify that we don't have an entry for this ip yet */
93         for (i=ctdb->ifaces;i;i=i->next) {
94                 if (strcmp(i->name, iface) == 0) {
95                         return 0;
96                 }
97         }
98
99         /* create a new structure for this interface */
100         i = talloc_zero(ctdb, struct ctdb_interface);
101         CTDB_NO_MEMORY_FATAL(ctdb, i);
102         i->name = talloc_strdup(i, iface);
103         CTDB_NO_MEMORY(ctdb, i->name);
104
105         i->link_up = true;
106
107         DLIST_ADD(ctdb->ifaces, i);
108
109         return 0;
110 }
111
112 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
113                                         const char *name)
114 {
115         int n;
116
117         for (n = 0; vnn->ifaces[n] != NULL; n++) {
118                 if (strcmp(name, vnn->ifaces[n]) == 0) {
119                         return true;
120                 }
121         }
122
123         return false;
124 }
125
126 /* If any interfaces now have no possible IPs then delete them.  This
127  * implementation is naive (i.e. simple) rather than clever
128  * (i.e. complex).  Given that this is run on delip and that operation
129  * is rare, this doesn't need to be efficient - it needs to be
130  * foolproof.  One alternative is reference counting, where the logic
131  * is distributed and can, therefore, be broken in multiple places.
132  * Another alternative is to build a red-black tree of interfaces that
133  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
134  * once) and then walking ctdb->ifaces once and deleting those not in
135  * the tree.  Let's go to one of those if the naive implementation
136  * causes problems...  :-)
137  */
138 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
139                                         struct ctdb_vnn *vnn)
140 {
141         struct ctdb_interface *i, *next;
142
143         /* For each interface, check if there's an IP using it. */
144         for (i = ctdb->ifaces; i != NULL; i = next) {
145                 struct ctdb_vnn *tv;
146                 bool found;
147                 next = i->next;
148
149                 /* Only consider interfaces named in the given VNN. */
150                 if (!vnn_has_interface_with_name(vnn, i->name)) {
151                         continue;
152                 }
153
154                 /* Is the "single IP" on this interface? */
155                 if ((ctdb->single_ip_vnn != NULL) &&
156                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
157                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
158                         /* Found, next interface please... */
159                         continue;
160                 }
161                 /* Search for a vnn with this interface. */
162                 found = false;
163                 for (tv=ctdb->vnn; tv; tv=tv->next) {
164                         if (vnn_has_interface_with_name(tv, i->name)) {
165                                 found = true;
166                                 break;
167                         }
168                 }
169
170                 if (!found) {
171                         /* None of the VNNs are using this interface. */
172                         DLIST_REMOVE(ctdb->ifaces, i);
173                         talloc_free(i);
174                 }
175         }
176 }
177
178
179 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
180                                               const char *iface)
181 {
182         struct ctdb_interface *i;
183
184         for (i=ctdb->ifaces;i;i=i->next) {
185                 if (strcmp(i->name, iface) == 0) {
186                         return i;
187                 }
188         }
189
190         return NULL;
191 }
192
193 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
194                                                   struct ctdb_vnn *vnn)
195 {
196         int i;
197         struct ctdb_interface *cur = NULL;
198         struct ctdb_interface *best = NULL;
199
200         for (i=0; vnn->ifaces[i]; i++) {
201
202                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
203                 if (cur == NULL) {
204                         continue;
205                 }
206
207                 if (!cur->link_up) {
208                         continue;
209                 }
210
211                 if (best == NULL) {
212                         best = cur;
213                         continue;
214                 }
215
216                 if (cur->references < best->references) {
217                         best = cur;
218                         continue;
219                 }
220         }
221
222         return best;
223 }
224
225 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
226                                      struct ctdb_vnn *vnn)
227 {
228         struct ctdb_interface *best = NULL;
229
230         if (vnn->iface) {
231                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
232                                    "still assigned to iface '%s'\n",
233                                    ctdb_addr_to_str(&vnn->public_address),
234                                    ctdb_vnn_iface_string(vnn)));
235                 return 0;
236         }
237
238         best = ctdb_vnn_best_iface(ctdb, vnn);
239         if (best == NULL) {
240                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
241                                   "cannot assign to iface any iface\n",
242                                   ctdb_addr_to_str(&vnn->public_address)));
243                 return -1;
244         }
245
246         vnn->iface = best;
247         best->references++;
248         vnn->pnn = ctdb->pnn;
249
250         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
251                            "now assigned to iface '%s' refs[%d]\n",
252                            ctdb_addr_to_str(&vnn->public_address),
253                            ctdb_vnn_iface_string(vnn),
254                            best->references));
255         return 0;
256 }
257
258 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
259                                     struct ctdb_vnn *vnn)
260 {
261         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
262                            "now unassigned (old iface '%s' refs[%d])\n",
263                            ctdb_addr_to_str(&vnn->public_address),
264                            ctdb_vnn_iface_string(vnn),
265                            vnn->iface?vnn->iface->references:0));
266         if (vnn->iface) {
267                 vnn->iface->references--;
268         }
269         vnn->iface = NULL;
270         if (vnn->pnn == ctdb->pnn) {
271                 vnn->pnn = -1;
272         }
273 }
274
275 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
276                                struct ctdb_vnn *vnn)
277 {
278         int i;
279
280         /* Nodes that are not RUNNING can not host IPs */
281         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
282                 return false;
283         }
284
285         if (vnn->delete_pending) {
286                 return false;
287         }
288
289         if (vnn->iface && vnn->iface->link_up) {
290                 return true;
291         }
292
293         for (i=0; vnn->ifaces[i]; i++) {
294                 struct ctdb_interface *cur;
295
296                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
297                 if (cur == NULL) {
298                         continue;
299                 }
300
301                 if (cur->link_up) {
302                         return true;
303                 }
304         }
305
306         return false;
307 }
308
309 struct ctdb_takeover_arp {
310         struct ctdb_context *ctdb;
311         uint32_t count;
312         ctdb_sock_addr addr;
313         struct ctdb_tcp_array *tcparray;
314         struct ctdb_vnn *vnn;
315 };
316
317
318 /*
319   lists of tcp endpoints
320  */
321 struct ctdb_tcp_list {
322         struct ctdb_tcp_list *prev, *next;
323         struct ctdb_connection connection;
324 };
325
326 /*
327   list of clients to kill on IP release
328  */
329 struct ctdb_client_ip {
330         struct ctdb_client_ip *prev, *next;
331         struct ctdb_context *ctdb;
332         ctdb_sock_addr addr;
333         uint32_t client_id;
334 };
335
336
337 /*
338   send a gratuitous arp
339  */
340 static void ctdb_control_send_arp(struct tevent_context *ev,
341                                   struct tevent_timer *te,
342                                   struct timeval t, void *private_data)
343 {
344         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
345                                                         struct ctdb_takeover_arp);
346         int i, ret;
347         struct ctdb_tcp_array *tcparray;
348         const char *iface = ctdb_vnn_iface_string(arp->vnn);
349
350         ret = ctdb_sys_send_arp(&arp->addr, iface);
351         if (ret != 0) {
352                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
353                                   iface, strerror(errno)));
354         }
355
356         tcparray = arp->tcparray;
357         if (tcparray) {
358                 for (i=0;i<tcparray->num;i++) {
359                         struct ctdb_connection *tcon;
360
361                         tcon = &tcparray->connections[i];
362                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
363                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
364                                 ctdb_addr_to_str(&tcon->src),
365                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
366                         ret = ctdb_sys_send_tcp(
367                                 &tcon->src,
368                                 &tcon->dst,
369                                 0, 0, 0);
370                         if (ret != 0) {
371                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
372                                         ctdb_addr_to_str(&tcon->src)));
373                         }
374                 }
375         }
376
377         arp->count++;
378
379         if (arp->count == CTDB_ARP_REPEAT) {
380                 talloc_free(arp);
381                 return;
382         }
383
384         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
385                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
386                          ctdb_control_send_arp, arp);
387 }
388
389 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
390                                        struct ctdb_vnn *vnn)
391 {
392         struct ctdb_takeover_arp *arp;
393         struct ctdb_tcp_array *tcparray;
394
395         if (!vnn->takeover_ctx) {
396                 vnn->takeover_ctx = talloc_new(vnn);
397                 if (!vnn->takeover_ctx) {
398                         return -1;
399                 }
400         }
401
402         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
403         if (!arp) {
404                 return -1;
405         }
406
407         arp->ctdb = ctdb;
408         arp->addr = vnn->public_address;
409         arp->vnn  = vnn;
410
411         tcparray = vnn->tcp_array;
412         if (tcparray) {
413                 /* add all of the known tcp connections for this IP to the
414                    list of tcp connections to send tickle acks for */
415                 arp->tcparray = talloc_steal(arp, tcparray);
416
417                 vnn->tcp_array = NULL;
418                 vnn->tcp_update_needed = true;
419         }
420
421         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
422                          timeval_zero(), ctdb_control_send_arp, arp);
423
424         return 0;
425 }
426
427 struct takeover_callback_state {
428         struct ctdb_req_control_old *c;
429         ctdb_sock_addr *addr;
430         struct ctdb_vnn *vnn;
431 };
432
/*
  State carried from ctdb_do_takeip() to ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
	/* the control to reply to once the event script has finished */
	struct ctdb_req_control_old *c;
	/* the public address being taken over */
	struct ctdb_vnn *vnn;
};
437
438 /*
439   called when takeip event finishes
440  */
441 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
442                                     void *private_data)
443 {
444         struct ctdb_do_takeip_state *state =
445                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
446         int32_t ret;
447         TDB_DATA data;
448
449         if (status != 0) {
450                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
451         
452                 if (status == -ETIME) {
453                         ctdb_ban_self(ctdb);
454                 }
455                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
456                                  ctdb_addr_to_str(&state->vnn->public_address),
457                                  ctdb_vnn_iface_string(state->vnn)));
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459
460                 node->flags |= NODE_FLAGS_UNHEALTHY;
461                 talloc_free(state);
462                 return;
463         }
464
465         if (ctdb->do_checkpublicip) {
466
467         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
468         if (ret != 0) {
469                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
470                 talloc_free(state);
471                 return;
472         }
473
474         }
475
476         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
477         data.dsize = strlen((char *)data.dptr) + 1;
478         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
479
480         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
481
482
483         /* the control succeeded */
484         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
485         talloc_free(state);
486         return;
487 }
488
489 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
490 {
491         state->vnn->update_in_flight = false;
492         return 0;
493 }
494
495 /*
496   take over an ip address
497  */
498 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
499                               struct ctdb_req_control_old *c,
500                               struct ctdb_vnn *vnn)
501 {
502         int ret;
503         struct ctdb_do_takeip_state *state;
504
505         if (vnn->update_in_flight) {
506                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
507                                     "update for this IP already in flight\n",
508                                     ctdb_addr_to_str(&vnn->public_address),
509                                     vnn->public_netmask_bits));
510                 return -1;
511         }
512
513         ret = ctdb_vnn_assign_iface(ctdb, vnn);
514         if (ret != 0) {
515                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
516                                  "assign a usable interface\n",
517                                  ctdb_addr_to_str(&vnn->public_address),
518                                  vnn->public_netmask_bits));
519                 return -1;
520         }
521
522         state = talloc(vnn, struct ctdb_do_takeip_state);
523         CTDB_NO_MEMORY(ctdb, state);
524
525         state->c = talloc_steal(ctdb, c);
526         state->vnn   = vnn;
527
528         vnn->update_in_flight = true;
529         talloc_set_destructor(state, ctdb_takeip_destructor);
530
531         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
532                             ctdb_addr_to_str(&vnn->public_address),
533                             vnn->public_netmask_bits,
534                             ctdb_vnn_iface_string(vnn)));
535
536         ret = ctdb_event_script_callback(ctdb,
537                                          state,
538                                          ctdb_do_takeip_callback,
539                                          state,
540                                          CTDB_EVENT_TAKE_IP,
541                                          "%s %s %u",
542                                          ctdb_vnn_iface_string(vnn),
543                                          ctdb_addr_to_str(&vnn->public_address),
544                                          vnn->public_netmask_bits);
545
546         if (ret != 0) {
547                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
548                         ctdb_addr_to_str(&vnn->public_address),
549                         ctdb_vnn_iface_string(vnn)));
550                 talloc_free(state);
551                 return -1;
552         }
553
554         return 0;
555 }
556
/*
  State carried from ctdb_do_updateip() to ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
	/* the control to reply to once the event script has finished */
	struct ctdb_req_control_old *c;
	/* interface the address was on before the move; restored on failure */
	struct ctdb_interface *old;
	/* the public address being moved */
	struct ctdb_vnn *vnn;
};
562
563 /*
564   called when updateip event finishes
565  */
566 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
567                                       void *private_data)
568 {
569         struct ctdb_do_updateip_state *state =
570                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
571         int32_t ret;
572
573         if (status != 0) {
574                 if (status == -ETIME) {
575                         ctdb_ban_self(ctdb);
576                 }
577                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
578                         ctdb_addr_to_str(&state->vnn->public_address),
579                         state->old->name,
580                         ctdb_vnn_iface_string(state->vnn)));
581
582                 /*
583                  * All we can do is reset the old interface
584                  * and let the next run fix it
585                  */
586                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
587                 state->vnn->iface = state->old;
588                 state->vnn->iface->references++;
589
590                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
591                 talloc_free(state);
592                 return;
593         }
594
595         if (ctdb->do_checkpublicip) {
596
597         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
598         if (ret != 0) {
599                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
600                 talloc_free(state);
601                 return;
602         }
603
604         }
605
606         /* the control succeeded */
607         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
608         talloc_free(state);
609         return;
610 }
611
612 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
613 {
614         state->vnn->update_in_flight = false;
615         return 0;
616 }
617
618 /*
619   update (move) an ip address
620  */
621 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
622                                 struct ctdb_req_control_old *c,
623                                 struct ctdb_vnn *vnn)
624 {
625         int ret;
626         struct ctdb_do_updateip_state *state;
627         struct ctdb_interface *old = vnn->iface;
628         const char *new_name;
629
630         if (vnn->update_in_flight) {
631                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
632                                     "update for this IP already in flight\n",
633                                     ctdb_addr_to_str(&vnn->public_address),
634                                     vnn->public_netmask_bits));
635                 return -1;
636         }
637
638         ctdb_vnn_unassign_iface(ctdb, vnn);
639         ret = ctdb_vnn_assign_iface(ctdb, vnn);
640         if (ret != 0) {
641                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
642                                  "assin a usable interface (old iface '%s')\n",
643                                  ctdb_addr_to_str(&vnn->public_address),
644                                  vnn->public_netmask_bits,
645                                  old->name));
646                 return -1;
647         }
648
649         new_name = ctdb_vnn_iface_string(vnn);
650         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
651                 /* A benign update from one interface onto itself.
652                  * no need to run the eventscripts in this case, just return
653                  * success.
654                  */
655                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
656                 return 0;
657         }
658
659         state = talloc(vnn, struct ctdb_do_updateip_state);
660         CTDB_NO_MEMORY(ctdb, state);
661
662         state->c = talloc_steal(ctdb, c);
663         state->old = old;
664         state->vnn = vnn;
665
666         vnn->update_in_flight = true;
667         talloc_set_destructor(state, ctdb_updateip_destructor);
668
669         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
670                             "interface %s to %s\n",
671                             ctdb_addr_to_str(&vnn->public_address),
672                             vnn->public_netmask_bits,
673                             old->name,
674                             new_name));
675
676         ret = ctdb_event_script_callback(ctdb,
677                                          state,
678                                          ctdb_do_updateip_callback,
679                                          state,
680                                          CTDB_EVENT_UPDATE_IP,
681                                          "%s %s %s %u",
682                                          state->old->name,
683                                          new_name,
684                                          ctdb_addr_to_str(&vnn->public_address),
685                                          vnn->public_netmask_bits);
686         if (ret != 0) {
687                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
688                                  ctdb_addr_to_str(&vnn->public_address),
689                                  old->name, new_name));
690                 talloc_free(state);
691                 return -1;
692         }
693
694         return 0;
695 }
696
697 /*
698   Find the vnn of the node that has a public ip address
699   returns -1 if the address is not known as a public address
700  */
701 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
702 {
703         struct ctdb_vnn *vnn;
704
705         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
706                 if (ctdb_same_ip(&vnn->public_address, addr)) {
707                         return vnn;
708                 }
709         }
710
711         return NULL;
712 }
713
714 /*
715   take over an ip address
716  */
717 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
718                                  struct ctdb_req_control_old *c,
719                                  TDB_DATA indata,
720                                  bool *async_reply)
721 {
722         int ret;
723         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
724         struct ctdb_vnn *vnn;
725         bool have_ip = false;
726         bool do_updateip = false;
727         bool do_takeip = false;
728         struct ctdb_interface *best_iface = NULL;
729
730         if (pip->pnn != ctdb->pnn) {
731                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
732                                  "with pnn %d, but we're node %d\n",
733                                  ctdb_addr_to_str(&pip->addr),
734                                  pip->pnn, ctdb->pnn));
735                 return -1;
736         }
737
738         /* update out vnn list */
739         vnn = find_public_ip_vnn(ctdb, &pip->addr);
740         if (vnn == NULL) {
741                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
742                         ctdb_addr_to_str(&pip->addr)));
743                 return 0;
744         }
745
746         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
747                 have_ip = ctdb_sys_have_ip(&pip->addr);
748         }
749         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
750         if (best_iface == NULL) {
751                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
752                                  "a usable interface (old %s, have_ip %d)\n",
753                                  ctdb_addr_to_str(&vnn->public_address),
754                                  vnn->public_netmask_bits,
755                                  ctdb_vnn_iface_string(vnn),
756                                  have_ip));
757                 return -1;
758         }
759
760         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
761                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
762                 have_ip = false;
763         }
764
765
766         if (vnn->iface == NULL && have_ip) {
767                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
768                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
769                                  ctdb_addr_to_str(&vnn->public_address)));
770                 return 0;
771         }
772
773         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
774                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
775                                   "and we have it on iface[%s], but it was assigned to node %d"
776                                   "and we are node %d, banning ourself\n",
777                                  ctdb_addr_to_str(&vnn->public_address),
778                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
779                 ctdb_ban_self(ctdb);
780                 return -1;
781         }
782
783         if (vnn->pnn == -1 && have_ip) {
784                 vnn->pnn = ctdb->pnn;
785                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
786                                   "and we already have it on iface[%s], update local daemon\n",
787                                  ctdb_addr_to_str(&vnn->public_address),
788                                   ctdb_vnn_iface_string(vnn)));
789                 return 0;
790         }
791
792         if (vnn->iface) {
793                 if (vnn->iface != best_iface) {
794                         if (!vnn->iface->link_up) {
795                                 do_updateip = true;
796                         } else if (vnn->iface->references > (best_iface->references + 1)) {
797                                 /* only move when the rebalance gains something */
798                                         do_updateip = true;
799                         }
800                 }
801         }
802
803         if (!have_ip) {
804                 if (do_updateip) {
805                         ctdb_vnn_unassign_iface(ctdb, vnn);
806                         do_updateip = false;
807                 }
808                 do_takeip = true;
809         }
810
811         if (do_takeip) {
812                 ret = ctdb_do_takeip(ctdb, c, vnn);
813                 if (ret != 0) {
814                         return -1;
815                 }
816         } else if (do_updateip) {
817                 ret = ctdb_do_updateip(ctdb, c, vnn);
818                 if (ret != 0) {
819                         return -1;
820                 }
821         } else {
822                 /*
823                  * The interface is up and the kernel known the ip
824                  * => do nothing
825                  */
826                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
827                         ctdb_addr_to_str(&pip->addr),
828                         vnn->public_netmask_bits,
829                         ctdb_vnn_iface_string(vnn)));
830                 return 0;
831         }
832
833         /* tell ctdb_control.c that we will be replying asynchronously */
834         *async_reply = true;
835
836         return 0;
837 }
838
839 /*
840   kill any clients that are registered with a IP that is being released
841  */
842 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
843 {
844         struct ctdb_client_ip *ip;
845
846         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
847                 ctdb_addr_to_str(addr)));
848
849         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
850                 ctdb_sock_addr tmp_addr;
851
852                 tmp_addr = ip->addr;
853                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
854                         ip->client_id,
855                         ctdb_addr_to_str(&ip->addr)));
856
857                 if (ctdb_same_ip(&tmp_addr, addr)) {
858                         struct ctdb_client *client = reqid_find(ctdb->idr,
859                                                                 ip->client_id,
860                                                                 struct ctdb_client);
861                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
862                                 ip->client_id,
863                                 ctdb_addr_to_str(&ip->addr),
864                                 client->pid));
865
866                         if (client->pid != 0) {
867                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
868                                         (unsigned)client->pid,
869                                         ctdb_addr_to_str(addr),
870                                         ip->client_id));
871                                 kill(client->pid, SIGKILL);
872                         }
873                 }
874         }
875 }
876
/*
 * Remove a public address (VNN) from the daemon.
 *
 * The VNN is unlinked from ctdb->vnn, its interface assignment is
 * dropped, interfaces orphaned by the removal are cleaned up, and the
 * VNN itself is freed.  The vnn pointer is invalid once this returns.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        talloc_free(vnn);
}
884
/*
  called when releaseip event finishes

  private_data is the struct takeover_callback_state set up by
  ctdb_control_release_ip().  On every path that reaches a
  ctdb_request_control_reply() call the state is freed afterwards.
 */
static void release_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        struct takeover_callback_state *state = 
                talloc_get_type(private_data, struct takeover_callback_state);
        TDB_DATA data;

        /* presumably -ETIME means the event script timed out - TODO confirm */
        if (status == -ETIME) {
                ctdb_ban_self(ctdb);
        }

        /* With IP failover enabled and public IP checking on, the
         * release only counts as successful if the address is really
         * gone from this system. */
        if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
                if  (ctdb_sys_have_ip(state->addr)) {
                        DEBUG(DEBUG_ERR,
                              ("IP %s still hosted during release IP callback, failing\n",
                               ctdb_addr_to_str(state->addr)));
                        ctdb_request_control_reply(ctdb, state->c,
                                                   NULL, -1, NULL);
                        talloc_free(state);
                        return;
                }
        }

        /* send a message to all clients of this node telling them
           that the cluster has been reconfigured and they should
           release any sockets on this IP */
        data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
        /* NOTE(review): if this macro returns early on allocation
         * failure, no reply is sent and state is not freed here -
         * confirm against the macro definition. */
        CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
        data.dsize = strlen((char *)data.dptr)+1;

        DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));

        ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);

        /* kill clients that have registered with this IP */
        release_kill_clients(ctdb, state->addr);

        ctdb_vnn_unassign_iface(ctdb, state->vnn);

        /* Process the IP if it has been marked for deletion */
        if (state->vnn->delete_pending) {
                do_delete_ip(ctdb, state->vnn);
                /* The VNN has been freed: clear the pointer so the
                 * state destructor does not touch it. */
                state->vnn = NULL;
        }

        /* the control succeeded */
        ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
        talloc_free(state);
}
937
938 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
939 {
940         if (state->vnn != NULL) {
941                 state->vnn->update_in_flight = false;
942         }
943         return 0;
944 }
945
946 /*
947   release an ip address
948  */
949 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
950                                 struct ctdb_req_control_old *c,
951                                 TDB_DATA indata, 
952                                 bool *async_reply)
953 {
954         int ret;
955         struct takeover_callback_state *state;
956         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
957         struct ctdb_vnn *vnn;
958         char *iface;
959
960         /* update our vnn list */
961         vnn = find_public_ip_vnn(ctdb, &pip->addr);
962         if (vnn == NULL) {
963                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
964                         ctdb_addr_to_str(&pip->addr)));
965                 return 0;
966         }
967         vnn->pnn = pip->pnn;
968
969         /* stop any previous arps */
970         talloc_free(vnn->takeover_ctx);
971         vnn->takeover_ctx = NULL;
972
973         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
974          * lazy multicast to drop an IP from any node that isn't the
975          * intended new node.  The following causes makes ctdbd ignore
976          * a release for any address it doesn't host.
977          */
978         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
979                 if (!ctdb_sys_have_ip(&pip->addr)) {
980                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
981                                 ctdb_addr_to_str(&pip->addr),
982                                 vnn->public_netmask_bits,
983                                 ctdb_vnn_iface_string(vnn)));
984                         ctdb_vnn_unassign_iface(ctdb, vnn);
985                         return 0;
986                 }
987         } else {
988                 if (vnn->iface == NULL) {
989                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
990                                            ctdb_addr_to_str(&pip->addr),
991                                            vnn->public_netmask_bits));
992                         return 0;
993                 }
994         }
995
996         /* There is a potential race between take_ip and us because we
997          * update the VNN via a callback that run when the
998          * eventscripts have been run.  Avoid the race by allowing one
999          * update to be in flight at a time.
1000          */
1001         if (vnn->update_in_flight) {
1002                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1003                                     "update for this IP already in flight\n",
1004                                     ctdb_addr_to_str(&vnn->public_address),
1005                                     vnn->public_netmask_bits));
1006                 return -1;
1007         }
1008
1009         iface = strdup(ctdb_vnn_iface_string(vnn));
1010
1011         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1012                 ctdb_addr_to_str(&pip->addr),
1013                 vnn->public_netmask_bits,
1014                 iface,
1015                 pip->pnn));
1016
1017         state = talloc(ctdb, struct takeover_callback_state);
1018         if (state == NULL) {
1019                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1020                                __FILE__, __LINE__);
1021                 free(iface);
1022                 return -1;
1023         }
1024
1025         state->c = talloc_steal(state, c);
1026         state->addr = talloc(state, ctdb_sock_addr);       
1027         if (state->addr == NULL) {
1028                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1029                                __FILE__, __LINE__);
1030                 free(iface);
1031                 talloc_free(state);
1032                 return -1;
1033         }
1034         *state->addr = pip->addr;
1035         state->vnn   = vnn;
1036
1037         vnn->update_in_flight = true;
1038         talloc_set_destructor(state, ctdb_releaseip_destructor);
1039
1040         ret = ctdb_event_script_callback(ctdb, 
1041                                          state, release_ip_callback, state,
1042                                          CTDB_EVENT_RELEASE_IP,
1043                                          "%s %s %u",
1044                                          iface,
1045                                          ctdb_addr_to_str(&pip->addr),
1046                                          vnn->public_netmask_bits);
1047         free(iface);
1048         if (ret != 0) {
1049                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1050                         ctdb_addr_to_str(&pip->addr),
1051                         ctdb_vnn_iface_string(vnn)));
1052                 talloc_free(state);
1053                 return -1;
1054         }
1055
1056         /* tell the control that we will be reply asynchronously */
1057         *async_reply = true;
1058         return 0;
1059 }
1060
1061 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1062                                    ctdb_sock_addr *addr,
1063                                    unsigned mask, const char *ifaces,
1064                                    bool check_address)
1065 {
1066         struct ctdb_vnn      *vnn;
1067         uint32_t num = 0;
1068         char *tmp;
1069         const char *iface;
1070         int i;
1071         int ret;
1072
1073         tmp = strdup(ifaces);
1074         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1075                 if (!ctdb_sys_check_iface_exists(iface)) {
1076                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1077                         free(tmp);
1078                         return -1;
1079                 }
1080         }
1081         free(tmp);
1082
1083         /* Verify that we don't have an entry for this ip yet */
1084         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1085                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1086                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1087                                 ctdb_addr_to_str(addr)));
1088                         return -1;
1089                 }               
1090         }
1091
1092         /* create a new vnn structure for this ip address */
1093         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1094         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1095         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1096         tmp = talloc_strdup(vnn, ifaces);
1097         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1098         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1099                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1100                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1101                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1102                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1103                 num++;
1104         }
1105         talloc_free(tmp);
1106         vnn->ifaces[num] = NULL;
1107         vnn->public_address      = *addr;
1108         vnn->public_netmask_bits = mask;
1109         vnn->pnn                 = -1;
1110         if (check_address) {
1111                 if (ctdb_sys_have_ip(addr)) {
1112                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1113                         vnn->pnn = ctdb->pnn;
1114                 }
1115         }
1116
1117         for (i=0; vnn->ifaces[i]; i++) {
1118                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1119                 if (ret != 0) {
1120                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1121                                            "for public_address[%s]\n",
1122                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1123                         talloc_free(vnn);
1124                         return -1;
1125                 }
1126         }
1127
1128         DLIST_ADD(ctdb->vnn, vnn);
1129
1130         return 0;
1131 }
1132
1133 /*
1134   setup the public address lists from a file
1135 */
1136 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1137 {
1138         char **lines;
1139         int nlines;
1140         int i;
1141
1142         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1143         if (lines == NULL) {
1144                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1145                 return -1;
1146         }
1147         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1148                 nlines--;
1149         }
1150
1151         for (i=0;i<nlines;i++) {
1152                 unsigned mask;
1153                 ctdb_sock_addr addr;
1154                 const char *addrstr;
1155                 const char *ifaces;
1156                 char *tok, *line;
1157
1158                 line = lines[i];
1159                 while ((*line == ' ') || (*line == '\t')) {
1160                         line++;
1161                 }
1162                 if (*line == '#') {
1163                         continue;
1164                 }
1165                 if (strcmp(line, "") == 0) {
1166                         continue;
1167                 }
1168                 tok = strtok(line, " \t");
1169                 addrstr = tok;
1170                 tok = strtok(NULL, " \t");
1171                 if (tok == NULL) {
1172                         if (NULL == ctdb->default_public_interface) {
1173                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1174                                          i+1));
1175                                 talloc_free(lines);
1176                                 return -1;
1177                         }
1178                         ifaces = ctdb->default_public_interface;
1179                 } else {
1180                         ifaces = tok;
1181                 }
1182
1183                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1184                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1185                         talloc_free(lines);
1186                         return -1;
1187                 }
1188                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1189                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1190                         talloc_free(lines);
1191                         return -1;
1192                 }
1193         }
1194
1195
1196         talloc_free(lines);
1197         return 0;
1198 }
1199
1200 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1201                               const char *iface,
1202                               const char *ip)
1203 {
1204         struct ctdb_vnn *svnn;
1205         struct ctdb_interface *cur = NULL;
1206         bool ok;
1207         int ret;
1208
1209         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1210         CTDB_NO_MEMORY(ctdb, svnn);
1211
1212         svnn->ifaces = talloc_array(svnn, const char *, 2);
1213         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1214         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1215         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1216         svnn->ifaces[1] = NULL;
1217
1218         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1219         if (!ok) {
1220                 talloc_free(svnn);
1221                 return -1;
1222         }
1223
1224         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1225         if (ret != 0) {
1226                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1227                                    "for single_ip[%s]\n",
1228                                    svnn->ifaces[0],
1229                                    ctdb_addr_to_str(&svnn->public_address)));
1230                 talloc_free(svnn);
1231                 return -1;
1232         }
1233
1234         /* assume the single public ip interface is initially "good" */
1235         cur = ctdb_find_iface(ctdb, iface);
1236         if (cur == NULL) {
1237                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1238                 return -1;
1239         }
1240         cur->link_up = true;
1241
1242         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1243         if (ret != 0) {
1244                 talloc_free(svnn);
1245                 return -1;
1246         }
1247
1248         ctdb->single_ip_vnn = svnn;
1249         return 0;
1250 }
1251
/* Singly linked list entry describing one public IP address and the
 * node it is assigned to during IP allocation. */
struct public_ip_list {
        struct public_ip_list *next;    /* next entry, NULL at end of list */
        uint32_t pnn;                   /* assigned node; code in this file
                                         * stores and tests -1 here to mean
                                         * "unassigned" */
        ctdb_sock_addr addr;            /* the public address */
};
1257
1258 /* Given a physical node, return the number of
1259    public addresses that is currently assigned to this node.
1260 */
1261 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1262 {
1263         int num=0;
1264
1265         for (;ips;ips=ips->next) {
1266                 if (ips->pnn == pnn) {
1267                         num++;
1268                 }
1269         }
1270         return num;
1271 }
1272
1273
1274 /* Can the given node host the given IP: is the public IP known to the
1275  * node and is NOIPHOST unset?
1276 */
1277 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1278                              struct ctdb_ipflags ipflags,
1279                              struct public_ip_list *ip)
1280 {
1281         struct ctdb_public_ip_list_old *public_ips;
1282         int i;
1283
1284         if (ipflags.noiphost) {
1285                 return false;
1286         }
1287
1288         public_ips = ctdb->ipalloc_state->available_public_ips[pnn];
1289
1290         if (public_ips == NULL) {
1291                 return false;
1292         }
1293
1294         for (i=0; i<public_ips->num; i++) {
1295                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1296                         /* yes, this node can serve this public ip */
1297                         return true;
1298                 }
1299         }
1300
1301         return false;
1302 }
1303
1304 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1305                                  struct ctdb_ipflags ipflags,
1306                                  struct public_ip_list *ip)
1307 {
1308         if (ipflags.noiptakeover) {
1309                 return false;
1310         }
1311
1312         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1313 }
1314
1315 /* search the node lists list for a node to takeover this ip.
1316    pick the node that currently are serving the least number of ips
1317    so that the ips get spread out evenly.
1318 */
1319 static int find_takeover_node(struct ctdb_context *ctdb,
1320                               struct ctdb_ipflags *ipflags,
1321                               struct public_ip_list *ip,
1322                               struct public_ip_list *all_ips)
1323 {
1324         int pnn, min=0, num;
1325         int i, numnodes;
1326
1327         numnodes = talloc_array_length(ipflags);
1328         pnn    = -1;
1329         for (i=0; i<numnodes; i++) {
1330                 /* verify that this node can serve this ip */
1331                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1332                         /* no it couldnt   so skip to the next node */
1333                         continue;
1334                 }
1335
1336                 num = node_ip_coverage(i, all_ips);
1337                 /* was this the first node we checked ? */
1338                 if (pnn == -1) {
1339                         pnn = i;
1340                         min  = num;
1341                 } else {
1342                         if (num < min) {
1343                                 pnn = i;
1344                                 min  = num;
1345                         }
1346                 }
1347         }       
1348         if (pnn == -1) {
1349                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1350                         ctdb_addr_to_str(&ip->addr)));
1351
1352                 return -1;
1353         }
1354
1355         ip->pnn = pnn;
1356         return 0;
1357 }
1358
1359 #define IP_KEYLEN       4
1360 static uint32_t *ip_key(ctdb_sock_addr *ip)
1361 {
1362         static uint32_t key[IP_KEYLEN];
1363
1364         bzero(key, sizeof(key));
1365
1366         switch (ip->sa.sa_family) {
1367         case AF_INET:
1368                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1369                 break;
1370         case AF_INET6: {
1371                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1372                 key[0]  = htonl(s6_a32[0]);
1373                 key[1]  = htonl(s6_a32[1]);
1374                 key[2]  = htonl(s6_a32[2]);
1375                 key[3]  = htonl(s6_a32[3]);
1376                 break;
1377         }
1378         default:
1379                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1380                 return key;
1381         }
1382
1383         return key;
1384 }
1385
1386 static void *add_ip_callback(void *parm, void *data)
1387 {
1388         struct public_ip_list *this_ip = parm;
1389         struct public_ip_list *prev_ip = data;
1390
1391         if (prev_ip == NULL) {
1392                 return parm;
1393         }
1394         if (this_ip->pnn == -1) {
1395                 this_ip->pnn = prev_ip->pnn;
1396         }
1397
1398         return parm;
1399 }
1400
1401 static int getips_count_callback(void *param, void *data)
1402 {
1403         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1404         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1405
1406         new_ip->next = *ip_list;
1407         *ip_list     = new_ip;
1408         return 0;
1409 }
1410
/* Forward declaration: called from ctdb_reload_remote_public_ips()
 * below with the known public IP list of node pnn. */
static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
                                       struct ctdb_public_ip_list_old *ips,
                                       uint32_t pnn);
1414
1415 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1416                                          struct ipalloc_state *ipalloc_state,
1417                                          struct ctdb_node_map_old *nodemap)
1418 {
1419         int j;
1420         int ret;
1421
1422         if (ipalloc_state->num != nodemap->num) {
1423                 DEBUG(DEBUG_ERR,
1424                       (__location__
1425                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1426                        ipalloc_state->num, nodemap->num));
1427                 return -1;
1428         }
1429
1430         for (j=0; j<nodemap->num; j++) {
1431                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1432                         continue;
1433                 }
1434
1435                 /* Retrieve the list of known public IPs from the node */
1436                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1437                                         TAKEOVER_TIMEOUT(),
1438                                         j,
1439                                         ctdb->nodes,
1440                                         0,
1441                                         &ipalloc_state->known_public_ips[j]);
1442                 if (ret != 0) {
1443                         DEBUG(DEBUG_ERR,
1444                               ("Failed to read known public IPs from node: %u\n",
1445                                j));
1446                         return -1;
1447                 }
1448
1449                 if (ctdb->do_checkpublicip) {
1450                         verify_remote_ip_allocation(ctdb,
1451                                                     ipalloc_state->known_public_ips[j],
1452                                                     j);
1453                 }
1454
1455                 /* Retrieve the list of available public IPs from the node */
1456                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1457                                         TAKEOVER_TIMEOUT(),
1458                                         j,
1459                                         ctdb->nodes,
1460                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1461                                         &ipalloc_state->available_public_ips[j]);
1462                 if (ret != 0) {
1463                         DEBUG(DEBUG_ERR,
1464                               ("Failed to read available public IPs from node: %u\n",
1465                                j));
1466                         return -1;
1467                 }
1468         }
1469
1470         return 0;
1471 }
1472
1473 static struct public_ip_list *
1474 create_merged_ip_list(struct ctdb_context *ctdb)
1475 {
1476         int i, j;
1477         struct public_ip_list *ip_list;
1478         struct ctdb_public_ip_list_old *public_ips;
1479
1480         if (ctdb->ip_tree != NULL) {
1481                 talloc_free(ctdb->ip_tree);
1482                 ctdb->ip_tree = NULL;
1483         }
1484         ctdb->ip_tree = trbt_create(ctdb, 0);
1485
1486         for (i=0;i<ctdb->num_nodes;i++) {
1487                 public_ips = ctdb->ipalloc_state->known_public_ips[i];
1488
1489                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1490                         continue;
1491                 }
1492
1493                 /* there were no public ips for this node */
1494                 if (public_ips == NULL) {
1495                         continue;
1496                 }               
1497
1498                 for (j=0;j<public_ips->num;j++) {
1499                         struct public_ip_list *tmp_ip;
1500
1501                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1502                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503                         /* Do not use information about IP addresses hosted
1504                          * on other nodes, it may not be accurate */
1505                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1507                         } else {
1508                                 tmp_ip->pnn = -1;
1509                         }
1510                         tmp_ip->addr = public_ips->ips[j].addr;
1511                         tmp_ip->next = NULL;
1512
1513                         trbt_insertarray32_callback(ctdb->ip_tree,
1514                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1515                                 add_ip_callback,
1516                                 tmp_ip);
1517                 }
1518         }
1519
1520         ip_list = NULL;
1521         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1522
1523         return ip_list;
1524 }
1525
1526 /* 
1527  * This is the length of the longtest common prefix between the IPs.
1528  * It is calculated by XOR-ing the 2 IPs together and counting the
1529  * number of leading zeroes.  The implementation means that all
1530  * addresses end up being 128 bits long.
1531  *
1532  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534  * lots of nodes and IP addresses?
1535  */
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1537 {
1538         uint32_t ip1_k[IP_KEYLEN];
1539         uint32_t *t;
1540         int i;
1541         uint32_t x;
1542
1543         uint32_t distance = 0;
1544
1545         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1546         t = ip_key(ip2);
1547         for (i=0; i<IP_KEYLEN; i++) {
1548                 x = ip1_k[i] ^ t[i];
1549                 if (x == 0) {
1550                         distance += 32;
1551                 } else {
1552                         /* Count number of leading zeroes. 
1553                          * FIXME? This could be optimised...
1554                          */
1555                         while ((x & (1 << 31)) == 0) {
1556                                 x <<= 1;
1557                                 distance += 1;
1558                         }
1559                 }
1560         }
1561
1562         return distance;
1563 }
1564
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566    given node.  The ips argument is generally the all_ips variable
1567    used in the main part of the algorithm.
1568  */
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570                                   struct public_ip_list *ips,
1571                                   int pnn)
1572 {
1573         struct public_ip_list *t;
1574         uint32_t d;
1575
1576         uint32_t sum = 0;
1577
1578         for (t=ips; t != NULL; t=t->next) {
1579                 if (t->pnn != pnn) {
1580                         continue;
1581                 }
1582
1583                 /* Optimisation: We never calculate the distance
1584                  * between an address and itself.  This allows us to
1585                  * calculate the effect of removing an address from a
1586                  * node by simply calculating the distance between
1587                  * that address and all of the exitsing addresses.
1588                  * Moreover, we assume that we're only ever dealing
1589                  * with addresses from all_ips so we can identify an
1590                  * address via a pointer rather than doing a more
1591                  * expensive address comparison. */
1592                 if (&(t->addr) == ip) {
1593                         continue;
1594                 }
1595
1596                 d = ip_distance(ip, &(t->addr));
1597                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1598         }
1599
1600         return sum;
1601 }
1602
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
1604    to the given node.
1605  */
1606 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1607 {
1608         struct public_ip_list *t;
1609
1610         uint32_t imbalance = 0;
1611
1612         for (t=all_ips; t!=NULL; t=t->next) {
1613                 if (t->pnn != pnn) {
1614                         continue;
1615                 }
1616                 /* Pass the rest of the IPs rather than the whole
1617                    all_ips input list.
1618                 */
1619                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1620         }
1621
1622         return imbalance;
1623 }
1624
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626  * finding the best node for each.
1627  */
1628 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1629                                       struct ctdb_ipflags *ipflags,
1630                                       struct public_ip_list *all_ips)
1631 {
1632         struct public_ip_list *tmp_ip;
1633
1634         /* loop over all ip's and find a physical node to cover for 
1635            each unassigned ip.
1636         */
1637         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638                 if (tmp_ip->pnn == -1) {
1639                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1640                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1641                                         ctdb_addr_to_str(&tmp_ip->addr)));
1642                         }
1643                 }
1644         }
1645 }
1646
1647 /* Basic non-deterministic rebalancing algorithm.
1648  */
1649 static void basic_failback(struct ctdb_context *ctdb,
1650                            struct ctdb_ipflags *ipflags,
1651                            struct public_ip_list *all_ips,
1652                            int num_ips)
1653 {
1654         int i, numnodes;
1655         int maxnode, maxnum, minnode, minnum, num, retries;
1656         struct public_ip_list *tmp_ip;
1657
1658         numnodes = talloc_array_length(ipflags);
1659         retries = 0;
1660
1661 try_again:
1662         maxnum=0;
1663         minnum=0;
1664
1665         /* for each ip address, loop over all nodes that can serve
1666            this ip and make sure that the difference between the node
1667            serving the most and the node serving the least ip's are
1668            not greater than 1.
1669         */
1670         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1671                 if (tmp_ip->pnn == -1) {
1672                         continue;
1673                 }
1674
1675                 /* Get the highest and lowest number of ips's served by any 
1676                    valid node which can serve this ip.
1677                 */
1678                 maxnode = -1;
1679                 minnode = -1;
1680                 for (i=0; i<numnodes; i++) {
1681                         /* only check nodes that can actually serve this ip */
1682                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1683                                 /* no it couldnt   so skip to the next node */
1684                                 continue;
1685                         }
1686
1687                         num = node_ip_coverage(i, all_ips);
1688                         if (maxnode == -1) {
1689                                 maxnode = i;
1690                                 maxnum  = num;
1691                         } else {
1692                                 if (num > maxnum) {
1693                                         maxnode = i;
1694                                         maxnum  = num;
1695                                 }
1696                         }
1697                         if (minnode == -1) {
1698                                 minnode = i;
1699                                 minnum  = num;
1700                         } else {
1701                                 if (num < minnum) {
1702                                         minnode = i;
1703                                         minnum  = num;
1704                                 }
1705                         }
1706                 }
1707                 if (maxnode == -1) {
1708                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1709                                 ctdb_addr_to_str(&tmp_ip->addr)));
1710
1711                         continue;
1712                 }
1713
1714                 /* if the spread between the smallest and largest coverage by
1715                    a node is >=2 we steal one of the ips from the node with
1716                    most coverage to even things out a bit.
1717                    try to do this a limited number of times since we dont
1718                    want to spend too much time balancing the ip coverage.
1719                 */
1720                 if ( (maxnum > minnum+1)
1721                      && (retries < (num_ips + 5)) ){
1722                         struct public_ip_list *tmp;
1723
1724                         /* Reassign one of maxnode's VNNs */
1725                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1726                                 if (tmp->pnn == maxnode) {
1727                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1728                                         retries++;
1729                                         goto try_again;;
1730                                 }
1731                         }
1732                 }
1733         }
1734 }
1735
1736 static void lcp2_init(struct ctdb_context *tmp_ctx,
1737                       struct ctdb_ipflags *ipflags,
1738                       struct public_ip_list *all_ips,
1739                       uint32_t *force_rebalance_nodes,
1740                       uint32_t **lcp2_imbalances,
1741                       bool **rebalance_candidates)
1742 {
1743         int i, numnodes;
1744         struct public_ip_list *tmp_ip;
1745
1746         numnodes = talloc_array_length(ipflags);
1747
1748         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1749         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1750         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1751         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752
1753         for (i=0; i<numnodes; i++) {
1754                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1755                 /* First step: assume all nodes are candidates */
1756                 (*rebalance_candidates)[i] = true;
1757         }
1758
1759         /* 2nd step: if a node has IPs assigned then it must have been
1760          * healthy before, so we remove it from consideration.  This
1761          * is overkill but is all we have because we don't maintain
1762          * state between takeover runs.  An alternative would be to
1763          * keep state and invalidate it every time the recovery master
1764          * changes.
1765          */
1766         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1767                 if (tmp_ip->pnn != -1) {
1768                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1769                 }
1770         }
1771
1772         /* 3rd step: if a node is forced to re-balance then
1773            we allow failback onto the node */
1774         if (force_rebalance_nodes == NULL) {
1775                 return;
1776         }
1777         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1778                 uint32_t pnn = force_rebalance_nodes[i];
1779                 if (pnn >= numnodes) {
1780                         DEBUG(DEBUG_ERR,
1781                               (__location__ "unknown node %u\n", pnn));
1782                         continue;
1783                 }
1784
1785                 DEBUG(DEBUG_NOTICE,
1786                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1787                 (*rebalance_candidates)[pnn] = true;
1788         }
1789 }
1790
1791 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1792  * the IP/node combination that will cost the least.
1793  */
1794 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1795                                      struct ctdb_ipflags *ipflags,
1796                                      struct public_ip_list *all_ips,
1797                                      uint32_t *lcp2_imbalances)
1798 {
1799         struct public_ip_list *tmp_ip;
1800         int dstnode, numnodes;
1801
1802         int minnode;
1803         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1804         struct public_ip_list *minip;
1805
1806         bool should_loop = true;
1807         bool have_unassigned = true;
1808
1809         numnodes = talloc_array_length(ipflags);
1810
1811         while (have_unassigned && should_loop) {
1812                 should_loop = false;
1813
1814                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1815                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1816
1817                 minnode = -1;
1818                 mindsum = 0;
1819                 minip = NULL;
1820
1821                 /* loop over each unassigned ip. */
1822                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1823                         if (tmp_ip->pnn != -1) {
1824                                 continue;
1825                         }
1826
1827                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1828                                 /* only check nodes that can actually takeover this ip */
1829                                 if (!can_node_takeover_ip(ctdb, dstnode,
1830                                                           ipflags[dstnode],
1831                                                           tmp_ip)) {
1832                                         /* no it couldnt   so skip to the next node */
1833                                         continue;
1834                                 }
1835
1836                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1837                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1838                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1839                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1840                                                    dstnode,
1841                                                    dstimbl - lcp2_imbalances[dstnode]));
1842
1843
1844                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1845                                         minnode = dstnode;
1846                                         minimbl = dstimbl;
1847                                         mindsum = dstdsum;
1848                                         minip = tmp_ip;
1849                                         should_loop = true;
1850                                 }
1851                         }
1852                 }
1853
1854                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1855
1856                 /* If we found one then assign it to the given node. */
1857                 if (minnode != -1) {
1858                         minip->pnn = minnode;
1859                         lcp2_imbalances[minnode] = minimbl;
1860                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1861                                           ctdb_addr_to_str(&(minip->addr)),
1862                                           minnode,
1863                                           mindsum));
1864                 }
1865
1866                 /* There might be a better way but at least this is clear. */
1867                 have_unassigned = false;
1868                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1869                         if (tmp_ip->pnn == -1) {
1870                                 have_unassigned = true;
1871                         }
1872                 }
1873         }
1874
1875         /* We know if we have an unassigned addresses so we might as
1876          * well optimise.
1877          */
1878         if (have_unassigned) {
1879                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1880                         if (tmp_ip->pnn == -1) {
1881                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1882                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1883                         }
1884                 }
1885         }
1886 }
1887
1888 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1889  * to move IPs from, determines the best IP/destination node
1890  * combination to move from the source node.
1891  */
1892 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1893                                     struct ctdb_ipflags *ipflags,
1894                                     struct public_ip_list *all_ips,
1895                                     int srcnode,
1896                                     uint32_t *lcp2_imbalances,
1897                                     bool *rebalance_candidates)
1898 {
1899         int dstnode, mindstnode, numnodes;
1900         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1901         uint32_t minsrcimbl, mindstimbl;
1902         struct public_ip_list *minip;
1903         struct public_ip_list *tmp_ip;
1904
1905         /* Find an IP and destination node that best reduces imbalance. */
1906         srcimbl = 0;
1907         minip = NULL;
1908         minsrcimbl = 0;
1909         mindstnode = -1;
1910         mindstimbl = 0;
1911
1912         numnodes = talloc_array_length(ipflags);
1913
1914         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1915         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1916                            srcnode, lcp2_imbalances[srcnode]));
1917
1918         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1919                 /* Only consider addresses on srcnode. */
1920                 if (tmp_ip->pnn != srcnode) {
1921                         continue;
1922                 }
1923
1924                 /* What is this IP address costing the source node? */
1925                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1926                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1927
1928                 /* Consider this IP address would cost each potential
1929                  * destination node.  Destination nodes are limited to
1930                  * those that are newly healthy, since we don't want
1931                  * to do gratuitous failover of IPs just to make minor
1932                  * balance improvements.
1933                  */
1934                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1935                         if (!rebalance_candidates[dstnode]) {
1936                                 continue;
1937                         }
1938
1939                         /* only check nodes that can actually takeover this ip */
1940                         if (!can_node_takeover_ip(ctdb, dstnode,
1941                                                   ipflags[dstnode], tmp_ip)) {
1942                                 /* no it couldnt   so skip to the next node */
1943                                 continue;
1944                         }
1945
1946                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1947                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1948                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1949                                            srcnode, -srcdsum,
1950                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1951                                            dstnode, dstdsum));
1952
1953                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1954                             (dstdsum < srcdsum) &&                      \
1955                             ((mindstnode == -1) ||                              \
1956                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1957
1958                                 minip = tmp_ip;
1959                                 minsrcimbl = srcimbl;
1960                                 mindstnode = dstnode;
1961                                 mindstimbl = dstimbl;
1962                         }
1963                 }
1964         }
1965         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1966
1967         if (mindstnode != -1) {
1968                 /* We found a move that makes things better... */
1969                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1970                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1971                                   ctdb_addr_to_str(&(minip->addr)),
1972                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1973
1974
1975                 lcp2_imbalances[srcnode] = minsrcimbl;
1976                 lcp2_imbalances[mindstnode] = mindstimbl;
1977                 minip->pnn = mindstnode;
1978
1979                 return true;
1980         }
1981
1982         return false;
1983         
1984 }
1985
/* An LCP2 imbalance value together with the node it belongs to, so
 * that nodes can be sorted by imbalance.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.  Orders entries
 * by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.  The pnn field
 * is not compared.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	return (lhs->imbalance < rhs->imbalance) -
	       (lhs->imbalance > rhs->imbalance);
}
2004
2005 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2006  * node with the highest LCP2 imbalance, and then determines the best
2007  * IP/destination node combination to move from the source node.
2008  */
2009 static void lcp2_failback(struct ctdb_context *ctdb,
2010                           struct ctdb_ipflags *ipflags,
2011                           struct public_ip_list *all_ips,
2012                           uint32_t *lcp2_imbalances,
2013                           bool *rebalance_candidates)
2014 {
2015         int i, numnodes;
2016         struct lcp2_imbalance_pnn * lips;
2017         bool again;
2018
2019         numnodes = talloc_array_length(ipflags);
2020
2021 try_again:
2022         /* Put the imbalances and nodes into an array, sort them and
2023          * iterate through candidates.  Usually the 1st one will be
2024          * used, so this doesn't cost much...
2025          */
2026         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2027         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2028         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2029         for (i=0; i<numnodes; i++) {
2030                 lips[i].imbalance = lcp2_imbalances[i];
2031                 lips[i].pnn = i;
2032                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2033         }
2034         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2035               lcp2_cmp_imbalance_pnn);
2036
2037         again = false;
2038         for (i=0; i<numnodes; i++) {
2039                 /* This means that all nodes had 0 or 1 addresses, so
2040                  * can't be imbalanced.
2041                  */
2042                 if (lips[i].imbalance == 0) {
2043                         break;
2044                 }
2045
2046                 if (lcp2_failback_candidate(ctdb,
2047                                             ipflags,
2048                                             all_ips,
2049                                             lips[i].pnn,
2050                                             lcp2_imbalances,
2051                                             rebalance_candidates)) {
2052                         again = true;
2053                         break;
2054                 }
2055         }
2056
2057         talloc_free(lips);
2058         if (again) {
2059                 goto try_again;
2060         }
2061 }
2062
2063 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2064                                     struct ctdb_ipflags *ipflags,
2065                                     struct public_ip_list *all_ips)
2066 {
2067         struct public_ip_list *tmp_ip;
2068
2069         /* verify that the assigned nodes can serve that public ip
2070            and set it to -1 if not
2071         */
2072         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2073                 if (tmp_ip->pnn == -1) {
2074                         continue;
2075                 }
2076                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2077                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2078                         /* this node can not serve this ip. */
2079                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2080                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2081                                            tmp_ip->pnn));
2082                         tmp_ip->pnn = -1;
2083                 }
2084         }
2085 }
2086
2087 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2088                                        struct ctdb_ipflags *ipflags,
2089                                        struct public_ip_list *all_ips)
2090 {
2091         struct public_ip_list *tmp_ip;
2092         int i, numnodes;
2093
2094         numnodes = talloc_array_length(ipflags);
2095
2096         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2097        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2098         *  always be allocated the same way for a specific set of
2099         *  available/unavailable nodes.
2100         */
2101
2102         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2103                 tmp_ip->pnn = i % numnodes;
2104         }
2105
2106         /* IP failback doesn't make sense with deterministic
2107          * IPs, since the modulo step above implicitly fails
2108          * back IPs to their "home" node.
2109          */
2110         if (1 == ctdb->tunable.no_ip_failback) {
2111                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2112         }
2113
2114         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2115
2116         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2117
2118         /* No failback here! */
2119 }
2120
2121 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2122                                           struct ctdb_ipflags *ipflags,
2123                                           struct public_ip_list *all_ips)
2124 {
2125         /* This should be pushed down into basic_failback. */
2126         struct public_ip_list *tmp_ip;
2127         int num_ips = 0;
2128         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2129                 num_ips++;
2130         }
2131
2132         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2133
2134         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2135
2136         /* If we don't want IPs to fail back then don't rebalance IPs. */
2137         if (1 == ctdb->tunable.no_ip_failback) {
2138                 return;
2139         }
2140
2141         /* Now, try to make sure the ip adresses are evenly distributed
2142            across the nodes.
2143         */
2144         basic_failback(ctdb, ipflags, all_ips, num_ips);
2145 }
2146
2147 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2148                           struct ctdb_ipflags *ipflags,
2149                           struct public_ip_list *all_ips,
2150                           uint32_t *force_rebalance_nodes)
2151 {
2152         uint32_t *lcp2_imbalances;
2153         bool *rebalance_candidates;
2154         int numnodes, num_rebalance_candidates, i;
2155
2156         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2157
2158         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2159
2160         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2161                   &lcp2_imbalances, &rebalance_candidates);
2162
2163         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2164
2165         /* If we don't want IPs to fail back then don't rebalance IPs. */
2166         if (1 == ctdb->tunable.no_ip_failback) {
2167                 goto finished;
2168         }
2169
2170         /* It is only worth continuing if we have suitable target
2171          * nodes to transfer IPs to.  This check is much cheaper than
2172          * continuing on...
2173          */
2174         numnodes = talloc_array_length(ipflags);
2175         num_rebalance_candidates = 0;
2176         for (i=0; i<numnodes; i++) {
2177                 if (rebalance_candidates[i]) {
2178                         num_rebalance_candidates++;
2179                 }
2180         }
2181         if (num_rebalance_candidates == 0) {
2182                 goto finished;
2183         }
2184
2185         /* Now, try to make sure the ip adresses are evenly distributed
2186            across the nodes.
2187         */
2188         lcp2_failback(ctdb, ipflags, all_ips,
2189                       lcp2_imbalances, rebalance_candidates);
2190
2191 finished:
2192         talloc_free(tmp_ctx);
2193 }
2194
2195 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2196 {
2197         int i;
2198
2199         for (i=0;i<nodemap->num;i++) {
2200                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2201                         /* Found one completely healthy node */
2202                         return false;
2203                 }
2204         }
2205
2206         return true;
2207 }
2208
2209 /* The calculation part of the IP allocation algorithm. */
2210 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2211                                    struct ctdb_ipflags *ipflags,
2212                                    struct public_ip_list **all_ips_p,
2213                                    uint32_t *force_rebalance_nodes)
2214 {
2215         /* since nodes only know about those public addresses that
2216            can be served by that particular node, no single node has
2217            a full list of all public addresses that exist in the cluster.
2218            Walk over all node structures and create a merged list of
2219            all public addresses that exist in the cluster.
2220
2221            keep the tree of ips around as ctdb->ip_tree
2222         */
2223         *all_ips_p = create_merged_ip_list(ctdb);
2224
2225         switch (ctdb->ipalloc_state->algorithm) {
2226         case IPALLOC_LCP2:
2227                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2228                 break;
2229         case IPALLOC_DETERMINISTIC:
2230                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2231                 break;
2232         case IPALLOC_NONDETERMINISTIC:
2233                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2234                break;
2235         }
2236
2237         /* at this point ->pnn is the node which will own each IP
2238            or -1 if there is no node that can cover this ip
2239         */
2240
2241         return;
2242 }
2243
2244 struct get_tunable_callback_data {
2245         const char *tunable;
2246         uint32_t *out;
2247         bool fatal;
2248 };
2249
2250 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2251                                  int32_t res, TDB_DATA outdata,
2252                                  void *callback)
2253 {
2254         struct get_tunable_callback_data *cd =
2255                 (struct get_tunable_callback_data *)callback;
2256         int size;
2257
2258         if (res != 0) {
2259                 /* Already handled in fail callback */
2260                 return;
2261         }
2262
2263         if (outdata.dsize != sizeof(uint32_t)) {
2264                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2265                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2266                                  (int)outdata.dsize));
2267                 cd->fatal = true;
2268                 return;
2269         }
2270
2271         size = talloc_array_length(cd->out);
2272         if (pnn >= size) {
2273                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2274                                  cd->tunable, pnn, size));
2275                 return;
2276         }
2277
2278                 
2279         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2280 }
2281
2282 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2283                                        int32_t res, TDB_DATA outdata,
2284                                        void *callback)
2285 {
2286         struct get_tunable_callback_data *cd =
2287                 (struct get_tunable_callback_data *)callback;
2288
2289         switch (res) {
2290         case -ETIME:
2291                 DEBUG(DEBUG_ERR,
2292                       ("Timed out getting tunable \"%s\" from node %d\n",
2293                        cd->tunable, pnn));
2294                 cd->fatal = true;
2295                 break;
2296         case -EINVAL:
2297         case -1:
2298                 DEBUG(DEBUG_WARNING,
2299                       ("Tunable \"%s\" not implemented on node %d\n",
2300                        cd->tunable, pnn));
2301                 break;
2302         default:
2303                 DEBUG(DEBUG_ERR,
2304                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2305                        cd->tunable, pnn));
2306                 cd->fatal = true;
2307         }
2308 }
2309
2310 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2311                                         TALLOC_CTX *tmp_ctx,
2312                                         struct ctdb_node_map_old *nodemap,
2313                                         const char *tunable,
2314                                         uint32_t default_value)
2315 {
2316         TDB_DATA data;
2317         struct ctdb_control_get_tunable *t;
2318         uint32_t *nodes;
2319         uint32_t *tvals;
2320         struct get_tunable_callback_data callback_data;
2321         int i;
2322
2323         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2324         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2325         for (i=0; i<nodemap->num; i++) {
2326                 tvals[i] = default_value;
2327         }
2328                 
2329         callback_data.out = tvals;
2330         callback_data.tunable = tunable;
2331         callback_data.fatal = false;
2332
2333         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2334         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2335         t = (struct ctdb_control_get_tunable *)data.dptr;
2336         t->length = strlen(tunable)+1;
2337         memcpy(t->name, tunable, t->length);
2338         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2339         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2340                                       nodes, 0, TAKEOVER_TIMEOUT(),
2341                                       false, data,
2342                                       get_tunable_callback,
2343                                       get_tunable_fail_callback,
2344                                       &callback_data) != 0) {
2345                 if (callback_data.fatal) {
2346                         talloc_free(tvals);
2347                         tvals = NULL;
2348                 }
2349         }
2350         talloc_free(nodes);
2351         talloc_free(data.dptr);
2352
2353         return tvals;
2354 }
2355
2356 /* Set internal flags for IP allocation:
2357  *   Clear ip flags
2358  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2359  *   Set NOIPHOST ip flag for each INACTIVE node
2360  *   if all nodes are disabled:
2361  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2362  *   else
2363  *     Set NOIPHOST ip flags for disabled nodes
2364  */
2365 static struct ctdb_ipflags *
2366 set_ipflags_internal(struct ctdb_context *ctdb,
2367                      TALLOC_CTX *tmp_ctx,
2368                      struct ctdb_node_map_old *nodemap,
2369                      uint32_t *tval_noiptakeover,
2370                      uint32_t *tval_noiphostonalldisabled)
2371 {
2372         int i;
2373         struct ctdb_ipflags *ipflags;
2374
2375         /* Clear IP flags - implicit due to talloc_zero */
2376         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2377         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2378
2379         for (i=0;i<nodemap->num;i++) {
2380                 /* Can not take IPs on node with NoIPTakeover set */
2381                 if (tval_noiptakeover[i] != 0) {
2382                         ipflags[i].noiptakeover = true;
2383                 }
2384
2385                 /* Can not host IPs on INACTIVE node */
2386                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2387                         ipflags[i].noiphost = true;
2388                 }
2389         }
2390
2391         if (all_nodes_are_disabled(nodemap)) {
2392                 /* If all nodes are disabled, can not host IPs on node
2393                  * with NoIPHostOnAllDisabled set
2394                  */
2395                 for (i=0;i<nodemap->num;i++) {
2396                         if (tval_noiphostonalldisabled[i] != 0) {
2397                                 ipflags[i].noiphost = true;
2398                         }
2399                 }
2400         } else {
2401                 /* If some nodes are not disabled, then can not host
2402                  * IPs on DISABLED node
2403                  */
2404                 for (i=0;i<nodemap->num;i++) {
2405                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2406                                 ipflags[i].noiphost = true;
2407                         }
2408                 }
2409         }
2410
2411         return ipflags;
2412 }
2413
2414 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2415                                         TALLOC_CTX *tmp_ctx,
2416                                         struct ctdb_node_map_old *nodemap)
2417 {
2418         uint32_t *tval_noiptakeover;
2419         uint32_t *tval_noiphostonalldisabled;
2420         struct ctdb_ipflags *ipflags;
2421
2422
2423         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2424                                                    "NoIPTakeover", 0);
2425         if (tval_noiptakeover == NULL) {
2426                 return NULL;
2427         }
2428
2429         tval_noiphostonalldisabled =
2430                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2431                                        "NoIPHostOnAllDisabled", 0);
2432         if (tval_noiphostonalldisabled == NULL) {
2433                 /* Caller frees tmp_ctx */
2434                 return NULL;
2435         }
2436
2437         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2438                                        tval_noiptakeover,
2439                                        tval_noiphostonalldisabled);
2440
2441         talloc_free(tval_noiptakeover);
2442         talloc_free(tval_noiphostonalldisabled);
2443
2444         return ipflags;
2445 }
2446
2447 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2448                                                  TALLOC_CTX *mem_ctx)
2449 {
2450         struct ipalloc_state *ipalloc_state =
2451                 talloc_zero(mem_ctx, struct ipalloc_state);
2452         if (ipalloc_state == NULL) {
2453                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2454                 return NULL;
2455         }
2456
2457         ipalloc_state->num = ctdb->num_nodes;
2458         ipalloc_state->known_public_ips =
2459                 talloc_zero_array(ipalloc_state,
2460                                   struct ctdb_public_ip_list_old *,
2461                                   ipalloc_state->num);
2462         if (ipalloc_state->known_public_ips == NULL) {
2463                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2464                 talloc_free(ipalloc_state);
2465                 return NULL;
2466         }
2467         ipalloc_state->available_public_ips =
2468                 talloc_zero_array(ipalloc_state,
2469                                   struct ctdb_public_ip_list_old *,
2470                                   ipalloc_state->num);
2471         if (ipalloc_state->available_public_ips == NULL) {
2472                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2473                 talloc_free(ipalloc_state);
2474                 return NULL;
2475         }
2476
2477         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2478                 ipalloc_state->algorithm = IPALLOC_LCP2;
2479         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2480                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2481         } else {
2482                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2483         }
2484
2485         return ipalloc_state;
2486 }
2487
2488 struct iprealloc_callback_data {
2489         bool *retry_nodes;
2490         int retry_count;
2491         client_async_callback fail_callback;
2492         void *fail_callback_data;
2493         struct ctdb_node_map_old *nodemap;
2494 };
2495
2496 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2497                                         int32_t res, TDB_DATA outdata,
2498                                         void *callback)
2499 {
2500         int numnodes;
2501         struct iprealloc_callback_data *cd =
2502                 (struct iprealloc_callback_data *)callback;
2503
2504         numnodes = talloc_array_length(cd->retry_nodes);
2505         if (pnn > numnodes) {
2506                 DEBUG(DEBUG_ERR,
2507                       ("ipreallocated failure from node %d, "
2508                        "but only %d nodes in nodemap\n",
2509                        pnn, numnodes));
2510                 return;
2511         }
2512
2513         /* Can't run the "ipreallocated" event on a INACTIVE node */
2514         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2515                 DEBUG(DEBUG_WARNING,
2516                       ("ipreallocated failed on inactive node %d, ignoring\n",
2517                        pnn));
2518                 return;
2519         }
2520
2521         switch (res) {
2522         case -ETIME:
2523                 /* If the control timed out then that's a real error,
2524                  * so call the real fail callback
2525                  */
2526                 if (cd->fail_callback) {
2527                         cd->fail_callback(ctdb, pnn, res, outdata,
2528                                           cd->fail_callback_data);
2529                 } else {
2530                         DEBUG(DEBUG_WARNING,
2531                               ("iprealloc timed out but no callback registered\n"));
2532                 }
2533                 break;
2534         default:
2535                 /* If not a timeout then either the ipreallocated
2536                  * eventscript (or some setup) failed.  This might
2537                  * have failed because the IPREALLOCATED control isn't
2538                  * implemented - right now there is no way of knowing
2539                  * because the error codes are all folded down to -1.
2540                  * Consider retrying using EVENTSCRIPT control...
2541                  */
2542                 DEBUG(DEBUG_WARNING,
2543                       ("ipreallocated failure from node %d, flagging retry\n",
2544                        pnn));
2545                 cd->retry_nodes[pnn] = true;
2546                 cd->retry_count++;
2547         }
2548 }
2549
2550 struct takeover_callback_data {
2551         bool *node_failed;
2552         client_async_callback fail_callback;
2553         void *fail_callback_data;
2554         struct ctdb_node_map_old *nodemap;
2555 };
2556
2557 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2558                                        uint32_t node_pnn, int32_t res,
2559                                        TDB_DATA outdata, void *callback_data)
2560 {
2561         struct takeover_callback_data *cd =
2562                 talloc_get_type_abort(callback_data,
2563                                       struct takeover_callback_data);
2564         int i;
2565
2566         for (i = 0; i < cd->nodemap->num; i++) {
2567                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2568                         break;
2569                 }
2570         }
2571
2572         if (i == cd->nodemap->num) {
2573                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2574                 return;
2575         }
2576
2577         if (!cd->node_failed[i]) {
2578                 cd->node_failed[i] = true;
2579                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2580                                   cd->fail_callback_data);
2581         }
2582 }
2583
2584 /*
2585   make any IP alias changes for public addresses that are necessary 
2586  */
2587 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2588                       uint32_t *force_rebalance_nodes,
2589                       client_async_callback fail_callback, void *callback_data)
2590 {
2591         int i, j, ret;
2592         struct ctdb_public_ip ip;
2593         uint32_t *nodes;
2594         struct public_ip_list *all_ips, *tmp_ip;
2595         TDB_DATA data;
2596         struct timeval timeout;
2597         struct client_async_data *async_data;
2598         struct ctdb_client_control_state *state;
2599         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2600         struct ctdb_ipflags *ipflags;
2601         struct ipalloc_state *ipalloc_state;
2602         struct takeover_callback_data *takeover_data;
2603         struct iprealloc_callback_data iprealloc_data;
2604         bool *retry_data;
2605         bool can_host_ips;
2606
2607         /*
2608          * ip failover is completely disabled, just send out the 
2609          * ipreallocated event.
2610          */
2611         if (ctdb->tunable.disable_ip_failover != 0) {
2612                 goto ipreallocated;
2613         }
2614
2615         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2616         if (ipalloc_state == NULL) {
2617                 talloc_free(tmp_ctx);
2618                 return -1;
2619         }
2620         ctdb->ipalloc_state = ipalloc_state;
2621
2622         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2623         if (ipflags == NULL) {
2624                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2625                 talloc_free(tmp_ctx);
2626                 return -1;
2627         }
2628
2629         /* Fetch known/available public IPs from each active node */
2630         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2631         if (ret != 0) {
2632                 talloc_free(tmp_ctx);
2633                 return -1;
2634         }
2635
2636         /* Short-circuit IP allocation if no node has available IPs */
2637         can_host_ips = false;
2638         for (i=0; i < ipalloc_state->num; i++) {
2639                 if (ipalloc_state->available_public_ips[i] != NULL) {
2640                         can_host_ips = true;
2641                 }
2642         }
2643         if (!can_host_ips) {
2644                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2645                 return 0;
2646         }
2647
2648         /* Do the IP reassignment calculations */
2649         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2650
2651         /* Now tell all nodes to release any public IPs should not
2652          * host.  This will be a NOOP on nodes that don't currently
2653          * hold the given IP.
2654          */
2655         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2656         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2657
2658         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2659                                                        bool, nodemap->num);
2660         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2661         takeover_data->fail_callback = fail_callback;
2662         takeover_data->fail_callback_data = callback_data;
2663         takeover_data->nodemap = nodemap;
2664
2665         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2666         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2667
2668         async_data->fail_callback = takeover_run_fail_callback;
2669         async_data->callback_data = takeover_data;
2670
2671         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2672
2673         /* Send a RELEASE_IP to all nodes that should not be hosting
2674          * each IP.  For each IP, all but one of these will be
2675          * redundant.  However, the redundant ones are used to tell
2676          * nodes which node should be hosting the IP so that commands
2677          * like "ctdb ip" can display a particular nodes idea of who
2678          * is hosting what. */
2679         for (i=0;i<nodemap->num;i++) {
2680                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2681                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2682                         continue;
2683                 }
2684
2685                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2686                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2687                                 /* This node should be serving this
2688                                    vnn so don't tell it to release the ip
2689                                 */
2690                                 continue;
2691                         }
2692                         ip.pnn  = tmp_ip->pnn;
2693                         ip.addr = tmp_ip->addr;
2694
2695                         timeout = TAKEOVER_TIMEOUT();
2696                         data.dsize = sizeof(ip);
2697                         data.dptr  = (uint8_t *)&ip;
2698                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2699                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2700                                                   data, async_data,
2701                                                   &timeout, NULL);
2702                         if (state == NULL) {
2703                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2704                                 talloc_free(tmp_ctx);
2705                                 return -1;
2706                         }
2707
2708                         ctdb_client_async_add(async_data, state);
2709                 }
2710         }
2711         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2712                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2713                 talloc_free(tmp_ctx);
2714                 return -1;
2715         }
2716         talloc_free(async_data);
2717
2718
2719         /* For each IP, send a TAKOVER_IP to the node that should be
2720          * hosting it.  Many of these will often be redundant (since
2721          * the allocation won't have changed) but they can be useful
2722          * to recover from inconsistencies. */
2723         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2724         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2725
2726         async_data->fail_callback = fail_callback;
2727         async_data->callback_data = callback_data;
2728
2729         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2730                 if (tmp_ip->pnn == -1) {
2731                         /* this IP won't be taken over */
2732                         continue;
2733                 }
2734
2735                 ip.pnn  = tmp_ip->pnn;
2736                 ip.addr = tmp_ip->addr;
2737
2738                 timeout = TAKEOVER_TIMEOUT();
2739                 data.dsize = sizeof(ip);
2740                 data.dptr  = (uint8_t *)&ip;
2741                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2742                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2743                                           data, async_data, &timeout, NULL);
2744                 if (state == NULL) {
2745                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2746                         talloc_free(tmp_ctx);
2747                         return -1;
2748                 }
2749
2750                 ctdb_client_async_add(async_data, state);
2751         }
2752         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2753                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2754                 talloc_free(tmp_ctx);
2755                 return -1;
2756         }
2757
2758 ipreallocated:
2759         /*
2760          * Tell all nodes to run eventscripts to process the
2761          * "ipreallocated" event.  This can do a lot of things,
2762          * including restarting services to reconfigure them if public
2763          * IPs have moved.  Once upon a time this event only used to
2764          * update natgw.
2765          */
2766         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2767         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2768         iprealloc_data.retry_nodes = retry_data;
2769         iprealloc_data.retry_count = 0;
2770         iprealloc_data.fail_callback = fail_callback;
2771         iprealloc_data.fail_callback_data = callback_data;
2772         iprealloc_data.nodemap = nodemap;
2773
2774         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2775         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2776                                         nodes, 0, TAKEOVER_TIMEOUT(),
2777                                         false, tdb_null,
2778                                         NULL, iprealloc_fail_callback,
2779                                         &iprealloc_data);
2780         if (ret != 0) {
2781                 /* If the control failed then we should retry to any
2782                  * nodes flagged by iprealloc_fail_callback using the
2783                  * EVENTSCRIPT control.  This is a best-effort at
2784                  * backward compatiblity when running a mixed cluster
2785                  * where some nodes have not yet been upgraded to
2786                  * support the IPREALLOCATED control.
2787                  */
2788                 DEBUG(DEBUG_WARNING,
2789                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2790
2791                 nodes = talloc_array(tmp_ctx, uint32_t,
2792                                      iprealloc_data.retry_count);
2793                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2794
2795                 j = 0;
2796                 for (i=0; i<nodemap->num; i++) {
2797                         if (iprealloc_data.retry_nodes[i]) {
2798                                 nodes[j] = i;
2799                                 j++;
2800                         }
2801                 }
2802
2803                 data.dptr  = discard_const("ipreallocated");
2804                 data.dsize = strlen((char *)data.dptr) + 1; 
2805                 ret = ctdb_client_async_control(ctdb,
2806                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2807                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2808                                                 false, data,
2809                                                 NULL, fail_callback,
2810                                                 callback_data);
2811                 if (ret != 0) {
2812                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2813                 }
2814         }
2815
2816         talloc_free(tmp_ctx);
2817         return ret;
2818 }
2819
2820
2821 /*
2822   destroy a ctdb_client_ip structure
2823  */
2824 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2825 {
2826         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2827                 ctdb_addr_to_str(&ip->addr),
2828                 ntohs(ip->addr.ip.sin_port),
2829                 ip->client_id));
2830
2831         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2832         return 0;
2833 }
2834
2835 /*
2836   called by a client to inform us of a TCP connection that it is managing
2837   that should tickled with an ACK when IP takeover is done
2838  */
2839 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2840                                 TDB_DATA indata)
2841 {
2842         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2843         struct ctdb_connection *tcp_sock = NULL;
2844         struct ctdb_tcp_list *tcp;
2845         struct ctdb_connection t;
2846         int ret;
2847         TDB_DATA data;
2848         struct ctdb_client_ip *ip;
2849         struct ctdb_vnn *vnn;
2850         ctdb_sock_addr addr;
2851
2852         /* If we don't have public IPs, tickles are useless */
2853         if (ctdb->vnn == NULL) {
2854                 return 0;
2855         }
2856
2857         tcp_sock = (struct ctdb_connection *)indata.dptr;
2858
2859         addr = tcp_sock->src;
2860         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2861         addr = tcp_sock->dst;
2862         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2863
2864         ZERO_STRUCT(addr);
2865         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2866         vnn = find_public_ip_vnn(ctdb, &addr);
2867         if (vnn == NULL) {
2868                 switch (addr.sa.sa_family) {
2869                 case AF_INET:
2870                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2871                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2872                                         ctdb_addr_to_str(&addr)));
2873                         }
2874                         break;
2875                 case AF_INET6:
2876                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2877                                 ctdb_addr_to_str(&addr)));
2878                         break;
2879                 default:
2880                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2881                 }
2882
2883                 return 0;
2884         }
2885
2886         if (vnn->pnn != ctdb->pnn) {
2887                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2888                         ctdb_addr_to_str(&addr),
2889                         client_id, client->pid));
2890                 /* failing this call will tell smbd to die */
2891                 return -1;
2892         }
2893
2894         ip = talloc(client, struct ctdb_client_ip);
2895         CTDB_NO_MEMORY(ctdb, ip);
2896
2897         ip->ctdb      = ctdb;
2898         ip->addr      = addr;
2899         ip->client_id = client_id;
2900         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2901         DLIST_ADD(ctdb->client_ip_list, ip);
2902
2903         tcp = talloc(client, struct ctdb_tcp_list);
2904         CTDB_NO_MEMORY(ctdb, tcp);
2905
2906         tcp->connection.src = tcp_sock->src;
2907         tcp->connection.dst = tcp_sock->dst;
2908
2909         DLIST_ADD(client->tcp_list, tcp);
2910
2911         t.src = tcp_sock->src;
2912         t.dst = tcp_sock->dst;
2913
2914         data.dptr = (uint8_t *)&t;
2915         data.dsize = sizeof(t);
2916
2917         switch (addr.sa.sa_family) {
2918         case AF_INET:
2919                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2920                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2921                         ctdb_addr_to_str(&tcp_sock->src),
2922                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2923                 break;
2924         case AF_INET6:
2925                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2926                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2927                         ctdb_addr_to_str(&tcp_sock->src),
2928                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2929                 break;
2930         default:
2931                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2932         }
2933
2934
2935         /* tell all nodes about this tcp connection */
2936         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2937                                        CTDB_CONTROL_TCP_ADD,
2938                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2939         if (ret != 0) {
2940                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2941                 return -1;
2942         }
2943
2944         return 0;
2945 }
2946
2947 /*
2948   find a tcp address on a list
2949  */
2950 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2951                                            struct ctdb_connection *tcp)
2952 {
2953         int i;
2954
2955         if (array == NULL) {
2956                 return NULL;
2957         }
2958
2959         for (i=0;i<array->num;i++) {
2960                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2961                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2962                         return &array->connections[i];
2963                 }
2964         }
2965         return NULL;
2966 }
2967
2968
2969
2970 /*
2971   called by a daemon to inform us of a TCP connection that one of its
2972   clients managing that should tickled with an ACK when IP takeover is
2973   done
2974  */
2975 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2976 {
2977         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2978         struct ctdb_tcp_array *tcparray;
2979         struct ctdb_connection tcp;
2980         struct ctdb_vnn *vnn;
2981
2982         /* If we don't have public IPs, tickles are useless */
2983         if (ctdb->vnn == NULL) {
2984                 return 0;
2985         }
2986
2987         vnn = find_public_ip_vnn(ctdb, &p->dst);
2988         if (vnn == NULL) {
2989                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2990                         ctdb_addr_to_str(&p->dst)));
2991
2992                 return -1;
2993         }
2994
2995
2996         tcparray = vnn->tcp_array;
2997
2998         /* If this is the first tickle */
2999         if (tcparray == NULL) {
3000                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3001                 CTDB_NO_MEMORY(ctdb, tcparray);
3002                 vnn->tcp_array = tcparray;
3003
3004                 tcparray->num = 0;
3005                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3006                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3007
3008                 tcparray->connections[tcparray->num].src = p->src;
3009                 tcparray->connections[tcparray->num].dst = p->dst;
3010                 tcparray->num++;
3011
3012                 if (tcp_update_needed) {
3013                         vnn->tcp_update_needed = true;
3014                 }
3015                 return 0;
3016         }
3017
3018
3019         /* Do we already have this tickle ?*/
3020         tcp.src = p->src;
3021         tcp.dst = p->dst;
3022         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3023                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3024                         ctdb_addr_to_str(&tcp.dst),
3025                         ntohs(tcp.dst.ip.sin_port),
3026                         vnn->pnn));
3027                 return 0;
3028         }
3029
3030         /* A new tickle, we must add it to the array */
3031         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3032                                         struct ctdb_connection,
3033                                         tcparray->num+1);
3034         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3035
3036         tcparray->connections[tcparray->num].src = p->src;
3037         tcparray->connections[tcparray->num].dst = p->dst;
3038         tcparray->num++;
3039
3040         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3041                 ctdb_addr_to_str(&tcp.dst),
3042                 ntohs(tcp.dst.ip.sin_port),
3043                 vnn->pnn));
3044
3045         if (tcp_update_needed) {
3046                 vnn->tcp_update_needed = true;
3047         }
3048
3049         return 0;
3050 }
3051
3052
3053 /*
3054   called by a daemon to inform us of a TCP connection that one of its
3055   clients managing that should tickled with an ACK when IP takeover is
3056   done
3057  */
3058 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3059 {
3060         struct ctdb_connection *tcpp;
3061         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3062
3063         if (vnn == NULL) {
3064                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3065                         ctdb_addr_to_str(&conn->dst)));
3066                 return;
3067         }
3068
3069         /* if the array is empty we cant remove it
3070            and we don't need to do anything
3071          */
3072         if (vnn->tcp_array == NULL) {
3073                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3074                         ctdb_addr_to_str(&conn->dst),
3075                         ntohs(conn->dst.ip.sin_port)));
3076                 return;
3077         }
3078
3079
3080         /* See if we know this connection
3081            if we don't know this connection  then we dont need to do anything
3082          */
3083         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3084         if (tcpp == NULL) {
3085                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3086                         ctdb_addr_to_str(&conn->dst),
3087                         ntohs(conn->dst.ip.sin_port)));
3088                 return;
3089         }
3090
3091
3092         /* We need to remove this entry from the array.
3093            Instead of allocating a new array and copying data to it
3094            we cheat and just copy the last entry in the existing array
3095            to the entry that is to be removed and just shring the 
3096            ->num field
3097          */
3098         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3099         vnn->tcp_array->num--;
3100
3101         /* If we deleted the last entry we also need to remove the entire array
3102          */
3103         if (vnn->tcp_array->num == 0) {
3104                 talloc_free(vnn->tcp_array);
3105                 vnn->tcp_array = NULL;
3106         }               
3107
3108         vnn->tcp_update_needed = true;
3109
3110         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3111                 ctdb_addr_to_str(&conn->src),
3112                 ntohs(conn->src.ip.sin_port)));
3113 }
3114
3115
3116 /*
3117   called by a daemon to inform us of a TCP connection that one of its
3118   clients used are no longer needed in the tickle database
3119  */
3120 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3121 {
3122         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3123
3124         /* If we don't have public IPs, tickles are useless */
3125         if (ctdb->vnn == NULL) {
3126                 return 0;
3127         }
3128
3129         ctdb_remove_connection(ctdb, conn);
3130
3131         return 0;
3132 }
3133
3134
3135 /*
3136   Called when another daemon starts - causes all tickles for all
3137   public addresses we are serving to be sent to the new node on the
3138   next check.  This actually causes the next scheduled call to
3139   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3140   doesn't require careful error handling.
3141  */
3142 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3143 {
3144         struct ctdb_vnn *vnn;
3145
3146         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3147                            (unsigned long) pnn));
3148
3149         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3150                 vnn->tcp_update_needed = true;
3151         }
3152
3153         return 0;
3154 }
3155
3156
3157 /*
3158   called when a client structure goes away - hook to remove
3159   elements from the tcp_list in all daemons
3160  */
3161 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3162 {
3163         while (client->tcp_list) {
3164                 struct ctdb_tcp_list *tcp = client->tcp_list;
3165                 DLIST_REMOVE(client->tcp_list, tcp);
3166                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3167         }
3168 }
3169
3170
3171 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3172 {
3173         struct ctdb_vnn *vnn;
3174         int count = 0;
3175
3176         if (ctdb->tunable.disable_ip_failover == 1) {
3177                 return;
3178         }
3179
3180         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3181                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3182                         ctdb_vnn_unassign_iface(ctdb, vnn);
3183                         continue;
3184                 }
3185                 if (!vnn->iface) {
3186                         continue;
3187                 }
3188
3189                 /* Don't allow multiple releases at once.  Some code,
3190                  * particularly ctdb_tickle_sentenced_connections() is
3191                  * not re-entrant */
3192                 if (vnn->update_in_flight) {
3193                         DEBUG(DEBUG_WARNING,
3194                               (__location__
3195                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3196                                     ctdb_addr_to_str(&vnn->public_address),
3197                                     vnn->public_netmask_bits,
3198                                     ctdb_vnn_iface_string(vnn)));
3199                         continue;
3200                 }
3201                 vnn->update_in_flight = true;
3202
3203                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3204                                     ctdb_addr_to_str(&vnn->public_address),
3205                                     vnn->public_netmask_bits,
3206                                     ctdb_vnn_iface_string(vnn)));
3207
3208                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3209                                   ctdb_vnn_iface_string(vnn),
3210                                   ctdb_addr_to_str(&vnn->public_address),
3211                                   vnn->public_netmask_bits);
3212                 release_kill_clients(ctdb, &vnn->public_address);
3213                 ctdb_vnn_unassign_iface(ctdb, vnn);
3214                 vnn->update_in_flight = false;
3215                 count++;
3216         }
3217
3218         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3219 }
3220
3221
3222 /*
3223   get list of public IPs
3224  */
3225 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3226                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3227 {
3228         int i, num, len;
3229         struct ctdb_public_ip_list_old *ips;
3230         struct ctdb_vnn *vnn;
3231         bool only_available = false;
3232
3233         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3234                 only_available = true;
3235         }
3236
3237         /* count how many public ip structures we have */
3238         num = 0;
3239         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3240                 num++;
3241         }
3242
3243         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3244                 num*sizeof(struct ctdb_public_ip);
3245         ips = talloc_zero_size(outdata, len);
3246         CTDB_NO_MEMORY(ctdb, ips);
3247
3248         i = 0;
3249         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3250                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3251                         continue;
3252                 }
3253                 ips->ips[i].pnn  = vnn->pnn;
3254                 ips->ips[i].addr = vnn->public_address;
3255                 i++;
3256         }
3257         ips->num = i;
3258         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3259                 i*sizeof(struct ctdb_public_ip);
3260
3261         outdata->dsize = len;
3262         outdata->dptr  = (uint8_t *)ips;
3263
3264         return 0;
3265 }
3266
3267
3268 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3269                                         struct ctdb_req_control_old *c,
3270                                         TDB_DATA indata,
3271                                         TDB_DATA *outdata)
3272 {
3273         int i, num, len;
3274         ctdb_sock_addr *addr;
3275         struct ctdb_public_ip_info_old *info;
3276         struct ctdb_vnn *vnn;
3277
3278         addr = (ctdb_sock_addr *)indata.dptr;
3279
3280         vnn = find_public_ip_vnn(ctdb, addr);
3281         if (vnn == NULL) {
3282                 /* if it is not a public ip   it could be our 'single ip' */
3283                 if (ctdb->single_ip_vnn) {
3284                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3285                                 vnn = ctdb->single_ip_vnn;
3286                         }
3287                 }
3288         }
3289         if (vnn == NULL) {
3290                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3291                                  "'%s'not a public address\n",
3292                                  ctdb_addr_to_str(addr)));
3293                 return -1;
3294         }
3295
3296         /* count how many public ip structures we have */
3297         num = 0;
3298         for (;vnn->ifaces[num];) {
3299                 num++;
3300         }
3301
3302         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3303                 num*sizeof(struct ctdb_iface);
3304         info = talloc_zero_size(outdata, len);
3305         CTDB_NO_MEMORY(ctdb, info);
3306
3307         info->ip.addr = vnn->public_address;
3308         info->ip.pnn = vnn->pnn;
3309         info->active_idx = 0xFFFFFFFF;
3310
3311         for (i=0; vnn->ifaces[i]; i++) {
3312                 struct ctdb_interface *cur;
3313
3314                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3315                 if (cur == NULL) {
3316                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3317                                            vnn->ifaces[i]));
3318                         return -1;
3319                 }
3320                 if (vnn->iface == cur) {
3321                         info->active_idx = i;
3322                 }
3323                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3324                 info->ifaces[i].link_state = cur->link_up;
3325                 info->ifaces[i].references = cur->references;
3326         }
3327         info->num = i;
3328         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3329                 i*sizeof(struct ctdb_iface);
3330
3331         outdata->dsize = len;
3332         outdata->dptr  = (uint8_t *)info;
3333
3334         return 0;
3335 }
3336
3337 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3338                                 struct ctdb_req_control_old *c,
3339                                 TDB_DATA *outdata)
3340 {
3341         int i, num, len;
3342         struct ctdb_iface_list_old *ifaces;
3343         struct ctdb_interface *cur;
3344
3345         /* count how many public ip structures we have */
3346         num = 0;
3347         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3348                 num++;
3349         }
3350
3351         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3352                 num*sizeof(struct ctdb_iface);
3353         ifaces = talloc_zero_size(outdata, len);
3354         CTDB_NO_MEMORY(ctdb, ifaces);
3355
3356         i = 0;
3357         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3358                 strcpy(ifaces->ifaces[i].name, cur->name);
3359                 ifaces->ifaces[i].link_state = cur->link_up;
3360                 ifaces->ifaces[i].references = cur->references;
3361                 i++;
3362         }
3363         ifaces->num = i;
3364         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3365                 i*sizeof(struct ctdb_iface);
3366
3367         outdata->dsize = len;
3368         outdata->dptr  = (uint8_t *)ifaces;
3369
3370         return 0;
3371 }
3372
3373 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3374                                     struct ctdb_req_control_old *c,
3375                                     TDB_DATA indata)
3376 {
3377         struct ctdb_iface *info;
3378         struct ctdb_interface *iface;
3379         bool link_up = false;
3380
3381         info = (struct ctdb_iface *)indata.dptr;
3382
3383         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3384                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3385                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3386                                   len, len, info->name));
3387                 return -1;
3388         }
3389
3390         switch (info->link_state) {
3391         case 0:
3392                 link_up = false;
3393                 break;
3394         case 1:
3395                 link_up = true;
3396                 break;
3397         default:
3398                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3399                                   (unsigned int)info->link_state));
3400                 return -1;
3401         }
3402
3403         if (info->references != 0) {
3404                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3405                                   (unsigned int)info->references));
3406                 return -1;
3407         }
3408
3409         iface = ctdb_find_iface(ctdb, info->name);
3410         if (iface == NULL) {
3411                 return -1;
3412         }
3413
3414         if (link_up == iface->link_up) {
3415                 return 0;
3416         }
3417
3418         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3419               ("iface[%s] has changed it's link status %s => %s\n",
3420                iface->name,
3421                iface->link_up?"up":"down",
3422                link_up?"up":"down"));
3423
3424         iface->link_up = link_up;
3425         return 0;
3426 }
3427
3428
/* 
   structure containing the capture socket and the list of tcp connections
   that the ctdb daemon is to kill.

   One instance is attached to a vnn (vnn->killtcp) by
   ctdb_killtcp_add_connection() and freed by
   ctdb_tickle_sentenced_connections() once no connections remain.
*/
struct ctdb_kill_tcp {
        /* vnn this state was created for */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* capture socket; -1 until ctdb_sys_open_capture_socket() succeeds */
        int capture_fd;
        /* read event on capture_fd, dispatched to capture_tcp_handler() */
        struct tevent_fd *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque pointer filled in by ctdb_sys_open_capture_socket() and
           passed back to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3441
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of tickles sent by tickle_connection_traverse();
           the connection is given up on once this reaches 5 */
        int count;
        /* the kill state this connection belongs to */
        struct ctdb_kill_tcp *killtcp;
};
3451
3452 /* this function is used to create a key to represent this socketpair
3453    in the killtcp tree.
3454    this key is used to insert and lookup matching socketpairs that are
3455    to be tickled and RST
3456 */
3457 #define KILLTCP_KEYLEN  10
3458 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3459 {
3460         static uint32_t key[KILLTCP_KEYLEN];
3461
3462         bzero(key, sizeof(key));
3463
3464         if (src->sa.sa_family != dst->sa.sa_family) {
3465                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3466                 return key;
3467         }
3468         
3469         switch (src->sa.sa_family) {
3470         case AF_INET:
3471                 key[0]  = dst->ip.sin_addr.s_addr;
3472                 key[1]  = src->ip.sin_addr.s_addr;
3473                 key[2]  = dst->ip.sin_port;
3474                 key[3]  = src->ip.sin_port;
3475                 break;
3476         case AF_INET6: {
3477                 uint32_t *dst6_addr32 =
3478                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3479                 uint32_t *src6_addr32 =
3480                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3481                 key[0]  = dst6_addr32[3];
3482                 key[1]  = src6_addr32[3];
3483                 key[2]  = dst6_addr32[2];
3484                 key[3]  = src6_addr32[2];
3485                 key[4]  = dst6_addr32[1];
3486                 key[5]  = src6_addr32[1];
3487                 key[6]  = dst6_addr32[0];
3488                 key[7]  = src6_addr32[0];
3489                 key[8]  = dst->ip6.sin6_port;
3490                 key[9]  = src->ip6.sin6_port;
3491                 break;
3492         }
3493         default:
3494                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3495                 return key;
3496         }
3497
3498         return key;
3499 }
3500
3501 /*
3502   called when we get a read event on the raw socket
3503  */
3504 static void capture_tcp_handler(struct tevent_context *ev,
3505                                 struct tevent_fd *fde,
3506                                 uint16_t flags, void *private_data)
3507 {
3508         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3509         struct ctdb_killtcp_con *con;
3510         ctdb_sock_addr src, dst;
3511         uint32_t ack_seq, seq;
3512
3513         if (!(flags & TEVENT_FD_READ)) {
3514                 return;
3515         }
3516
3517         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3518                                 killtcp->private_data,
3519                                 &src, &dst,
3520                                 &ack_seq, &seq) != 0) {
3521                 /* probably a non-tcp ACK packet */
3522                 return;
3523         }
3524
3525         /* check if we have this guy in our list of connections
3526            to kill
3527         */
3528         con = trbt_lookuparray32(killtcp->connections, 
3529                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3530         if (con == NULL) {
3531                 /* no this was some other packet we can just ignore */
3532                 return;
3533         }
3534
3535         /* This one has been tickled !
3536            now reset him and remove him from the list.
3537          */
3538         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3539                 ntohs(con->dst_addr.ip.sin_port),
3540                 ctdb_addr_to_str(&con->src_addr),
3541                 ntohs(con->src_addr.ip.sin_port)));
3542
3543         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3544         talloc_free(con);
3545 }
3546
3547
3548 /* when traversing the list of all tcp connections to send tickle acks to
3549    (so that we can capture the ack coming back and kill the connection
3550     by a RST)
3551    this callback is called for each connection we are currently trying to kill
3552 */
3553 static int tickle_connection_traverse(void *param, void *data)
3554 {
3555         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3556
3557         /* have tried too many times, just give up */
3558         if (con->count >= 5) {
3559                 /* can't delete in traverse: reparent to delete_cons */
3560                 talloc_steal(param, con);
3561                 return 0;
3562         }
3563
3564         /* othervise, try tickling it again */
3565         con->count++;
3566         ctdb_sys_send_tcp(
3567                 (ctdb_sock_addr *)&con->dst_addr,
3568                 (ctdb_sock_addr *)&con->src_addr,
3569                 0, 0, 0);
3570         return 0;
3571 }
3572
3573
3574 /* 
3575    called every second until all sentenced connections have been reset
3576  */
3577 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3578                                               struct tevent_timer *te,
3579                                               struct timeval t, void *private_data)
3580 {
3581         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3582         void *delete_cons = talloc_new(NULL);
3583
3584         /* loop over all connections sending tickle ACKs */
3585         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3586
3587         /* now we've finished traverse, it's safe to do deletion. */
3588         talloc_free(delete_cons);
3589
3590         /* If there are no more connections to kill we can remove the
3591            entire killtcp structure
3592          */
3593         if ( (killtcp->connections == NULL) || 
3594              (killtcp->connections->root == NULL) ) {
3595                 talloc_free(killtcp);
3596                 return;
3597         }
3598
3599         /* try tickling them again in a seconds time
3600          */
3601         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3602                          timeval_current_ofs(1, 0),
3603                          ctdb_tickle_sentenced_connections, killtcp);
3604 }
3605
3606 /*
3607   destroy the killtcp structure
3608  */
3609 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3610 {
3611         struct ctdb_vnn *tmpvnn;
3612
3613         /* verify that this vnn is still active */
3614         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3615                 if (tmpvnn == killtcp->vnn) {
3616                         break;
3617                 }
3618         }
3619
3620         if (tmpvnn == NULL) {
3621                 return 0;
3622         }
3623
3624         if (killtcp->vnn->killtcp != killtcp) {
3625                 return 0;
3626         }
3627
3628         killtcp->vnn->killtcp = NULL;
3629
3630         return 0;
3631 }
3632
3633
/* Insert callback passed to trbt_insertarray32_callback(): whatever
   is already stored (data) is unconditionally replaced by the new
   connection structure (parm).

   The old structure is deliberately not freed here; per the original
   author's note it is talloc_stolen by the same tree node and goes
   away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;     /* existing entry is intentionally ignored */

        return parm;
}
3645
3646 /*
3647   add a tcp socket to the list of connections we want to RST
3648  */
3649 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3650                                        ctdb_sock_addr *s,
3651                                        ctdb_sock_addr *d)
3652 {
3653         ctdb_sock_addr src, dst;
3654         struct ctdb_kill_tcp *killtcp;
3655         struct ctdb_killtcp_con *con;
3656         struct ctdb_vnn *vnn;
3657
3658         ctdb_canonicalize_ip(s, &src);
3659         ctdb_canonicalize_ip(d, &dst);
3660
3661         vnn = find_public_ip_vnn(ctdb, &dst);
3662         if (vnn == NULL) {
3663                 vnn = find_public_ip_vnn(ctdb, &src);
3664         }
3665         if (vnn == NULL) {
3666                 /* if it is not a public ip   it could be our 'single ip' */
3667                 if (ctdb->single_ip_vnn) {
3668                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3669                                 vnn = ctdb->single_ip_vnn;
3670                         }
3671                 }
3672         }
3673         if (vnn == NULL) {
3674                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3675                 return -1;
3676         }
3677
3678         killtcp = vnn->killtcp;
3679         
3680         /* If this is the first connection to kill we must allocate
3681            a new structure
3682          */
3683         if (killtcp == NULL) {
3684                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3685                 CTDB_NO_MEMORY(ctdb, killtcp);
3686
3687                 killtcp->vnn         = vnn;
3688                 killtcp->ctdb        = ctdb;
3689                 killtcp->capture_fd  = -1;
3690                 killtcp->connections = trbt_create(killtcp, 0);
3691
3692                 vnn->killtcp         = killtcp;
3693                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3694         }
3695
3696
3697
3698         /* create a structure that describes this connection we want to
3699            RST and store it in killtcp->connections
3700         */
3701         con = talloc(killtcp, struct ctdb_killtcp_con);
3702         CTDB_NO_MEMORY(ctdb, con);
3703         con->src_addr = src;
3704         con->dst_addr = dst;
3705         con->count    = 0;
3706         con->killtcp  = killtcp;
3707
3708
3709         trbt_insertarray32_callback(killtcp->connections,
3710                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3711                         add_killtcp_callback, con);
3712
3713         /* 
3714            If we don't have a socket to listen on yet we must create it
3715          */
3716         if (killtcp->capture_fd == -1) {
3717                 const char *iface = ctdb_vnn_iface_string(vnn);
3718                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3719                 if (killtcp->capture_fd == -1) {
3720                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3721                                           "socket on iface '%s' for killtcp (%s)\n",
3722                                           iface, strerror(errno)));
3723                         goto failed;
3724                 }
3725         }
3726
3727
3728         if (killtcp->fde == NULL) {
3729                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3730                                              killtcp->capture_fd,
3731                                              TEVENT_FD_READ,
3732                                              capture_tcp_handler, killtcp);
3733                 tevent_fd_set_auto_close(killtcp->fde);
3734
3735                 /* We also need to set up some events to tickle all these connections
3736                    until they are all reset
3737                 */
3738                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3739                                  ctdb_tickle_sentenced_connections, killtcp);
3740         }
3741
3742         /* tickle him once now */
3743         ctdb_sys_send_tcp(
3744                 &con->dst_addr,
3745                 &con->src_addr,
3746                 0, 0, 0);
3747
3748         return 0;
3749
3750 failed:
3751         talloc_free(vnn->killtcp);
3752         vnn->killtcp = NULL;
3753         return -1;
3754 }
3755
3756 /*
3757   kill a TCP connection.
3758  */
3759 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3760 {
3761         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3762
3763         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3764 }
3765
3766 /*
3767   called by a daemon to inform us of the entire list of TCP tickles for
3768   a particular public address.
3769   this control should only be sent by the node that is currently serving
3770   that public address.
3771  */
3772 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3773 {
3774         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3775         struct ctdb_tcp_array *tcparray;
3776         struct ctdb_vnn *vnn;
3777
3778         /* We must at least have tickles.num or else we cant verify the size
3779            of the received data blob
3780          */
3781         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3782                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3783                 return -1;
3784         }
3785
3786         /* verify that the size of data matches what we expect */
3787         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3788                          + sizeof(struct ctdb_connection) * list->num) {
3789                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3790                 return -1;
3791         }
3792
3793         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3794                            ctdb_addr_to_str(&list->addr)));
3795
3796         vnn = find_public_ip_vnn(ctdb, &list->addr);
3797         if (vnn == NULL) {
3798                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3799                         ctdb_addr_to_str(&list->addr)));
3800
3801                 return 1;
3802         }
3803
3804         /* remove any old ticklelist we might have */
3805         talloc_free(vnn->tcp_array);
3806         vnn->tcp_array = NULL;
3807
3808         tcparray = talloc(vnn, struct ctdb_tcp_array);
3809         CTDB_NO_MEMORY(ctdb, tcparray);
3810
3811         tcparray->num = list->num;
3812
3813         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3814         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3815
3816         memcpy(tcparray->connections, &list->connections[0],
3817                sizeof(struct ctdb_connection)*tcparray->num);
3818
3819         /* We now have a new fresh tickle list array for this vnn */
3820         vnn->tcp_array = tcparray;
3821
3822         return 0;
3823 }
3824
3825 /*
3826   called to return the full list of tickles for the puclic address associated 
3827   with the provided vnn
3828  */
3829 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3830 {
3831         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3832         struct ctdb_tickle_list_old *list;
3833         struct ctdb_tcp_array *tcparray;
3834         int num;
3835         struct ctdb_vnn *vnn;
3836
3837         vnn = find_public_ip_vnn(ctdb, addr);
3838         if (vnn == NULL) {
3839                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3840                         ctdb_addr_to_str(addr)));
3841
3842                 return 1;
3843         }
3844
3845         tcparray = vnn->tcp_array;
3846         if (tcparray) {
3847                 num = tcparray->num;
3848         } else {
3849                 num = 0;
3850         }
3851
3852         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3853                         + sizeof(struct ctdb_connection) * num;
3854
3855         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3856         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3857         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3858
3859         list->addr = *addr;
3860         list->num = num;
3861         if (num) {
3862                 memcpy(&list->connections[0], tcparray->connections,
3863                         sizeof(struct ctdb_connection) * num);
3864         }
3865
3866         return 0;
3867 }
3868
3869
3870 /*
3871   set the list of all tcp tickles for a public address
3872  */
3873 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3874                                             ctdb_sock_addr *addr,
3875                                             struct ctdb_tcp_array *tcparray)
3876 {
3877         int ret, num;
3878         TDB_DATA data;
3879         struct ctdb_tickle_list_old *list;
3880
3881         if (tcparray) {
3882                 num = tcparray->num;
3883         } else {
3884                 num = 0;
3885         }
3886
3887         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3888                         sizeof(struct ctdb_connection) * num;
3889         data.dptr = talloc_size(ctdb, data.dsize);
3890         CTDB_NO_MEMORY(ctdb, data.dptr);
3891
3892         list = (struct ctdb_tickle_list_old *)data.dptr;
3893         list->addr = *addr;
3894         list->num = num;
3895         if (tcparray) {
3896                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3897         }
3898
3899         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3900                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3901                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3902         if (ret != 0) {
3903                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3904                 return -1;
3905         }
3906
3907         talloc_free(data.dptr);
3908
3909         return ret;
3910 }
3911
3912
3913 /*
3914   perform tickle updates if required
3915  */
3916 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3917                                     struct tevent_timer *te,
3918                                     struct timeval t, void *private_data)
3919 {
3920         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3921         int ret;
3922         struct ctdb_vnn *vnn;
3923
3924         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3925                 /* we only send out updates for public addresses that 
3926                    we have taken over
3927                  */
3928                 if (ctdb->pnn != vnn->pnn) {
3929                         continue;
3930                 }
3931                 /* We only send out the updates if we need to */
3932                 if (!vnn->tcp_update_needed) {
3933                         continue;
3934                 }
3935                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3936                                                        &vnn->public_address,
3937                                                        vnn->tcp_array);
3938                 if (ret != 0) {
3939                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3940                                 ctdb_addr_to_str(&vnn->public_address)));
3941                 } else {
3942                         DEBUG(DEBUG_INFO,
3943                               ("Sent tickle update for public address %s\n",
3944                                ctdb_addr_to_str(&vnn->public_address)));
3945                         vnn->tcp_update_needed = false;
3946                 }
3947         }
3948
3949         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3950                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3951                          ctdb_update_tcp_tickles, ctdb);
3952 }
3953
3954 /*
3955   start periodic update of tcp tickles
3956  */
3957 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3958 {
3959         ctdb->tickle_update_context = talloc_new(ctdb);
3960
3961         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3962                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3963                          ctdb_update_tcp_tickles, ctdb);
3964 }
3965
3966
3967
3968
3969 struct control_gratious_arp {
3970         struct ctdb_context *ctdb;
3971         ctdb_sock_addr addr;
3972         const char *iface;
3973         int count;
3974 };
3975
3976 /*
3977   send a control_gratuitous arp
3978  */
3979 static void send_gratious_arp(struct tevent_context *ev,
3980                               struct tevent_timer *te,
3981                               struct timeval t, void *private_data)
3982 {
3983         int ret;
3984         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3985                                                         struct control_gratious_arp);
3986
3987         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3988         if (ret != 0) {
3989                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3990                                  arp->iface, strerror(errno)));
3991         }
3992
3993
3994         arp->count++;
3995         if (arp->count == CTDB_ARP_REPEAT) {
3996                 talloc_free(arp);
3997                 return;
3998         }
3999
4000         tevent_add_timer(arp->ctdb->ev, arp,
4001                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4002                          send_gratious_arp, arp);
4003 }
4004
4005
4006 /*
4007   send a gratious arp 
4008  */
4009 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4010 {
4011         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4012         struct control_gratious_arp *arp;
4013
4014         /* verify the size of indata */
4015         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4016                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4017                                  (unsigned)indata.dsize, 
4018                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4019                 return -1;
4020         }
4021         if (indata.dsize != 
4022                 ( offsetof(struct ctdb_addr_info_old, iface)
4023                 + gratious_arp->len ) ){
4024
4025                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4026                         "but should be %u bytes\n", 
4027                          (unsigned)indata.dsize, 
4028                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4029                 return -1;
4030         }
4031
4032
4033         arp = talloc(ctdb, struct control_gratious_arp);
4034         CTDB_NO_MEMORY(ctdb, arp);
4035
4036         arp->ctdb  = ctdb;
4037         arp->addr   = gratious_arp->addr;
4038         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4039         CTDB_NO_MEMORY(ctdb, arp->iface);
4040         arp->count = 0;
4041
4042         tevent_add_timer(arp->ctdb->ev, arp,
4043                          timeval_zero(), send_gratious_arp, arp);
4044
4045         return 0;
4046 }
4047
4048 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4049 {
4050         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4051         int ret;
4052
4053         /* verify the size of indata */
4054         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4055                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4056                 return -1;
4057         }
4058         if (indata.dsize != 
4059                 ( offsetof(struct ctdb_addr_info_old, iface)
4060                 + pub->len ) ){
4061
4062                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4063                         "but should be %u bytes\n", 
4064                          (unsigned)indata.dsize, 
4065                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4066                 return -1;
4067         }
4068
4069         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4070
4071         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4072
4073         if (ret != 0) {
4074                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4075                 return -1;
4076         }
4077
4078         return 0;
4079 }
4080
/* Private data handed to delete_ip_callback() */
struct delete_ip_callback_state {
	/* control request that delete_ip_callback() replies to */
	struct ctdb_req_control_old *c;
};
4084
4085 /*
4086   called when releaseip event finishes for del_public_address
4087  */
4088 static void delete_ip_callback(struct ctdb_context *ctdb,
4089                                int32_t status, TDB_DATA data,
4090                                const char *errormsg,
4091                                void *private_data)
4092 {
4093         struct delete_ip_callback_state *state =
4094                 talloc_get_type(private_data, struct delete_ip_callback_state);
4095
4096         /* If release failed then fail. */
4097         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4098         talloc_free(private_data);
4099 }
4100
4101 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4102                                         struct ctdb_req_control_old *c,
4103                                         TDB_DATA indata, bool *async_reply)
4104 {
4105         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4106         struct ctdb_vnn *vnn;
4107
4108         /* verify the size of indata */
4109         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4110                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4111                 return -1;
4112         }
4113         if (indata.dsize != 
4114                 ( offsetof(struct ctdb_addr_info_old, iface)
4115                 + pub->len ) ){
4116
4117                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4118                         "but should be %u bytes\n", 
4119                          (unsigned)indata.dsize, 
4120                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4121                 return -1;
4122         }
4123
4124         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4125
4126         /* walk over all public addresses until we find a match */
4127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4128                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4129                         if (vnn->pnn == ctdb->pnn) {
4130                                 struct delete_ip_callback_state *state;
4131                                 struct ctdb_public_ip *ip;
4132                                 TDB_DATA data;
4133                                 int ret;
4134
4135                                 vnn->delete_pending = true;
4136
4137                                 state = talloc(ctdb,
4138                                                struct delete_ip_callback_state);
4139                                 CTDB_NO_MEMORY(ctdb, state);
4140                                 state->c = c;
4141
4142                                 ip = talloc(state, struct ctdb_public_ip);
4143                                 if (ip == NULL) {
4144                                         DEBUG(DEBUG_ERR,
4145                                               (__location__ " Out of memory\n"));
4146                                         talloc_free(state);
4147                                         return -1;
4148                                 }
4149                                 ip->pnn = -1;
4150                                 ip->addr = pub->addr;
4151
4152                                 data.dsize = sizeof(struct ctdb_public_ip);
4153                                 data.dptr = (unsigned char *)ip;
4154
4155                                 ret = ctdb_daemon_send_control(ctdb,
4156                                                                ctdb_get_pnn(ctdb),
4157                                                                0,
4158                                                                CTDB_CONTROL_RELEASE_IP,
4159                                                                0, 0,
4160                                                                data,
4161                                                                delete_ip_callback,
4162                                                                state);
4163                                 if (ret == -1) {
4164                                         DEBUG(DEBUG_ERR,
4165                                               (__location__ "Unable to send "
4166                                                "CTDB_CONTROL_RELEASE_IP\n"));
4167                                         talloc_free(state);
4168                                         return -1;
4169                                 }
4170
4171                                 state->c = talloc_steal(state, c);
4172                                 *async_reply = true;
4173                         } else {
4174                                 /* This IP is not hosted on the
4175                                  * current node so just delete it
4176                                  * now. */
4177                                 do_delete_ip(ctdb, vnn);
4178                         }
4179
4180                         return 0;
4181                 }
4182         }
4183
4184         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4185                          ctdb_addr_to_str(&pub->addr)));
4186         return -1;
4187 }
4188
4189
/* Private data handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* control request that ctdb_ipreallocated_callback() replies to */
	struct ctdb_req_control_old *c;
};
4193
4194 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4195                                         int status, void *p)
4196 {
4197         struct ipreallocated_callback_state *state =
4198                 talloc_get_type(p, struct ipreallocated_callback_state);
4199
4200         if (status != 0) {
4201                 DEBUG(DEBUG_ERR,
4202                       (" \"ipreallocated\" event script failed (status %d)\n",
4203                        status));
4204                 if (status == -ETIME) {
4205                         ctdb_ban_self(ctdb);
4206                 }
4207         }
4208
4209         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4210         talloc_free(state);
4211 }
4212
4213 /* A control to run the ipreallocated event */
4214 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4215                                    struct ctdb_req_control_old *c,
4216                                    bool *async_reply)
4217 {
4218         int ret;
4219         struct ipreallocated_callback_state *state;
4220
4221         state = talloc(ctdb, struct ipreallocated_callback_state);
4222         CTDB_NO_MEMORY(ctdb, state);
4223
4224         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4225
4226         ret = ctdb_event_script_callback(ctdb, state,
4227                                          ctdb_ipreallocated_callback, state,
4228                                          CTDB_EVENT_IPREALLOCATED,
4229                                          "%s", "");
4230
4231         if (ret != 0) {
4232                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4233                 talloc_free(state);
4234                 return -1;
4235         }
4236
4237         /* tell the control that we will be reply asynchronously */
4238         state->c    = talloc_steal(state, c);
4239         *async_reply = true;
4240
4241         return 0;
4242 }
4243
4244
4245 /* This function is called from the recovery daemon to verify that a remote
4246    node has the expected ip allocation.
4247    This is verified against ctdb->ip_tree
4248 */
4249 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4250                                        struct ctdb_public_ip_list_old *ips,
4251                                        uint32_t pnn)
4252 {
4253         struct public_ip_list *tmp_ip;
4254         int i;
4255
4256         if (ctdb->ip_tree == NULL) {
4257                 /* don't know the expected allocation yet, assume remote node
4258                    is correct. */
4259                 return 0;
4260         }
4261
4262         if (ips == NULL) {
4263                 return 0;
4264         }
4265
4266         for (i=0; i<ips->num; i++) {
4267                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4268                 if (tmp_ip == NULL) {
4269                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4270                         return -1;
4271                 }
4272
4273                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4274                         continue;
4275                 }
4276
4277                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4278                         DEBUG(DEBUG_ERR,
4279                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4280                                pnn,
4281                                ctdb_addr_to_str(&ips->ips[i].addr),
4282                                ips->ips[i].pnn, tmp_ip->pnn));
4283                         return -1;
4284                 }
4285         }
4286
4287         return 0;
4288 }
4289
4290 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4291 {
4292         struct public_ip_list *tmp_ip;
4293
4294         /* IP tree is never built if DisableIPFailover is set */
4295         if (ctdb->tunable.disable_ip_failover != 0) {
4296                 return 0;
4297         }
4298
4299         if (ctdb->ip_tree == NULL) {
4300                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4301                 return -1;
4302         }
4303
4304         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4305         if (tmp_ip == NULL) {
4306                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4307                 return -1;
4308         }
4309
4310         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4311         tmp_ip->pnn = ip->pnn;
4312
4313         return 0;
4314 }
4315
4316 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4317 {
4318         TALLOC_FREE(ctdb->ip_tree);
4319 }
4320
/* State of one in-flight "reload public ips" request */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;
	struct ctdb_req_control_old *c;	/* control to reply to, NULL once replied */
	int status;			/* status sent in the reply */
	int fd[2];			/* pipe: child writes its result, parent reads fd[0] */
	pid_t child;			/* pid of the reloadips child process */
	struct tevent_fd *fde;		/* fd event watching fd[0] */
};
4329
4330 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4331 {
4332         if (h == h->ctdb->reload_ips) {
4333                 h->ctdb->reload_ips = NULL;
4334         }
4335         if (h->c != NULL) {
4336                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4337                 h->c = NULL;
4338         }
4339         ctdb_kill(h->ctdb, h->child, SIGKILL);
4340         return 0;
4341 }
4342
4343 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4344                                          struct tevent_timer *te,
4345                                          struct timeval t, void *private_data)
4346 {
4347         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4348
4349         talloc_free(h);
4350 }
4351
4352 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4353                                          struct tevent_fd *fde,
4354                                          uint16_t flags, void *private_data)
4355 {
4356         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4357
4358         char res;
4359         int ret;
4360
4361         ret = sys_read(h->fd[0], &res, 1);
4362         if (ret < 1 || res != 0) {
4363                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4364                 res = 1;
4365         }
4366         h->status = res;
4367
4368         talloc_free(h);
4369 }
4370
4371 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4372 {
4373         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4374         struct ctdb_public_ip_list_old *ips;
4375         struct ctdb_vnn *vnn;
4376         struct client_async_data *async_data;
4377         struct timeval timeout;
4378         TDB_DATA data;
4379         struct ctdb_client_control_state *state;
4380         bool first_add;
4381         int i, ret;
4382
4383         CTDB_NO_MEMORY(ctdb, mem_ctx);
4384
4385         /* Read IPs from local node */
4386         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4387                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4388         if (ret != 0) {
4389                 DEBUG(DEBUG_ERR,
4390                       ("Unable to fetch public IPs from local node\n"));
4391                 talloc_free(mem_ctx);
4392                 return -1;
4393         }
4394
4395         /* Read IPs file - this is safe since this is a child process */
4396         ctdb->vnn = NULL;
4397         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4398                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4399                 talloc_free(mem_ctx);
4400                 return -1;
4401         }
4402
4403         async_data = talloc_zero(mem_ctx, struct client_async_data);
4404         CTDB_NO_MEMORY(ctdb, async_data);
4405
4406         /* Compare IPs between node and file for IPs to be deleted */
4407         for (i = 0; i < ips->num; i++) {
4408                 /* */
4409                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4410                         if (ctdb_same_ip(&vnn->public_address,
4411                                          &ips->ips[i].addr)) {
4412                                 /* IP is still in file */
4413                                 break;
4414                         }
4415                 }
4416
4417                 if (vnn == NULL) {
4418                         /* Delete IP ips->ips[i] */
4419                         struct ctdb_addr_info_old *pub;
4420
4421                         DEBUG(DEBUG_NOTICE,
4422                               ("IP %s no longer configured, deleting it\n",
4423                                ctdb_addr_to_str(&ips->ips[i].addr)));
4424
4425                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4426                         CTDB_NO_MEMORY(ctdb, pub);
4427
4428                         pub->addr  = ips->ips[i].addr;
4429                         pub->mask  = 0;
4430                         pub->len   = 0;
4431
4432                         timeout = TAKEOVER_TIMEOUT();
4433
4434                         data.dsize = offsetof(struct ctdb_addr_info_old,
4435                                               iface) + pub->len;
4436                         data.dptr = (uint8_t *)pub;
4437
4438                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4439                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4440                                                   0, data, async_data,
4441                                                   &timeout, NULL);
4442                         if (state == NULL) {
4443                                 DEBUG(DEBUG_ERR,
4444                                       (__location__
4445                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4446                                 goto failed;
4447                         }
4448
4449                         ctdb_client_async_add(async_data, state);
4450                 }
4451         }
4452
4453         /* Compare IPs between node and file for IPs to be added */
4454         first_add = true;
4455         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4456                 for (i = 0; i < ips->num; i++) {
4457                         if (ctdb_same_ip(&vnn->public_address,
4458                                          &ips->ips[i].addr)) {
4459                                 /* IP already on node */
4460                                 break;
4461                         }
4462                 }
4463                 if (i == ips->num) {
4464                         /* Add IP ips->ips[i] */
4465                         struct ctdb_addr_info_old *pub;
4466                         const char *ifaces = NULL;
4467                         uint32_t len;
4468                         int iface = 0;
4469
4470                         DEBUG(DEBUG_NOTICE,
4471                               ("New IP %s configured, adding it\n",
4472                                ctdb_addr_to_str(&vnn->public_address)));
4473                         if (first_add) {
4474                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4475
4476                                 data.dsize = sizeof(pnn);
4477                                 data.dptr  = (uint8_t *)&pnn;
4478
4479                                 ret = ctdb_client_send_message(
4480                                         ctdb,
4481                                         CTDB_BROADCAST_CONNECTED,
4482                                         CTDB_SRVID_REBALANCE_NODE,
4483                                         data);
4484                                 if (ret != 0) {
4485                                         DEBUG(DEBUG_WARNING,
4486                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4487                                 }
4488
4489                                 first_add = false;
4490                         }
4491
4492                         ifaces = vnn->ifaces[0];
4493                         iface = 1;
4494                         while (vnn->ifaces[iface] != NULL) {
4495                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4496                                                          vnn->ifaces[iface]);
4497                                 iface++;
4498                         }
4499
4500                         len   = strlen(ifaces) + 1;
4501                         pub = talloc_zero_size(mem_ctx,
4502                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4503                         CTDB_NO_MEMORY(ctdb, pub);
4504
4505                         pub->addr  = vnn->public_address;
4506                         pub->mask  = vnn->public_netmask_bits;
4507                         pub->len   = len;
4508                         memcpy(&pub->iface[0], ifaces, pub->len);
4509
4510                         timeout = TAKEOVER_TIMEOUT();
4511
4512                         data.dsize = offsetof(struct ctdb_addr_info_old,
4513                                               iface) + pub->len;
4514                         data.dptr = (uint8_t *)pub;
4515
4516                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4517                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4518                                                   0, data, async_data,
4519                                                   &timeout, NULL);
4520                         if (state == NULL) {
4521                                 DEBUG(DEBUG_ERR,
4522                                       (__location__
4523                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4524                                 goto failed;
4525                         }
4526
4527                         ctdb_client_async_add(async_data, state);
4528                 }
4529         }
4530
4531         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4532                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4533                 goto failed;
4534         }
4535
4536         talloc_free(mem_ctx);
4537         return 0;
4538
4539 failed:
4540         talloc_free(mem_ctx);
4541         return -1;
4542 }
4543
4544 /* This control is sent to force the node to re-read the public addresses file
4545    and drop any addresses we should nnot longer host, and add new addresses
4546    that we are now able to host
4547 */
4548 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4549 {
4550         struct ctdb_reloadips_handle *h;
4551         pid_t parent = getpid();
4552
4553         if (ctdb->reload_ips != NULL) {
4554                 talloc_free(ctdb->reload_ips);
4555                 ctdb->reload_ips = NULL;
4556         }
4557
4558         h = talloc(ctdb, struct ctdb_reloadips_handle);
4559         CTDB_NO_MEMORY(ctdb, h);
4560         h->ctdb     = ctdb;
4561         h->c        = NULL;
4562         h->status   = -1;
4563         
4564         if (pipe(h->fd) == -1) {
4565                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4566                 talloc_free(h);
4567                 return -1;
4568         }
4569
4570         h->child = ctdb_fork(ctdb);
4571         if (h->child == (pid_t)-1) {
4572                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4573                 close(h->fd[0]);
4574                 close(h->fd[1]);
4575                 talloc_free(h);
4576                 return -1;
4577         }
4578
4579         /* child process */
4580         if (h->child == 0) {
4581                 signed char res = 0;
4582
4583                 close(h->fd[0]);
4584                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4585
4586                 prctl_set_comment("ctdb_reloadips");
4587                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4588                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4589                         res = -1;
4590                 } else {
4591                         res = ctdb_reloadips_child(ctdb);
4592                         if (res != 0) {
4593                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4594                         }
4595                 }
4596
4597                 sys_write(h->fd[1], &res, 1);
4598                 /* make sure we die when our parent dies */
4599                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4600                         sleep(5);
4601                 }
4602                 _exit(0);
4603         }
4604
4605         h->c             = talloc_steal(h, c);
4606
4607         close(h->fd[1]);
4608         set_close_on_exec(h->fd[0]);
4609
4610         talloc_set_destructor(h, ctdb_reloadips_destructor);
4611
4612
4613         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4614                                ctdb_reloadips_child_handler, (void *)h);
4615         tevent_fd_set_auto_close(h->fde);
4616
4617         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4618                          ctdb_reloadips_timeout_event, h);
4619
4620         /* we reply later */
4621         *async_reply = true;
4622         return 0;
4623 }