ctdb-ipalloc: Add error handling to IP allocation
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/*
 * Timeout used for takeover operations, built with
 * timeval_current_ofs() from the takeover_timeout tunable.
 * The expansion reads a variable named "ctdb" from the scope in
 * which the macro is used.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* First argument of the timer offset used when re-arming ARP sending */
#define CTDB_ARP_INTERVAL 1
/* Number of send rounds after which the ARP sender frees itself */
#define CTDB_ARP_REPEAT   3
49
/* Flags used in IP allocation algorithms. */
struct ctdb_ipflags {
        /* NOTE(review): presumably "must not take over addresses" - confirm */
        bool noiptakeover;
        /* NOTE(review): presumably "must not host any address" - confirm */
        bool noiphost;
};
55
/* Identifies the IP allocation algorithm stored in struct ipalloc_state. */
enum ipalloc_algorithm {
        IPALLOC_DETERMINISTIC,
        IPALLOC_NONDETERMINISTIC,
        IPALLOC_LCP2,
};
61
62 struct ipalloc_state {
63         uint32_t num;
64
65         /* Arrays with data for each node */
66         struct ctdb_public_ip_list_old **known_public_ips;
67         struct ctdb_public_ip_list_old **available_public_ips;
68
69         enum ipalloc_algorithm algorithm;
70         uint32_t no_ip_failback;
71 };
72
/* A local network interface tracked in the ctdb->ifaces list. */
struct ctdb_interface {
        /* Linkage for the doubly-linked ctdb->ifaces list */
        struct ctdb_interface *prev;
        struct ctdb_interface *next;
        /* Interface name, allocated as a talloc child of this structure */
        const char *name;
        /* True while the link is considered usable for public addresses */
        bool link_up;
        /* Number of public addresses currently assigned to this interface */
        uint32_t references;
};
79
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
81 {
82         if (vnn->iface) {
83                 return vnn->iface->name;
84         }
85
86         return "__none__";
87 }
88
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
90 {
91         struct ctdb_interface *i;
92
93         /* Verify that we don't have an entry for this ip yet */
94         for (i=ctdb->ifaces;i;i=i->next) {
95                 if (strcmp(i->name, iface) == 0) {
96                         return 0;
97                 }
98         }
99
100         /* create a new structure for this interface */
101         i = talloc_zero(ctdb, struct ctdb_interface);
102         CTDB_NO_MEMORY_FATAL(ctdb, i);
103         i->name = talloc_strdup(i, iface);
104         CTDB_NO_MEMORY(ctdb, i->name);
105
106         i->link_up = true;
107
108         DLIST_ADD(ctdb->ifaces, i);
109
110         return 0;
111 }
112
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
114                                         const char *name)
115 {
116         int n;
117
118         for (n = 0; vnn->ifaces[n] != NULL; n++) {
119                 if (strcmp(name, vnn->ifaces[n]) == 0) {
120                         return true;
121                 }
122         }
123
124         return false;
125 }
126
127 /* If any interfaces now have no possible IPs then delete them.  This
128  * implementation is naive (i.e. simple) rather than clever
129  * (i.e. complex).  Given that this is run on delip and that operation
130  * is rare, this doesn't need to be efficient - it needs to be
131  * foolproof.  One alternative is reference counting, where the logic
132  * is distributed and can, therefore, be broken in multiple places.
133  * Another alternative is to build a red-black tree of interfaces that
134  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135  * once) and then walking ctdb->ifaces once and deleting those not in
136  * the tree.  Let's go to one of those if the naive implementation
137  * causes problems...  :-)
138  */
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140                                         struct ctdb_vnn *vnn)
141 {
142         struct ctdb_interface *i, *next;
143
144         /* For each interface, check if there's an IP using it. */
145         for (i = ctdb->ifaces; i != NULL; i = next) {
146                 struct ctdb_vnn *tv;
147                 bool found;
148                 next = i->next;
149
150                 /* Only consider interfaces named in the given VNN. */
151                 if (!vnn_has_interface_with_name(vnn, i->name)) {
152                         continue;
153                 }
154
155                 /* Is the "single IP" on this interface? */
156                 if ((ctdb->single_ip_vnn != NULL) &&
157                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159                         /* Found, next interface please... */
160                         continue;
161                 }
162                 /* Search for a vnn with this interface. */
163                 found = false;
164                 for (tv=ctdb->vnn; tv; tv=tv->next) {
165                         if (vnn_has_interface_with_name(tv, i->name)) {
166                                 found = true;
167                                 break;
168                         }
169                 }
170
171                 if (!found) {
172                         /* None of the VNNs are using this interface. */
173                         DLIST_REMOVE(ctdb->ifaces, i);
174                         talloc_free(i);
175                 }
176         }
177 }
178
179
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
181                                               const char *iface)
182 {
183         struct ctdb_interface *i;
184
185         for (i=ctdb->ifaces;i;i=i->next) {
186                 if (strcmp(i->name, iface) == 0) {
187                         return i;
188                 }
189         }
190
191         return NULL;
192 }
193
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195                                                   struct ctdb_vnn *vnn)
196 {
197         int i;
198         struct ctdb_interface *cur = NULL;
199         struct ctdb_interface *best = NULL;
200
201         for (i=0; vnn->ifaces[i]; i++) {
202
203                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
204                 if (cur == NULL) {
205                         continue;
206                 }
207
208                 if (!cur->link_up) {
209                         continue;
210                 }
211
212                 if (best == NULL) {
213                         best = cur;
214                         continue;
215                 }
216
217                 if (cur->references < best->references) {
218                         best = cur;
219                         continue;
220                 }
221         }
222
223         return best;
224 }
225
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227                                      struct ctdb_vnn *vnn)
228 {
229         struct ctdb_interface *best = NULL;
230
231         if (vnn->iface) {
232                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                                    "still assigned to iface '%s'\n",
234                                    ctdb_addr_to_str(&vnn->public_address),
235                                    ctdb_vnn_iface_string(vnn)));
236                 return 0;
237         }
238
239         best = ctdb_vnn_best_iface(ctdb, vnn);
240         if (best == NULL) {
241                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242                                   "cannot assign to iface any iface\n",
243                                   ctdb_addr_to_str(&vnn->public_address)));
244                 return -1;
245         }
246
247         vnn->iface = best;
248         best->references++;
249         vnn->pnn = ctdb->pnn;
250
251         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252                            "now assigned to iface '%s' refs[%d]\n",
253                            ctdb_addr_to_str(&vnn->public_address),
254                            ctdb_vnn_iface_string(vnn),
255                            best->references));
256         return 0;
257 }
258
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260                                     struct ctdb_vnn *vnn)
261 {
262         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263                            "now unassigned (old iface '%s' refs[%d])\n",
264                            ctdb_addr_to_str(&vnn->public_address),
265                            ctdb_vnn_iface_string(vnn),
266                            vnn->iface?vnn->iface->references:0));
267         if (vnn->iface) {
268                 vnn->iface->references--;
269         }
270         vnn->iface = NULL;
271         if (vnn->pnn == ctdb->pnn) {
272                 vnn->pnn = -1;
273         }
274 }
275
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277                                struct ctdb_vnn *vnn)
278 {
279         int i;
280
281         /* Nodes that are not RUNNING can not host IPs */
282         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
283                 return false;
284         }
285
286         if (vnn->delete_pending) {
287                 return false;
288         }
289
290         if (vnn->iface && vnn->iface->link_up) {
291                 return true;
292         }
293
294         for (i=0; vnn->ifaces[i]; i++) {
295                 struct ctdb_interface *cur;
296
297                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
298                 if (cur == NULL) {
299                         continue;
300                 }
301
302                 if (cur->link_up) {
303                         return true;
304                 }
305         }
306
307         return false;
308 }
309
310 struct ctdb_takeover_arp {
311         struct ctdb_context *ctdb;
312         uint32_t count;
313         ctdb_sock_addr addr;
314         struct ctdb_tcp_array *tcparray;
315         struct ctdb_vnn *vnn;
316 };
317
318
319 /*
320   lists of tcp endpoints
321  */
322 struct ctdb_tcp_list {
323         struct ctdb_tcp_list *prev, *next;
324         struct ctdb_connection connection;
325 };
326
327 /*
328   list of clients to kill on IP release
329  */
330 struct ctdb_client_ip {
331         struct ctdb_client_ip *prev, *next;
332         struct ctdb_context *ctdb;
333         ctdb_sock_addr addr;
334         uint32_t client_id;
335 };
336
337
338 /*
339   send a gratuitous arp
340  */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342                                   struct tevent_timer *te,
343                                   struct timeval t, void *private_data)
344 {
345         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
346                                                         struct ctdb_takeover_arp);
347         int i, ret;
348         struct ctdb_tcp_array *tcparray;
349         const char *iface = ctdb_vnn_iface_string(arp->vnn);
350
351         ret = ctdb_sys_send_arp(&arp->addr, iface);
352         if (ret != 0) {
353                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354                                   iface, strerror(errno)));
355         }
356
357         tcparray = arp->tcparray;
358         if (tcparray) {
359                 for (i=0;i<tcparray->num;i++) {
360                         struct ctdb_connection *tcon;
361
362                         tcon = &tcparray->connections[i];
363                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
365                                 ctdb_addr_to_str(&tcon->src),
366                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
367                         ret = ctdb_sys_send_tcp(
368                                 &tcon->src,
369                                 &tcon->dst,
370                                 0, 0, 0);
371                         if (ret != 0) {
372                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373                                         ctdb_addr_to_str(&tcon->src)));
374                         }
375                 }
376         }
377
378         arp->count++;
379
380         if (arp->count == CTDB_ARP_REPEAT) {
381                 talloc_free(arp);
382                 return;
383         }
384
385         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387                          ctdb_control_send_arp, arp);
388 }
389
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391                                        struct ctdb_vnn *vnn)
392 {
393         struct ctdb_takeover_arp *arp;
394         struct ctdb_tcp_array *tcparray;
395
396         if (!vnn->takeover_ctx) {
397                 vnn->takeover_ctx = talloc_new(vnn);
398                 if (!vnn->takeover_ctx) {
399                         return -1;
400                 }
401         }
402
403         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
404         if (!arp) {
405                 return -1;
406         }
407
408         arp->ctdb = ctdb;
409         arp->addr = vnn->public_address;
410         arp->vnn  = vnn;
411
412         tcparray = vnn->tcp_array;
413         if (tcparray) {
414                 /* add all of the known tcp connections for this IP to the
415                    list of tcp connections to send tickle acks for */
416                 arp->tcparray = talloc_steal(arp, tcparray);
417
418                 vnn->tcp_array = NULL;
419                 vnn->tcp_update_needed = true;
420         }
421
422         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423                          timeval_zero(), ctdb_control_send_arp, arp);
424
425         return 0;
426 }
427
428 struct takeover_callback_state {
429         struct ctdb_req_control_old *c;
430         ctdb_sock_addr *addr;
431         struct ctdb_vnn *vnn;
432 };
433
/* Private data for the takeip event callback. */
struct ctdb_do_takeip_state {
        /* Control request that is answered when the event finishes */
        struct ctdb_req_control_old *c;
        /* Public address being taken over */
        struct ctdb_vnn *vnn;
};
438
439 /*
440   called when takeip event finishes
441  */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
443                                     void *private_data)
444 {
445         struct ctdb_do_takeip_state *state =
446                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
447         int32_t ret;
448         TDB_DATA data;
449
450         if (status != 0) {
451                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
452         
453                 if (status == -ETIME) {
454                         ctdb_ban_self(ctdb);
455                 }
456                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457                                  ctdb_addr_to_str(&state->vnn->public_address),
458                                  ctdb_vnn_iface_string(state->vnn)));
459                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
460
461                 node->flags |= NODE_FLAGS_UNHEALTHY;
462                 talloc_free(state);
463                 return;
464         }
465
466         if (ctdb->do_checkpublicip) {
467
468         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469         if (ret != 0) {
470                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471                 talloc_free(state);
472                 return;
473         }
474
475         }
476
477         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478         data.dsize = strlen((char *)data.dptr) + 1;
479         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
480
481         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
482
483
484         /* the control succeeded */
485         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
486         talloc_free(state);
487         return;
488 }
489
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
491 {
492         state->vnn->update_in_flight = false;
493         return 0;
494 }
495
496 /*
497   take over an ip address
498  */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500                               struct ctdb_req_control_old *c,
501                               struct ctdb_vnn *vnn)
502 {
503         int ret;
504         struct ctdb_do_takeip_state *state;
505
506         if (vnn->update_in_flight) {
507                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508                                     "update for this IP already in flight\n",
509                                     ctdb_addr_to_str(&vnn->public_address),
510                                     vnn->public_netmask_bits));
511                 return -1;
512         }
513
514         ret = ctdb_vnn_assign_iface(ctdb, vnn);
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517                                  "assign a usable interface\n",
518                                  ctdb_addr_to_str(&vnn->public_address),
519                                  vnn->public_netmask_bits));
520                 return -1;
521         }
522
523         state = talloc(vnn, struct ctdb_do_takeip_state);
524         CTDB_NO_MEMORY(ctdb, state);
525
526         state->c = talloc_steal(ctdb, c);
527         state->vnn   = vnn;
528
529         vnn->update_in_flight = true;
530         talloc_set_destructor(state, ctdb_takeip_destructor);
531
532         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533                             ctdb_addr_to_str(&vnn->public_address),
534                             vnn->public_netmask_bits,
535                             ctdb_vnn_iface_string(vnn)));
536
537         ret = ctdb_event_script_callback(ctdb,
538                                          state,
539                                          ctdb_do_takeip_callback,
540                                          state,
541                                          CTDB_EVENT_TAKE_IP,
542                                          "%s %s %u",
543                                          ctdb_vnn_iface_string(vnn),
544                                          ctdb_addr_to_str(&vnn->public_address),
545                                          vnn->public_netmask_bits);
546
547         if (ret != 0) {
548                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549                         ctdb_addr_to_str(&vnn->public_address),
550                         ctdb_vnn_iface_string(vnn)));
551                 talloc_free(state);
552                 return -1;
553         }
554
555         return 0;
556 }
557
/* Private data for the updateip event callback. */
struct ctdb_do_updateip_state {
        /* Control request that is answered when the event finishes */
        struct ctdb_req_control_old *c;
        /* Interface the address was assigned to before the move */
        struct ctdb_interface *old;
        /* Public address being moved */
        struct ctdb_vnn *vnn;
};
563
564 /*
565   called when updateip event finishes
566  */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
568                                       void *private_data)
569 {
570         struct ctdb_do_updateip_state *state =
571                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
572         int32_t ret;
573
574         if (status != 0) {
575                 if (status == -ETIME) {
576                         ctdb_ban_self(ctdb);
577                 }
578                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579                         ctdb_addr_to_str(&state->vnn->public_address),
580                         state->old->name,
581                         ctdb_vnn_iface_string(state->vnn)));
582
583                 /*
584                  * All we can do is reset the old interface
585                  * and let the next run fix it
586                  */
587                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588                 state->vnn->iface = state->old;
589                 state->vnn->iface->references++;
590
591                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
592                 talloc_free(state);
593                 return;
594         }
595
596         if (ctdb->do_checkpublicip) {
597
598         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
599         if (ret != 0) {
600                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
601                 talloc_free(state);
602                 return;
603         }
604
605         }
606
607         /* the control succeeded */
608         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
609         talloc_free(state);
610         return;
611 }
612
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
614 {
615         state->vnn->update_in_flight = false;
616         return 0;
617 }
618
619 /*
620   update (move) an ip address
621  */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623                                 struct ctdb_req_control_old *c,
624                                 struct ctdb_vnn *vnn)
625 {
626         int ret;
627         struct ctdb_do_updateip_state *state;
628         struct ctdb_interface *old = vnn->iface;
629         const char *new_name;
630
631         if (vnn->update_in_flight) {
632                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633                                     "update for this IP already in flight\n",
634                                     ctdb_addr_to_str(&vnn->public_address),
635                                     vnn->public_netmask_bits));
636                 return -1;
637         }
638
639         ctdb_vnn_unassign_iface(ctdb, vnn);
640         ret = ctdb_vnn_assign_iface(ctdb, vnn);
641         if (ret != 0) {
642                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643                                  "assin a usable interface (old iface '%s')\n",
644                                  ctdb_addr_to_str(&vnn->public_address),
645                                  vnn->public_netmask_bits,
646                                  old->name));
647                 return -1;
648         }
649
650         new_name = ctdb_vnn_iface_string(vnn);
651         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652                 /* A benign update from one interface onto itself.
653                  * no need to run the eventscripts in this case, just return
654                  * success.
655                  */
656                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
657                 return 0;
658         }
659
660         state = talloc(vnn, struct ctdb_do_updateip_state);
661         CTDB_NO_MEMORY(ctdb, state);
662
663         state->c = talloc_steal(ctdb, c);
664         state->old = old;
665         state->vnn = vnn;
666
667         vnn->update_in_flight = true;
668         talloc_set_destructor(state, ctdb_updateip_destructor);
669
670         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671                             "interface %s to %s\n",
672                             ctdb_addr_to_str(&vnn->public_address),
673                             vnn->public_netmask_bits,
674                             old->name,
675                             new_name));
676
677         ret = ctdb_event_script_callback(ctdb,
678                                          state,
679                                          ctdb_do_updateip_callback,
680                                          state,
681                                          CTDB_EVENT_UPDATE_IP,
682                                          "%s %s %s %u",
683                                          state->old->name,
684                                          new_name,
685                                          ctdb_addr_to_str(&vnn->public_address),
686                                          vnn->public_netmask_bits);
687         if (ret != 0) {
688                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689                                  ctdb_addr_to_str(&vnn->public_address),
690                                  old->name, new_name));
691                 talloc_free(state);
692                 return -1;
693         }
694
695         return 0;
696 }
697
698 /*
699   Find the vnn of the node that has a public ip address
700   returns -1 if the address is not known as a public address
701  */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
703 {
704         struct ctdb_vnn *vnn;
705
706         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707                 if (ctdb_same_ip(&vnn->public_address, addr)) {
708                         return vnn;
709                 }
710         }
711
712         return NULL;
713 }
714
715 /*
716   take over an ip address
717  */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719                                  struct ctdb_req_control_old *c,
720                                  TDB_DATA indata,
721                                  bool *async_reply)
722 {
723         int ret;
724         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725         struct ctdb_vnn *vnn;
726         bool have_ip = false;
727         bool do_updateip = false;
728         bool do_takeip = false;
729         struct ctdb_interface *best_iface = NULL;
730
731         if (pip->pnn != ctdb->pnn) {
732                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733                                  "with pnn %d, but we're node %d\n",
734                                  ctdb_addr_to_str(&pip->addr),
735                                  pip->pnn, ctdb->pnn));
736                 return -1;
737         }
738
739         /* update out vnn list */
740         vnn = find_public_ip_vnn(ctdb, &pip->addr);
741         if (vnn == NULL) {
742                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743                         ctdb_addr_to_str(&pip->addr)));
744                 return 0;
745         }
746
747         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748                 have_ip = ctdb_sys_have_ip(&pip->addr);
749         }
750         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751         if (best_iface == NULL) {
752                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753                                  "a usable interface (old %s, have_ip %d)\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  vnn->public_netmask_bits,
756                                  ctdb_vnn_iface_string(vnn),
757                                  have_ip));
758                 return -1;
759         }
760
761         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
763                 have_ip = false;
764         }
765
766
767         if (vnn->iface == NULL && have_ip) {
768                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770                                  ctdb_addr_to_str(&vnn->public_address)));
771                 return 0;
772         }
773
774         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776                                   "and we have it on iface[%s], but it was assigned to node %d"
777                                   "and we are node %d, banning ourself\n",
778                                  ctdb_addr_to_str(&vnn->public_address),
779                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
780                 ctdb_ban_self(ctdb);
781                 return -1;
782         }
783
784         if (vnn->pnn == -1 && have_ip) {
785                 vnn->pnn = ctdb->pnn;
786                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787                                   "and we already have it on iface[%s], update local daemon\n",
788                                  ctdb_addr_to_str(&vnn->public_address),
789                                   ctdb_vnn_iface_string(vnn)));
790                 return 0;
791         }
792
793         if (vnn->iface) {
794                 if (vnn->iface != best_iface) {
795                         if (!vnn->iface->link_up) {
796                                 do_updateip = true;
797                         } else if (vnn->iface->references > (best_iface->references + 1)) {
798                                 /* only move when the rebalance gains something */
799                                         do_updateip = true;
800                         }
801                 }
802         }
803
804         if (!have_ip) {
805                 if (do_updateip) {
806                         ctdb_vnn_unassign_iface(ctdb, vnn);
807                         do_updateip = false;
808                 }
809                 do_takeip = true;
810         }
811
812         if (do_takeip) {
813                 ret = ctdb_do_takeip(ctdb, c, vnn);
814                 if (ret != 0) {
815                         return -1;
816                 }
817         } else if (do_updateip) {
818                 ret = ctdb_do_updateip(ctdb, c, vnn);
819                 if (ret != 0) {
820                         return -1;
821                 }
822         } else {
823                 /*
824                  * The interface is up and the kernel known the ip
825                  * => do nothing
826                  */
827                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828                         ctdb_addr_to_str(&pip->addr),
829                         vnn->public_netmask_bits,
830                         ctdb_vnn_iface_string(vnn)));
831                 return 0;
832         }
833
834         /* tell ctdb_control.c that we will be replying asynchronously */
835         *async_reply = true;
836
837         return 0;
838 }
839
840 /*
841   kill any clients that are registered with a IP that is being released
842  */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
844 {
845         struct ctdb_client_ip *ip;
846
847         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848                 ctdb_addr_to_str(addr)));
849
850         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851                 ctdb_sock_addr tmp_addr;
852
853                 tmp_addr = ip->addr;
854                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
855                         ip->client_id,
856                         ctdb_addr_to_str(&ip->addr)));
857
858                 if (ctdb_same_ip(&tmp_addr, addr)) {
859                         struct ctdb_client *client = reqid_find(ctdb->idr,
860                                                                 ip->client_id,
861                                                                 struct ctdb_client);
862                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
863                                 ip->client_id,
864                                 ctdb_addr_to_str(&ip->addr),
865                                 client->pid));
866
867                         if (client->pid != 0) {
868                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869                                         (unsigned)client->pid,
870                                         ctdb_addr_to_str(addr),
871                                         ip->client_id));
872                                 kill(client->pid, SIGKILL);
873                         }
874                 }
875         }
876 }
877
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
879 {
880         DLIST_REMOVE(ctdb->vnn, vnn);
881         ctdb_vnn_unassign_iface(ctdb, vnn);
882         ctdb_remove_orphaned_ifaces(ctdb, vnn);
883         talloc_free(vnn);
884 }
885
886 /*
887   called when releaseip event finishes
888  */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
890                                 void *private_data)
891 {
892         struct takeover_callback_state *state = 
893                 talloc_get_type(private_data, struct takeover_callback_state);
894         TDB_DATA data;
895
896         if (status == -ETIME) {
897                 ctdb_ban_self(ctdb);
898         }
899
900         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901                 if  (ctdb_sys_have_ip(state->addr)) {
902                         DEBUG(DEBUG_ERR,
903                               ("IP %s still hosted during release IP callback, failing\n",
904                                ctdb_addr_to_str(state->addr)));
905                         ctdb_request_control_reply(ctdb, state->c,
906                                                    NULL, -1, NULL);
907                         talloc_free(state);
908                         return;
909                 }
910         }
911
912         /* send a message to all clients of this node telling them
913            that the cluster has been reconfigured and they should
914            release any sockets on this IP */
915         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917         data.dsize = strlen((char *)data.dptr)+1;
918
919         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
920
921         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
922
923         /* kill clients that have registered with this IP */
924         release_kill_clients(ctdb, state->addr);
925
926         ctdb_vnn_unassign_iface(ctdb, state->vnn);
927
928         /* Process the IP if it has been marked for deletion */
929         if (state->vnn->delete_pending) {
930                 do_delete_ip(ctdb, state->vnn);
931                 state->vnn = NULL;
932         }
933
934         /* the control succeeded */
935         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
936         talloc_free(state);
937 }
938
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
940 {
941         if (state->vnn != NULL) {
942                 state->vnn->update_in_flight = false;
943         }
944         return 0;
945 }
946
947 /*
948   release an ip address
949  */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
951                                 struct ctdb_req_control_old *c,
952                                 TDB_DATA indata, 
953                                 bool *async_reply)
954 {
955         int ret;
956         struct takeover_callback_state *state;
957         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958         struct ctdb_vnn *vnn;
959         char *iface;
960
961         /* update our vnn list */
962         vnn = find_public_ip_vnn(ctdb, &pip->addr);
963         if (vnn == NULL) {
964                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965                         ctdb_addr_to_str(&pip->addr)));
966                 return 0;
967         }
968         vnn->pnn = pip->pnn;
969
970         /* stop any previous arps */
971         talloc_free(vnn->takeover_ctx);
972         vnn->takeover_ctx = NULL;
973
974         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975          * lazy multicast to drop an IP from any node that isn't the
976          * intended new node.  The following causes makes ctdbd ignore
977          * a release for any address it doesn't host.
978          */
979         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980                 if (!ctdb_sys_have_ip(&pip->addr)) {
981                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982                                 ctdb_addr_to_str(&pip->addr),
983                                 vnn->public_netmask_bits,
984                                 ctdb_vnn_iface_string(vnn)));
985                         ctdb_vnn_unassign_iface(ctdb, vnn);
986                         return 0;
987                 }
988         } else {
989                 if (vnn->iface == NULL) {
990                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991                                            ctdb_addr_to_str(&pip->addr),
992                                            vnn->public_netmask_bits));
993                         return 0;
994                 }
995         }
996
997         /* There is a potential race between take_ip and us because we
998          * update the VNN via a callback that run when the
999          * eventscripts have been run.  Avoid the race by allowing one
1000          * update to be in flight at a time.
1001          */
1002         if (vnn->update_in_flight) {
1003                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004                                     "update for this IP already in flight\n",
1005                                     ctdb_addr_to_str(&vnn->public_address),
1006                                     vnn->public_netmask_bits));
1007                 return -1;
1008         }
1009
1010         iface = strdup(ctdb_vnn_iface_string(vnn));
1011
1012         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1013                 ctdb_addr_to_str(&pip->addr),
1014                 vnn->public_netmask_bits,
1015                 iface,
1016                 pip->pnn));
1017
1018         state = talloc(ctdb, struct takeover_callback_state);
1019         if (state == NULL) {
1020                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021                                __FILE__, __LINE__);
1022                 free(iface);
1023                 return -1;
1024         }
1025
1026         state->c = talloc_steal(state, c);
1027         state->addr = talloc(state, ctdb_sock_addr);       
1028         if (state->addr == NULL) {
1029                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030                                __FILE__, __LINE__);
1031                 free(iface);
1032                 talloc_free(state);
1033                 return -1;
1034         }
1035         *state->addr = pip->addr;
1036         state->vnn   = vnn;
1037
1038         vnn->update_in_flight = true;
1039         talloc_set_destructor(state, ctdb_releaseip_destructor);
1040
1041         ret = ctdb_event_script_callback(ctdb, 
1042                                          state, release_ip_callback, state,
1043                                          CTDB_EVENT_RELEASE_IP,
1044                                          "%s %s %u",
1045                                          iface,
1046                                          ctdb_addr_to_str(&pip->addr),
1047                                          vnn->public_netmask_bits);
1048         free(iface);
1049         if (ret != 0) {
1050                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051                         ctdb_addr_to_str(&pip->addr),
1052                         ctdb_vnn_iface_string(vnn)));
1053                 talloc_free(state);
1054                 return -1;
1055         }
1056
1057         /* tell the control that we will be reply asynchronously */
1058         *async_reply = true;
1059         return 0;
1060 }
1061
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063                                    ctdb_sock_addr *addr,
1064                                    unsigned mask, const char *ifaces,
1065                                    bool check_address)
1066 {
1067         struct ctdb_vnn      *vnn;
1068         uint32_t num = 0;
1069         char *tmp;
1070         const char *iface;
1071         int i;
1072         int ret;
1073
1074         tmp = strdup(ifaces);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 if (!ctdb_sys_check_iface_exists(iface)) {
1077                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1078                         free(tmp);
1079                         return -1;
1080                 }
1081         }
1082         free(tmp);
1083
1084         /* Verify that we don't have an entry for this ip yet */
1085         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1088                                 ctdb_addr_to_str(addr)));
1089                         return -1;
1090                 }               
1091         }
1092
1093         /* create a new vnn structure for this ip address */
1094         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097         tmp = talloc_strdup(vnn, ifaces);
1098         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1104                 num++;
1105         }
1106         talloc_free(tmp);
1107         vnn->ifaces[num] = NULL;
1108         vnn->public_address      = *addr;
1109         vnn->public_netmask_bits = mask;
1110         vnn->pnn                 = -1;
1111         if (check_address) {
1112                 if (ctdb_sys_have_ip(addr)) {
1113                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114                         vnn->pnn = ctdb->pnn;
1115                 }
1116         }
1117
1118         for (i=0; vnn->ifaces[i]; i++) {
1119                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1120                 if (ret != 0) {
1121                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122                                            "for public_address[%s]\n",
1123                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1124                         talloc_free(vnn);
1125                         return -1;
1126                 }
1127         }
1128
1129         DLIST_ADD(ctdb->vnn, vnn);
1130
1131         return 0;
1132 }
1133
1134 /*
1135   setup the public address lists from a file
1136 */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1138 {
1139         char **lines;
1140         int nlines;
1141         int i;
1142
1143         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144         if (lines == NULL) {
1145                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1146                 return -1;
1147         }
1148         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1149                 nlines--;
1150         }
1151
1152         for (i=0;i<nlines;i++) {
1153                 unsigned mask;
1154                 ctdb_sock_addr addr;
1155                 const char *addrstr;
1156                 const char *ifaces;
1157                 char *tok, *line;
1158
1159                 line = lines[i];
1160                 while ((*line == ' ') || (*line == '\t')) {
1161                         line++;
1162                 }
1163                 if (*line == '#') {
1164                         continue;
1165                 }
1166                 if (strcmp(line, "") == 0) {
1167                         continue;
1168                 }
1169                 tok = strtok(line, " \t");
1170                 addrstr = tok;
1171                 tok = strtok(NULL, " \t");
1172                 if (tok == NULL) {
1173                         if (NULL == ctdb->default_public_interface) {
1174                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1175                                          i+1));
1176                                 talloc_free(lines);
1177                                 return -1;
1178                         }
1179                         ifaces = ctdb->default_public_interface;
1180                 } else {
1181                         ifaces = tok;
1182                 }
1183
1184                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1186                         talloc_free(lines);
1187                         return -1;
1188                 }
1189                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1191                         talloc_free(lines);
1192                         return -1;
1193                 }
1194         }
1195
1196
1197         talloc_free(lines);
1198         return 0;
1199 }
1200
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1202                               const char *iface,
1203                               const char *ip)
1204 {
1205         struct ctdb_vnn *svnn;
1206         struct ctdb_interface *cur = NULL;
1207         bool ok;
1208         int ret;
1209
1210         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211         CTDB_NO_MEMORY(ctdb, svnn);
1212
1213         svnn->ifaces = talloc_array(svnn, const char *, 2);
1214         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217         svnn->ifaces[1] = NULL;
1218
1219         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1220         if (!ok) {
1221                 talloc_free(svnn);
1222                 return -1;
1223         }
1224
1225         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1226         if (ret != 0) {
1227                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228                                    "for single_ip[%s]\n",
1229                                    svnn->ifaces[0],
1230                                    ctdb_addr_to_str(&svnn->public_address)));
1231                 talloc_free(svnn);
1232                 return -1;
1233         }
1234
1235         /* assume the single public ip interface is initially "good" */
1236         cur = ctdb_find_iface(ctdb, iface);
1237         if (cur == NULL) {
1238                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1239                 return -1;
1240         }
1241         cur->link_up = true;
1242
1243         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1244         if (ret != 0) {
1245                 talloc_free(svnn);
1246                 return -1;
1247         }
1248
1249         ctdb->single_ip_vnn = svnn;
1250         return 0;
1251 }
1252
/* One public IP address in the singly linked list used by the IP
 * allocation code in this file. */
struct public_ip_list {
        /* next entry in the list, NULL at the end */
        struct public_ip_list *next;
        /* node the address is assigned to; the allocation code
         * compares this against -1 to detect an unassigned address */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1258
1259 /* Given a physical node, return the number of
1260    public addresses that is currently assigned to this node.
1261 */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1263 {
1264         int num=0;
1265
1266         for (;ips;ips=ips->next) {
1267                 if (ips->pnn == pnn) {
1268                         num++;
1269                 }
1270         }
1271         return num;
1272 }
1273
1274
1275 /* Can the given node host the given IP: is the public IP known to the
1276  * node and is NOIPHOST unset?
1277 */
1278 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1279                              int32_t pnn,
1280                              struct ctdb_ipflags ipflags,
1281                              struct public_ip_list *ip)
1282 {
1283         struct ctdb_public_ip_list_old *public_ips;
1284         int i;
1285
1286         if (ipflags.noiphost) {
1287                 return false;
1288         }
1289
1290         public_ips = ipalloc_state->available_public_ips[pnn];
1291
1292         if (public_ips == NULL) {
1293                 return false;
1294         }
1295
1296         for (i=0; i<public_ips->num; i++) {
1297                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1298                         /* yes, this node can serve this public ip */
1299                         return true;
1300                 }
1301         }
1302
1303         return false;
1304 }
1305
1306 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1307                                  int32_t pnn,
1308                                  struct ctdb_ipflags ipflags,
1309                                  struct public_ip_list *ip)
1310 {
1311         if (ipflags.noiptakeover) {
1312                 return false;
1313         }
1314
1315         return can_node_host_ip(ipalloc_state, pnn, ipflags, ip);
1316 }
1317
1318 /* search the node lists list for a node to takeover this ip.
1319    pick the node that currently are serving the least number of ips
1320    so that the ips get spread out evenly.
1321 */
1322 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1323                               struct ctdb_ipflags *ipflags,
1324                               struct public_ip_list *ip,
1325                               struct public_ip_list *all_ips)
1326 {
1327         int pnn, min=0, num;
1328         int i, numnodes;
1329
1330         numnodes = talloc_array_length(ipflags);
1331         pnn    = -1;
1332         for (i=0; i<numnodes; i++) {
1333                 /* verify that this node can serve this ip */
1334                 if (!can_node_takeover_ip(ipalloc_state, i, ipflags[i], ip)) {
1335                         /* no it couldnt   so skip to the next node */
1336                         continue;
1337                 }
1338
1339                 num = node_ip_coverage(i, all_ips);
1340                 /* was this the first node we checked ? */
1341                 if (pnn == -1) {
1342                         pnn = i;
1343                         min  = num;
1344                 } else {
1345                         if (num < min) {
1346                                 pnn = i;
1347                                 min  = num;
1348                         }
1349                 }
1350         }
1351         if (pnn == -1) {
1352                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1353                         ctdb_addr_to_str(&ip->addr)));
1354
1355                 return -1;
1356         }
1357
1358         ip->pnn = pnn;
1359         return 0;
1360 }
1361
1362 #define IP_KEYLEN       4
1363 static uint32_t *ip_key(ctdb_sock_addr *ip)
1364 {
1365         static uint32_t key[IP_KEYLEN];
1366
1367         bzero(key, sizeof(key));
1368
1369         switch (ip->sa.sa_family) {
1370         case AF_INET:
1371                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1372                 break;
1373         case AF_INET6: {
1374                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1375                 key[0]  = htonl(s6_a32[0]);
1376                 key[1]  = htonl(s6_a32[1]);
1377                 key[2]  = htonl(s6_a32[2]);
1378                 key[3]  = htonl(s6_a32[3]);
1379                 break;
1380         }
1381         default:
1382                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1383                 return key;
1384         }
1385
1386         return key;
1387 }
1388
1389 static void *add_ip_callback(void *parm, void *data)
1390 {
1391         struct public_ip_list *this_ip = parm;
1392         struct public_ip_list *prev_ip = data;
1393
1394         if (prev_ip == NULL) {
1395                 return parm;
1396         }
1397         if (this_ip->pnn == -1) {
1398                 this_ip->pnn = prev_ip->pnn;
1399         }
1400
1401         return parm;
1402 }
1403
1404 static int getips_count_callback(void *param, void *data)
1405 {
1406         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1407         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1408
1409         new_ip->next = *ip_list;
1410         *ip_list     = new_ip;
1411         return 0;
1412 }
1413
1414 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1415                                        struct ctdb_public_ip_list_old *ips,
1416                                        uint32_t pnn);
1417
1418 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1419                                          struct ipalloc_state *ipalloc_state,
1420                                          struct ctdb_node_map_old *nodemap)
1421 {
1422         int j;
1423         int ret;
1424
1425         if (ipalloc_state->num != nodemap->num) {
1426                 DEBUG(DEBUG_ERR,
1427                       (__location__
1428                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1429                        ipalloc_state->num, nodemap->num));
1430                 return -1;
1431         }
1432
1433         for (j=0; j<nodemap->num; j++) {
1434                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1435                         continue;
1436                 }
1437
1438                 /* Retrieve the list of known public IPs from the node */
1439                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1440                                         TAKEOVER_TIMEOUT(),
1441                                         j,
1442                                         ctdb->nodes,
1443                                         0,
1444                                         &ipalloc_state->known_public_ips[j]);
1445                 if (ret != 0) {
1446                         DEBUG(DEBUG_ERR,
1447                               ("Failed to read known public IPs from node: %u\n",
1448                                j));
1449                         return -1;
1450                 }
1451
1452                 if (ctdb->do_checkpublicip) {
1453                         verify_remote_ip_allocation(ctdb,
1454                                                     ipalloc_state->known_public_ips[j],
1455                                                     j);
1456                 }
1457
1458                 /* Retrieve the list of available public IPs from the node */
1459                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1460                                         TAKEOVER_TIMEOUT(),
1461                                         j,
1462                                         ctdb->nodes,
1463                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1464                                         &ipalloc_state->available_public_ips[j]);
1465                 if (ret != 0) {
1466                         DEBUG(DEBUG_ERR,
1467                               ("Failed to read available public IPs from node: %u\n",
1468                                j));
1469                         return -1;
1470                 }
1471         }
1472
1473         return 0;
1474 }
1475
1476 static struct public_ip_list *
1477 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1478 {
1479         int i, j;
1480         struct public_ip_list *ip_list;
1481         struct ctdb_public_ip_list_old *public_ips;
1482
1483         TALLOC_FREE(ctdb->ip_tree);
1484         ctdb->ip_tree = trbt_create(ctdb, 0);
1485
1486         for (i=0; i < ctdb->num_nodes; i++) {
1487                 public_ips = ipalloc_state->known_public_ips[i];
1488
1489                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1490                         continue;
1491                 }
1492
1493                 /* there were no public ips for this node */
1494                 if (public_ips == NULL) {
1495                         continue;
1496                 }
1497
1498                 for (j=0; j < public_ips->num; j++) {
1499                         struct public_ip_list *tmp_ip;
1500
1501                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1502                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503                         /* Do not use information about IP addresses hosted
1504                          * on other nodes, it may not be accurate */
1505                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1507                         } else {
1508                                 tmp_ip->pnn = -1;
1509                         }
1510                         tmp_ip->addr = public_ips->ips[j].addr;
1511                         tmp_ip->next = NULL;
1512
1513                         trbt_insertarray32_callback(ctdb->ip_tree,
1514                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1515                                 add_ip_callback,
1516                                 tmp_ip);
1517                 }
1518         }
1519
1520         ip_list = NULL;
1521         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1522
1523         return ip_list;
1524 }
1525
1526 /* 
1527  * This is the length of the longtest common prefix between the IPs.
1528  * It is calculated by XOR-ing the 2 IPs together and counting the
1529  * number of leading zeroes.  The implementation means that all
1530  * addresses end up being 128 bits long.
1531  *
1532  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534  * lots of nodes and IP addresses?
1535  */
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1537 {
1538         uint32_t ip1_k[IP_KEYLEN];
1539         uint32_t *t;
1540         int i;
1541         uint32_t x;
1542
1543         uint32_t distance = 0;
1544
1545         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1546         t = ip_key(ip2);
1547         for (i=0; i<IP_KEYLEN; i++) {
1548                 x = ip1_k[i] ^ t[i];
1549                 if (x == 0) {
1550                         distance += 32;
1551                 } else {
1552                         /* Count number of leading zeroes. 
1553                          * FIXME? This could be optimised...
1554                          */
1555                         while ((x & (1 << 31)) == 0) {
1556                                 x <<= 1;
1557                                 distance += 1;
1558                         }
1559                 }
1560         }
1561
1562         return distance;
1563 }
1564
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566    given node.  The ips argument is generally the all_ips variable
1567    used in the main part of the algorithm.
1568  */
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570                                   struct public_ip_list *ips,
1571                                   int pnn)
1572 {
1573         struct public_ip_list *t;
1574         uint32_t d;
1575
1576         uint32_t sum = 0;
1577
1578         for (t=ips; t != NULL; t=t->next) {
1579                 if (t->pnn != pnn) {
1580                         continue;
1581                 }
1582
1583                 /* Optimisation: We never calculate the distance
1584                  * between an address and itself.  This allows us to
1585                  * calculate the effect of removing an address from a
1586                  * node by simply calculating the distance between
1587                  * that address and all of the exitsing addresses.
1588                  * Moreover, we assume that we're only ever dealing
1589                  * with addresses from all_ips so we can identify an
1590                  * address via a pointer rather than doing a more
1591                  * expensive address comparison. */
1592                 if (&(t->addr) == ip) {
1593                         continue;
1594                 }
1595
1596                 d = ip_distance(ip, &(t->addr));
1597                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1598         }
1599
1600         return sum;
1601 }
1602
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
1604    to the given node.
1605  */
1606 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1607 {
1608         struct public_ip_list *t;
1609
1610         uint32_t imbalance = 0;
1611
1612         for (t=all_ips; t!=NULL; t=t->next) {
1613                 if (t->pnn != pnn) {
1614                         continue;
1615                 }
1616                 /* Pass the rest of the IPs rather than the whole
1617                    all_ips input list.
1618                 */
1619                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1620         }
1621
1622         return imbalance;
1623 }
1624
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626  * finding the best node for each.
1627  */
1628 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1629                                       struct ctdb_ipflags *ipflags,
1630                                       struct public_ip_list *all_ips)
1631 {
1632         struct public_ip_list *tmp_ip;
1633
1634         /* loop over all ip's and find a physical node to cover for
1635            each unassigned ip.
1636         */
1637         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638                 if (tmp_ip->pnn == -1) {
1639                         if (find_takeover_node(ipalloc_state, ipflags,
1640                                                tmp_ip, all_ips)) {
1641                                 DEBUG(DEBUG_WARNING,
1642                                       ("Failed to find node to cover ip %s\n",
1643                                        ctdb_addr_to_str(&tmp_ip->addr)));
1644                         }
1645                 }
1646         }
1647 }
1648
1649 /* Basic non-deterministic rebalancing algorithm.
1650  */
1651 static void basic_failback(struct ipalloc_state *ipalloc_state,
1652                            struct ctdb_ipflags *ipflags,
1653                            struct public_ip_list *all_ips,
1654                            int num_ips)
1655 {
1656         int i, numnodes;
1657         int maxnode, maxnum, minnode, minnum, num, retries;
1658         struct public_ip_list *tmp_ip;
1659
1660         numnodes = talloc_array_length(ipflags);
1661         retries = 0;
1662
1663 try_again:
1664         maxnum=0;
1665         minnum=0;
1666
1667         /* for each ip address, loop over all nodes that can serve
1668            this ip and make sure that the difference between the node
1669            serving the most and the node serving the least ip's are
1670            not greater than 1.
1671         */
1672         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1673                 if (tmp_ip->pnn == -1) {
1674                         continue;
1675                 }
1676
1677                 /* Get the highest and lowest number of ips's served by any 
1678                    valid node which can serve this ip.
1679                 */
1680                 maxnode = -1;
1681                 minnode = -1;
1682                 for (i=0; i<numnodes; i++) {
1683                         /* only check nodes that can actually serve this ip */
1684                         if (!can_node_takeover_ip(ipalloc_state, i,
1685                                                   ipflags[i], tmp_ip)) {
1686                                 /* no it couldnt   so skip to the next node */
1687                                 continue;
1688                         }
1689
1690                         num = node_ip_coverage(i, all_ips);
1691                         if (maxnode == -1) {
1692                                 maxnode = i;
1693                                 maxnum  = num;
1694                         } else {
1695                                 if (num > maxnum) {
1696                                         maxnode = i;
1697                                         maxnum  = num;
1698                                 }
1699                         }
1700                         if (minnode == -1) {
1701                                 minnode = i;
1702                                 minnum  = num;
1703                         } else {
1704                                 if (num < minnum) {
1705                                         minnode = i;
1706                                         minnum  = num;
1707                                 }
1708                         }
1709                 }
1710                 if (maxnode == -1) {
1711                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1712                                 ctdb_addr_to_str(&tmp_ip->addr)));
1713
1714                         continue;
1715                 }
1716
1717                 /* if the spread between the smallest and largest coverage by
1718                    a node is >=2 we steal one of the ips from the node with
1719                    most coverage to even things out a bit.
1720                    try to do this a limited number of times since we dont
1721                    want to spend too much time balancing the ip coverage.
1722                 */
1723                 if ( (maxnum > minnum+1)
1724                      && (retries < (num_ips + 5)) ){
1725                         struct public_ip_list *tmp;
1726
1727                         /* Reassign one of maxnode's VNNs */
1728                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1729                                 if (tmp->pnn == maxnode) {
1730                                         (void)find_takeover_node(ipalloc_state,
1731                                                                  ipflags,
1732                                                                  tmp,
1733                                                                  all_ips);
1734                                         retries++;
1735                                         goto try_again;;
1736                                 }
1737                         }
1738                 }
1739         }
1740 }
1741
1742 static bool lcp2_init(TALLOC_CTX *tmp_ctx,
1743                       struct ctdb_ipflags *ipflags,
1744                       struct public_ip_list *all_ips,
1745                       uint32_t *force_rebalance_nodes,
1746                       uint32_t **lcp2_imbalances,
1747                       bool **rebalance_candidates)
1748 {
1749         int i, numnodes;
1750         struct public_ip_list *tmp_ip;
1751
1752         numnodes = talloc_array_length(ipflags);
1753
1754         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1755         if (*rebalance_candidates == NULL) {
1756                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1757                 return false;
1758         }
1759         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1760         if (*lcp2_imbalances == NULL) {
1761                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1762                 return false;
1763         }
1764
1765         for (i=0; i<numnodes; i++) {
1766                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1767                 /* First step: assume all nodes are candidates */
1768                 (*rebalance_candidates)[i] = true;
1769         }
1770
1771         /* 2nd step: if a node has IPs assigned then it must have been
1772          * healthy before, so we remove it from consideration.  This
1773          * is overkill but is all we have because we don't maintain
1774          * state between takeover runs.  An alternative would be to
1775          * keep state and invalidate it every time the recovery master
1776          * changes.
1777          */
1778         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1779                 if (tmp_ip->pnn != -1) {
1780                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1781                 }
1782         }
1783
1784         /* 3rd step: if a node is forced to re-balance then
1785            we allow failback onto the node */
1786         if (force_rebalance_nodes == NULL) {
1787                 return true;
1788         }
1789         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1790                 uint32_t pnn = force_rebalance_nodes[i];
1791                 if (pnn >= numnodes) {
1792                         DEBUG(DEBUG_ERR,
1793                               (__location__ "unknown node %u\n", pnn));
1794                         continue;
1795                 }
1796
1797                 DEBUG(DEBUG_NOTICE,
1798                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1799                 (*rebalance_candidates)[pnn] = true;
1800         }
1801
1802         return true;
1803 }
1804
1805 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1806  * the IP/node combination that will cost the least.
1807  */
1808 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1809                                      struct ctdb_ipflags *ipflags,
1810                                      struct public_ip_list *all_ips,
1811                                      uint32_t *lcp2_imbalances)
1812 {
1813         struct public_ip_list *tmp_ip;
1814         int dstnode, numnodes;
1815
1816         int minnode;
1817         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1818         struct public_ip_list *minip;
1819
1820         bool should_loop = true;
1821         bool have_unassigned = true;
1822
1823         numnodes = talloc_array_length(ipflags);
1824
1825         while (have_unassigned && should_loop) {
1826                 should_loop = false;
1827
1828                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1829                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1830
1831                 minnode = -1;
1832                 mindsum = 0;
1833                 minip = NULL;
1834
1835                 /* loop over each unassigned ip. */
1836                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1837                         if (tmp_ip->pnn != -1) {
1838                                 continue;
1839                         }
1840
1841                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1842                                 /* only check nodes that can actually takeover this ip */
1843                                 if (!can_node_takeover_ip(ipalloc_state,
1844                                                           dstnode,
1845                                                           ipflags[dstnode],
1846                                                           tmp_ip)) {
1847                                         /* no it couldnt   so skip to the next node */
1848                                         continue;
1849                                 }
1850
1851                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1852                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1853                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1854                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1855                                                    dstnode,
1856                                                    dstimbl - lcp2_imbalances[dstnode]));
1857
1858
1859                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1860                                         minnode = dstnode;
1861                                         minimbl = dstimbl;
1862                                         mindsum = dstdsum;
1863                                         minip = tmp_ip;
1864                                         should_loop = true;
1865                                 }
1866                         }
1867                 }
1868
1869                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1870
1871                 /* If we found one then assign it to the given node. */
1872                 if (minnode != -1) {
1873                         minip->pnn = minnode;
1874                         lcp2_imbalances[minnode] = minimbl;
1875                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1876                                           ctdb_addr_to_str(&(minip->addr)),
1877                                           minnode,
1878                                           mindsum));
1879                 }
1880
1881                 /* There might be a better way but at least this is clear. */
1882                 have_unassigned = false;
1883                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1884                         if (tmp_ip->pnn == -1) {
1885                                 have_unassigned = true;
1886                         }
1887                 }
1888         }
1889
1890         /* We know if we have an unassigned addresses so we might as
1891          * well optimise.
1892          */
1893         if (have_unassigned) {
1894                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1895                         if (tmp_ip->pnn == -1) {
1896                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1897                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1898                         }
1899                 }
1900         }
1901 }
1902
1903 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1904  * to move IPs from, determines the best IP/destination node
1905  * combination to move from the source node.
1906  */
1907 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1908                                     struct ctdb_ipflags *ipflags,
1909                                     struct public_ip_list *all_ips,
1910                                     int srcnode,
1911                                     uint32_t *lcp2_imbalances,
1912                                     bool *rebalance_candidates)
1913 {
1914         int dstnode, mindstnode, numnodes;
1915         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1916         uint32_t minsrcimbl, mindstimbl;
1917         struct public_ip_list *minip;
1918         struct public_ip_list *tmp_ip;
1919
1920         /* Find an IP and destination node that best reduces imbalance. */
1921         srcimbl = 0;
1922         minip = NULL;
1923         minsrcimbl = 0;
1924         mindstnode = -1;
1925         mindstimbl = 0;
1926
1927         numnodes = talloc_array_length(ipflags);
1928
1929         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1930         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1931                            srcnode, lcp2_imbalances[srcnode]));
1932
1933         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1934                 /* Only consider addresses on srcnode. */
1935                 if (tmp_ip->pnn != srcnode) {
1936                         continue;
1937                 }
1938
1939                 /* What is this IP address costing the source node? */
1940                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1941                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1942
1943                 /* Consider this IP address would cost each potential
1944                  * destination node.  Destination nodes are limited to
1945                  * those that are newly healthy, since we don't want
1946                  * to do gratuitous failover of IPs just to make minor
1947                  * balance improvements.
1948                  */
1949                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1950                         if (!rebalance_candidates[dstnode]) {
1951                                 continue;
1952                         }
1953
1954                         /* only check nodes that can actually takeover this ip */
1955                         if (!can_node_takeover_ip(ipalloc_state, dstnode,
1956                                                   ipflags[dstnode], tmp_ip)) {
1957                                 /* no it couldnt   so skip to the next node */
1958                                 continue;
1959                         }
1960
1961                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1962                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1963                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1964                                            srcnode, -srcdsum,
1965                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1966                                            dstnode, dstdsum));
1967
1968                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1969                             (dstdsum < srcdsum) &&                      \
1970                             ((mindstnode == -1) ||                              \
1971                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1972
1973                                 minip = tmp_ip;
1974                                 minsrcimbl = srcimbl;
1975                                 mindstnode = dstnode;
1976                                 mindstimbl = dstimbl;
1977                         }
1978                 }
1979         }
1980         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1981
1982         if (mindstnode != -1) {
1983                 /* We found a move that makes things better... */
1984                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1985                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1986                                   ctdb_addr_to_str(&(minip->addr)),
1987                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1988
1989
1990                 lcp2_imbalances[srcnode] = minsrcimbl;
1991                 lcp2_imbalances[mindstnode] = mindstimbl;
1992                 minip->pnn = mindstnode;
1993
1994                 return true;
1995         }
1996
1997         return false;
1998         
1999 }
2000
/* A node number paired with its LCP2 imbalance, so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator ordering struct lcp2_imbalance_pnn entries by
 * descending imbalance: returns -1 when a is more imbalanced than b,
 * 1 when it is less imbalanced and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	/* Each comparison yields 0 or 1, so the difference is exactly
	 * -1, 0 or 1 and cannot overflow.
	 */
	return (rhs->imbalance > lhs->imbalance) -
	       (lhs->imbalance > rhs->imbalance);
}
2019
2020 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2021  * node with the highest LCP2 imbalance, and then determines the best
2022  * IP/destination node combination to move from the source node.
2023  */
2024 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2025                           struct ctdb_ipflags *ipflags,
2026                           struct public_ip_list *all_ips,
2027                           uint32_t *lcp2_imbalances,
2028                           bool *rebalance_candidates)
2029 {
2030         int i, numnodes;
2031         struct lcp2_imbalance_pnn * lips;
2032         bool again;
2033
2034         numnodes = talloc_array_length(ipflags);
2035
2036 try_again:
2037         /* Put the imbalances and nodes into an array, sort them and
2038          * iterate through candidates.  Usually the 1st one will be
2039          * used, so this doesn't cost much...
2040          */
2041         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2042         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2043         lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2044         for (i=0; i<numnodes; i++) {
2045                 lips[i].imbalance = lcp2_imbalances[i];
2046                 lips[i].pnn = i;
2047                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2048         }
2049         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2050               lcp2_cmp_imbalance_pnn);
2051
2052         again = false;
2053         for (i=0; i<numnodes; i++) {
2054                 /* This means that all nodes had 0 or 1 addresses, so
2055                  * can't be imbalanced.
2056                  */
2057                 if (lips[i].imbalance == 0) {
2058                         break;
2059                 }
2060
2061                 if (lcp2_failback_candidate(ipalloc_state,
2062                                             ipflags,
2063                                             all_ips,
2064                                             lips[i].pnn,
2065                                             lcp2_imbalances,
2066                                             rebalance_candidates)) {
2067                         again = true;
2068                         break;
2069                 }
2070         }
2071
2072         talloc_free(lips);
2073         if (again) {
2074                 goto try_again;
2075         }
2076 }
2077
2078 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
2079                                     struct ctdb_ipflags *ipflags,
2080                                     struct public_ip_list *all_ips)
2081 {
2082         struct public_ip_list *tmp_ip;
2083
2084         /* verify that the assigned nodes can serve that public ip
2085            and set it to -1 if not
2086         */
2087         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2088                 if (tmp_ip->pnn == -1) {
2089                         continue;
2090                 }
2091                 if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
2092                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2093                         /* this node can not serve this ip. */
2094                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2095                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2096                                            tmp_ip->pnn));
2097                         tmp_ip->pnn = -1;
2098                 }
2099         }
2100 }
2101
2102 static bool ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
2103                                        struct ctdb_ipflags *ipflags,
2104                                        struct public_ip_list *all_ips)
2105 {
2106         struct public_ip_list *tmp_ip;
2107         int i, numnodes;
2108
2109         numnodes = talloc_array_length(ipflags);
2110
2111         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2112        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2113         *  always be allocated the same way for a specific set of
2114         *  available/unavailable nodes.
2115         */
2116
2117         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2118                 tmp_ip->pnn = i % numnodes;
2119         }
2120
2121         /* IP failback doesn't make sense with deterministic
2122          * IPs, since the modulo step above implicitly fails
2123          * back IPs to their "home" node.
2124          */
2125         if (1 == ipalloc_state->no_ip_failback) {
2126                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2127         }
2128
2129         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2130
2131         basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2132
2133         /* No failback here! */
2134
2135         return true;
2136 }
2137
2138 static bool ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
2139                                           struct ctdb_ipflags *ipflags,
2140                                           struct public_ip_list *all_ips)
2141 {
2142         /* This should be pushed down into basic_failback. */
2143         struct public_ip_list *tmp_ip;
2144         int num_ips = 0;
2145         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2146                 num_ips++;
2147         }
2148
2149         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2150
2151         basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2152
2153         /* If we don't want IPs to fail back then don't rebalance IPs. */
2154         if (1 == ipalloc_state->no_ip_failback) {
2155                 return true;
2156         }
2157
2158         /* Now, try to make sure the ip adresses are evenly distributed
2159            across the nodes.
2160         */
2161         basic_failback(ipalloc_state, ipflags, all_ips, num_ips);
2162
2163         return true;
2164 }
2165
2166 static bool ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
2167                           struct ctdb_ipflags *ipflags,
2168                           struct public_ip_list *all_ips,
2169                           uint32_t *force_rebalance_nodes)
2170 {
2171         uint32_t *lcp2_imbalances;
2172         bool *rebalance_candidates;
2173         int numnodes, num_rebalance_candidates, i;
2174         bool ret = true;
2175
2176         TALLOC_CTX *tmp_ctx = talloc_new(ipalloc_state);
2177
2178         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2179
2180         if (!lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2181                        &lcp2_imbalances, &rebalance_candidates)) {
2182                 ret = false;
2183                 goto finished;
2184         }
2185
2186         lcp2_allocate_unassigned(ipalloc_state, ipflags, all_ips, lcp2_imbalances);
2187
2188         /* If we don't want IPs to fail back then don't rebalance IPs. */
2189         if (1 == ipalloc_state->no_ip_failback) {
2190                 goto finished;
2191         }
2192
2193         /* It is only worth continuing if we have suitable target
2194          * nodes to transfer IPs to.  This check is much cheaper than
2195          * continuing on...
2196          */
2197         numnodes = talloc_array_length(ipflags);
2198         num_rebalance_candidates = 0;
2199         for (i=0; i<numnodes; i++) {
2200                 if (rebalance_candidates[i]) {
2201                         num_rebalance_candidates++;
2202                 }
2203         }
2204         if (num_rebalance_candidates == 0) {
2205                 goto finished;
2206         }
2207
2208         /* Now, try to make sure the ip adresses are evenly distributed
2209            across the nodes.
2210         */
2211         lcp2_failback(ipalloc_state, ipflags, all_ips,
2212                       lcp2_imbalances, rebalance_candidates);
2213
2214 finished:
2215         talloc_free(tmp_ctx);
2216
2217         return ret;
2218 }
2219
2220 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2221 {
2222         int i;
2223
2224         for (i=0;i<nodemap->num;i++) {
2225                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2226                         /* Found one completely healthy node */
2227                         return false;
2228                 }
2229         }
2230
2231         return true;
2232 }
2233
2234 /* The calculation part of the IP allocation algorithm. */
2235 static bool ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
2236                                    struct ctdb_ipflags *ipflags,
2237                                    struct public_ip_list *all_ips,
2238                                    uint32_t *force_rebalance_nodes)
2239 {
2240         bool ret;
2241
2242         switch (ipalloc_state->algorithm) {
2243         case IPALLOC_LCP2:
2244                 ret = ip_alloc_lcp2(ipalloc_state, ipflags, all_ips,
2245                                     force_rebalance_nodes);
2246                 break;
2247         case IPALLOC_DETERMINISTIC:
2248                 ret = ip_alloc_deterministic_ips(ipalloc_state, ipflags, all_ips);
2249                 break;
2250         case IPALLOC_NONDETERMINISTIC:
2251                 ret = ip_alloc_nondeterministic_ips(ipalloc_state, ipflags, all_ips);
2252                break;
2253         }
2254
2255         /* at this point ->pnn is the node which will own each IP
2256            or -1 if there is no node that can cover this ip
2257         */
2258
2259         return ret;
2260 }
2261
2262 struct get_tunable_callback_data {
2263         const char *tunable;
2264         uint32_t *out;
2265         bool fatal;
2266 };
2267
2268 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2269                                  int32_t res, TDB_DATA outdata,
2270                                  void *callback)
2271 {
2272         struct get_tunable_callback_data *cd =
2273                 (struct get_tunable_callback_data *)callback;
2274         int size;
2275
2276         if (res != 0) {
2277                 /* Already handled in fail callback */
2278                 return;
2279         }
2280
2281         if (outdata.dsize != sizeof(uint32_t)) {
2282                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2283                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2284                                  (int)outdata.dsize));
2285                 cd->fatal = true;
2286                 return;
2287         }
2288
2289         size = talloc_array_length(cd->out);
2290         if (pnn >= size) {
2291                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2292                                  cd->tunable, pnn, size));
2293                 return;
2294         }
2295
2296                 
2297         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2298 }
2299
2300 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2301                                        int32_t res, TDB_DATA outdata,
2302                                        void *callback)
2303 {
2304         struct get_tunable_callback_data *cd =
2305                 (struct get_tunable_callback_data *)callback;
2306
2307         switch (res) {
2308         case -ETIME:
2309                 DEBUG(DEBUG_ERR,
2310                       ("Timed out getting tunable \"%s\" from node %d\n",
2311                        cd->tunable, pnn));
2312                 cd->fatal = true;
2313                 break;
2314         case -EINVAL:
2315         case -1:
2316                 DEBUG(DEBUG_WARNING,
2317                       ("Tunable \"%s\" not implemented on node %d\n",
2318                        cd->tunable, pnn));
2319                 break;
2320         default:
2321                 DEBUG(DEBUG_ERR,
2322                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2323                        cd->tunable, pnn));
2324                 cd->fatal = true;
2325         }
2326 }
2327
2328 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2329                                         TALLOC_CTX *tmp_ctx,
2330                                         struct ctdb_node_map_old *nodemap,
2331                                         const char *tunable,
2332                                         uint32_t default_value)
2333 {
2334         TDB_DATA data;
2335         struct ctdb_control_get_tunable *t;
2336         uint32_t *nodes;
2337         uint32_t *tvals;
2338         struct get_tunable_callback_data callback_data;
2339         int i;
2340
2341         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2342         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2343         for (i=0; i<nodemap->num; i++) {
2344                 tvals[i] = default_value;
2345         }
2346                 
2347         callback_data.out = tvals;
2348         callback_data.tunable = tunable;
2349         callback_data.fatal = false;
2350
2351         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2352         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2353         t = (struct ctdb_control_get_tunable *)data.dptr;
2354         t->length = strlen(tunable)+1;
2355         memcpy(t->name, tunable, t->length);
2356         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2357         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2358                                       nodes, 0, TAKEOVER_TIMEOUT(),
2359                                       false, data,
2360                                       get_tunable_callback,
2361                                       get_tunable_fail_callback,
2362                                       &callback_data) != 0) {
2363                 if (callback_data.fatal) {
2364                         talloc_free(tvals);
2365                         tvals = NULL;
2366                 }
2367         }
2368         talloc_free(nodes);
2369         talloc_free(data.dptr);
2370
2371         return tvals;
2372 }
2373
2374 /* Set internal flags for IP allocation:
2375  *   Clear ip flags
2376  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2377  *   Set NOIPHOST ip flag for each INACTIVE node
2378  *   if all nodes are disabled:
2379  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2380  *   else
2381  *     Set NOIPHOST ip flags for disabled nodes
2382  */
2383 static struct ctdb_ipflags *
2384 set_ipflags_internal(TALLOC_CTX *tmp_ctx,
2385                      struct ctdb_node_map_old *nodemap,
2386                      uint32_t *tval_noiptakeover,
2387                      uint32_t *tval_noiphostonalldisabled)
2388 {
2389         int i;
2390         struct ctdb_ipflags *ipflags;
2391
2392         /* Clear IP flags - implicit due to talloc_zero */
2393         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2394         if (ipflags == NULL) {
2395                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
2396                 return NULL;
2397         }
2398
2399         for (i=0;i<nodemap->num;i++) {
2400                 /* Can not take IPs on node with NoIPTakeover set */
2401                 if (tval_noiptakeover[i] != 0) {
2402                         ipflags[i].noiptakeover = true;
2403                 }
2404
2405                 /* Can not host IPs on INACTIVE node */
2406                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2407                         ipflags[i].noiphost = true;
2408                 }
2409         }
2410
2411         if (all_nodes_are_disabled(nodemap)) {
2412                 /* If all nodes are disabled, can not host IPs on node
2413                  * with NoIPHostOnAllDisabled set
2414                  */
2415                 for (i=0;i<nodemap->num;i++) {
2416                         if (tval_noiphostonalldisabled[i] != 0) {
2417                                 ipflags[i].noiphost = true;
2418                         }
2419                 }
2420         } else {
2421                 /* If some nodes are not disabled, then can not host
2422                  * IPs on DISABLED node
2423                  */
2424                 for (i=0;i<nodemap->num;i++) {
2425                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2426                                 ipflags[i].noiphost = true;
2427                         }
2428                 }
2429         }
2430
2431         return ipflags;
2432 }
2433
2434 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2435                                         TALLOC_CTX *tmp_ctx,
2436                                         struct ctdb_node_map_old *nodemap)
2437 {
2438         uint32_t *tval_noiptakeover;
2439         uint32_t *tval_noiphostonalldisabled;
2440         struct ctdb_ipflags *ipflags;
2441
2442
2443         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2444                                                    "NoIPTakeover", 0);
2445         if (tval_noiptakeover == NULL) {
2446                 return NULL;
2447         }
2448
2449         tval_noiphostonalldisabled =
2450                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2451                                        "NoIPHostOnAllDisabled", 0);
2452         if (tval_noiphostonalldisabled == NULL) {
2453                 /* Caller frees tmp_ctx */
2454                 return NULL;
2455         }
2456
2457         ipflags = set_ipflags_internal(tmp_ctx, nodemap,
2458                                        tval_noiptakeover,
2459                                        tval_noiphostonalldisabled);
2460
2461         talloc_free(tval_noiptakeover);
2462         talloc_free(tval_noiphostonalldisabled);
2463
2464         return ipflags;
2465 }
2466
2467 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2468                                                  TALLOC_CTX *mem_ctx)
2469 {
2470         struct ipalloc_state *ipalloc_state =
2471                 talloc_zero(mem_ctx, struct ipalloc_state);
2472         if (ipalloc_state == NULL) {
2473                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2474                 return NULL;
2475         }
2476
2477         ipalloc_state->num = ctdb->num_nodes;
2478         ipalloc_state->known_public_ips =
2479                 talloc_zero_array(ipalloc_state,
2480                                   struct ctdb_public_ip_list_old *,
2481                                   ipalloc_state->num);
2482         if (ipalloc_state->known_public_ips == NULL) {
2483                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2484                 talloc_free(ipalloc_state);
2485                 return NULL;
2486         }
2487         ipalloc_state->available_public_ips =
2488                 talloc_zero_array(ipalloc_state,
2489                                   struct ctdb_public_ip_list_old *,
2490                                   ipalloc_state->num);
2491         if (ipalloc_state->available_public_ips == NULL) {
2492                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2493                 talloc_free(ipalloc_state);
2494                 return NULL;
2495         }
2496
2497         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2498                 ipalloc_state->algorithm = IPALLOC_LCP2;
2499         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2500                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2501         } else {
2502                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2503         }
2504
2505         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2506
2507         return ipalloc_state;
2508 }
2509
2510 struct iprealloc_callback_data {
2511         bool *retry_nodes;
2512         int retry_count;
2513         client_async_callback fail_callback;
2514         void *fail_callback_data;
2515         struct ctdb_node_map_old *nodemap;
2516 };
2517
2518 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2519                                         int32_t res, TDB_DATA outdata,
2520                                         void *callback)
2521 {
2522         int numnodes;
2523         struct iprealloc_callback_data *cd =
2524                 (struct iprealloc_callback_data *)callback;
2525
2526         numnodes = talloc_array_length(cd->retry_nodes);
2527         if (pnn > numnodes) {
2528                 DEBUG(DEBUG_ERR,
2529                       ("ipreallocated failure from node %d, "
2530                        "but only %d nodes in nodemap\n",
2531                        pnn, numnodes));
2532                 return;
2533         }
2534
2535         /* Can't run the "ipreallocated" event on a INACTIVE node */
2536         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2537                 DEBUG(DEBUG_WARNING,
2538                       ("ipreallocated failed on inactive node %d, ignoring\n",
2539                        pnn));
2540                 return;
2541         }
2542
2543         switch (res) {
2544         case -ETIME:
2545                 /* If the control timed out then that's a real error,
2546                  * so call the real fail callback
2547                  */
2548                 if (cd->fail_callback) {
2549                         cd->fail_callback(ctdb, pnn, res, outdata,
2550                                           cd->fail_callback_data);
2551                 } else {
2552                         DEBUG(DEBUG_WARNING,
2553                               ("iprealloc timed out but no callback registered\n"));
2554                 }
2555                 break;
2556         default:
2557                 /* If not a timeout then either the ipreallocated
2558                  * eventscript (or some setup) failed.  This might
2559                  * have failed because the IPREALLOCATED control isn't
2560                  * implemented - right now there is no way of knowing
2561                  * because the error codes are all folded down to -1.
2562                  * Consider retrying using EVENTSCRIPT control...
2563                  */
2564                 DEBUG(DEBUG_WARNING,
2565                       ("ipreallocated failure from node %d, flagging retry\n",
2566                        pnn));
2567                 cd->retry_nodes[pnn] = true;
2568                 cd->retry_count++;
2569         }
2570 }
2571
2572 struct takeover_callback_data {
2573         bool *node_failed;
2574         client_async_callback fail_callback;
2575         void *fail_callback_data;
2576         struct ctdb_node_map_old *nodemap;
2577 };
2578
2579 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2580                                        uint32_t node_pnn, int32_t res,
2581                                        TDB_DATA outdata, void *callback_data)
2582 {
2583         struct takeover_callback_data *cd =
2584                 talloc_get_type_abort(callback_data,
2585                                       struct takeover_callback_data);
2586         int i;
2587
2588         for (i = 0; i < cd->nodemap->num; i++) {
2589                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2590                         break;
2591                 }
2592         }
2593
2594         if (i == cd->nodemap->num) {
2595                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2596                 return;
2597         }
2598
2599         if (!cd->node_failed[i]) {
2600                 cd->node_failed[i] = true;
2601                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2602                                   cd->fail_callback_data);
2603         }
2604 }
2605
2606 /*
2607   make any IP alias changes for public addresses that are necessary 
2608  */
2609 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2610                       uint32_t *force_rebalance_nodes,
2611                       client_async_callback fail_callback, void *callback_data)
2612 {
2613         int i, j, ret;
2614         struct ctdb_public_ip ip;
2615         uint32_t *nodes;
2616         struct public_ip_list *all_ips, *tmp_ip;
2617         TDB_DATA data;
2618         struct timeval timeout;
2619         struct client_async_data *async_data;
2620         struct ctdb_client_control_state *state;
2621         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2622         struct ctdb_ipflags *ipflags;
2623         struct ipalloc_state *ipalloc_state;
2624         struct takeover_callback_data *takeover_data;
2625         struct iprealloc_callback_data iprealloc_data;
2626         bool *retry_data;
2627         bool can_host_ips;
2628
2629         /*
2630          * ip failover is completely disabled, just send out the 
2631          * ipreallocated event.
2632          */
2633         if (ctdb->tunable.disable_ip_failover != 0) {
2634                 goto ipreallocated;
2635         }
2636
2637         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2638         if (ipalloc_state == NULL) {
2639                 talloc_free(tmp_ctx);
2640                 return -1;
2641         }
2642
2643         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2644         if (ipflags == NULL) {
2645                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2646                 talloc_free(tmp_ctx);
2647                 return -1;
2648         }
2649
2650         /* Fetch known/available public IPs from each active node */
2651         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2652         if (ret != 0) {
2653                 talloc_free(tmp_ctx);
2654                 return -1;
2655         }
2656
2657         /* Short-circuit IP allocation if no node has available IPs */
2658         can_host_ips = false;
2659         for (i=0; i < ipalloc_state->num; i++) {
2660                 if (ipalloc_state->available_public_ips[i] != NULL) {
2661                         can_host_ips = true;
2662                 }
2663         }
2664         if (!can_host_ips) {
2665                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2666                 return 0;
2667         }
2668
2669         /* since nodes only know about those public addresses that
2670            can be served by that particular node, no single node has
2671            a full list of all public addresses that exist in the cluster.
2672            Walk over all node structures and create a merged list of
2673            all public addresses that exist in the cluster.
2674
2675            keep the tree of ips around as ctdb->ip_tree
2676         */
2677         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2678
2679         /* Do the IP reassignment calculations */
2680         ctdb_takeover_run_core(ipalloc_state, ipflags,
2681                                all_ips, force_rebalance_nodes);
2682
2683         /* Now tell all nodes to release any public IPs should not
2684          * host.  This will be a NOOP on nodes that don't currently
2685          * hold the given IP.
2686          */
2687         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2688         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2689
2690         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2691                                                        bool, nodemap->num);
2692         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2693         takeover_data->fail_callback = fail_callback;
2694         takeover_data->fail_callback_data = callback_data;
2695         takeover_data->nodemap = nodemap;
2696
2697         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2698         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2699
2700         async_data->fail_callback = takeover_run_fail_callback;
2701         async_data->callback_data = takeover_data;
2702
2703         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2704
2705         /* Send a RELEASE_IP to all nodes that should not be hosting
2706          * each IP.  For each IP, all but one of these will be
2707          * redundant.  However, the redundant ones are used to tell
2708          * nodes which node should be hosting the IP so that commands
2709          * like "ctdb ip" can display a particular nodes idea of who
2710          * is hosting what. */
2711         for (i=0;i<nodemap->num;i++) {
2712                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2713                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2714                         continue;
2715                 }
2716
2717                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2718                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2719                                 /* This node should be serving this
2720                                    vnn so don't tell it to release the ip
2721                                 */
2722                                 continue;
2723                         }
2724                         ip.pnn  = tmp_ip->pnn;
2725                         ip.addr = tmp_ip->addr;
2726
2727                         timeout = TAKEOVER_TIMEOUT();
2728                         data.dsize = sizeof(ip);
2729                         data.dptr  = (uint8_t *)&ip;
2730                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2731                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2732                                                   data, async_data,
2733                                                   &timeout, NULL);
2734                         if (state == NULL) {
2735                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2736                                 talloc_free(tmp_ctx);
2737                                 return -1;
2738                         }
2739
2740                         ctdb_client_async_add(async_data, state);
2741                 }
2742         }
2743         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2744                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2745                 talloc_free(tmp_ctx);
2746                 return -1;
2747         }
2748         talloc_free(async_data);
2749
2750
2751         /* For each IP, send a TAKOVER_IP to the node that should be
2752          * hosting it.  Many of these will often be redundant (since
2753          * the allocation won't have changed) but they can be useful
2754          * to recover from inconsistencies. */
2755         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2756         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2757
2758         async_data->fail_callback = fail_callback;
2759         async_data->callback_data = callback_data;
2760
2761         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2762                 if (tmp_ip->pnn == -1) {
2763                         /* this IP won't be taken over */
2764                         continue;
2765                 }
2766
2767                 ip.pnn  = tmp_ip->pnn;
2768                 ip.addr = tmp_ip->addr;
2769
2770                 timeout = TAKEOVER_TIMEOUT();
2771                 data.dsize = sizeof(ip);
2772                 data.dptr  = (uint8_t *)&ip;
2773                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2774                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2775                                           data, async_data, &timeout, NULL);
2776                 if (state == NULL) {
2777                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2778                         talloc_free(tmp_ctx);
2779                         return -1;
2780                 }
2781
2782                 ctdb_client_async_add(async_data, state);
2783         }
2784         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2785                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2786                 talloc_free(tmp_ctx);
2787                 return -1;
2788         }
2789
2790 ipreallocated:
2791         /*
2792          * Tell all nodes to run eventscripts to process the
2793          * "ipreallocated" event.  This can do a lot of things,
2794          * including restarting services to reconfigure them if public
2795          * IPs have moved.  Once upon a time this event only used to
2796          * update natgw.
2797          */
2798         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2799         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2800         iprealloc_data.retry_nodes = retry_data;
2801         iprealloc_data.retry_count = 0;
2802         iprealloc_data.fail_callback = fail_callback;
2803         iprealloc_data.fail_callback_data = callback_data;
2804         iprealloc_data.nodemap = nodemap;
2805
2806         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2807         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2808                                         nodes, 0, TAKEOVER_TIMEOUT(),
2809                                         false, tdb_null,
2810                                         NULL, iprealloc_fail_callback,
2811                                         &iprealloc_data);
2812         if (ret != 0) {
2813                 /* If the control failed then we should retry to any
2814                  * nodes flagged by iprealloc_fail_callback using the
2815                  * EVENTSCRIPT control.  This is a best-effort at
2816                  * backward compatiblity when running a mixed cluster
2817                  * where some nodes have not yet been upgraded to
2818                  * support the IPREALLOCATED control.
2819                  */
2820                 DEBUG(DEBUG_WARNING,
2821                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2822
2823                 nodes = talloc_array(tmp_ctx, uint32_t,
2824                                      iprealloc_data.retry_count);
2825                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2826
2827                 j = 0;
2828                 for (i=0; i<nodemap->num; i++) {
2829                         if (iprealloc_data.retry_nodes[i]) {
2830                                 nodes[j] = i;
2831                                 j++;
2832                         }
2833                 }
2834
2835                 data.dptr  = discard_const("ipreallocated");
2836                 data.dsize = strlen((char *)data.dptr) + 1; 
2837                 ret = ctdb_client_async_control(ctdb,
2838                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2839                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2840                                                 false, data,
2841                                                 NULL, fail_callback,
2842                                                 callback_data);
2843                 if (ret != 0) {
2844                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2845                 }
2846         }
2847
2848         talloc_free(tmp_ctx);
2849         return ret;
2850 }
2851
2852
2853 /*
2854   destroy a ctdb_client_ip structure
2855  */
2856 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2857 {
2858         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2859                 ctdb_addr_to_str(&ip->addr),
2860                 ntohs(ip->addr.ip.sin_port),
2861                 ip->client_id));
2862
2863         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2864         return 0;
2865 }
2866
2867 /*
2868   called by a client to inform us of a TCP connection that it is managing
2869   that should tickled with an ACK when IP takeover is done
2870  */
2871 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2872                                 TDB_DATA indata)
2873 {
2874         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2875         struct ctdb_connection *tcp_sock = NULL;
2876         struct ctdb_tcp_list *tcp;
2877         struct ctdb_connection t;
2878         int ret;
2879         TDB_DATA data;
2880         struct ctdb_client_ip *ip;
2881         struct ctdb_vnn *vnn;
2882         ctdb_sock_addr addr;
2883
2884         /* If we don't have public IPs, tickles are useless */
2885         if (ctdb->vnn == NULL) {
2886                 return 0;
2887         }
2888
2889         tcp_sock = (struct ctdb_connection *)indata.dptr;
2890
2891         addr = tcp_sock->src;
2892         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2893         addr = tcp_sock->dst;
2894         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2895
2896         ZERO_STRUCT(addr);
2897         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2898         vnn = find_public_ip_vnn(ctdb, &addr);
2899         if (vnn == NULL) {
2900                 switch (addr.sa.sa_family) {
2901                 case AF_INET:
2902                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2903                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2904                                         ctdb_addr_to_str(&addr)));
2905                         }
2906                         break;
2907                 case AF_INET6:
2908                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2909                                 ctdb_addr_to_str(&addr)));
2910                         break;
2911                 default:
2912                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2913                 }
2914
2915                 return 0;
2916         }
2917
2918         if (vnn->pnn != ctdb->pnn) {
2919                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2920                         ctdb_addr_to_str(&addr),
2921                         client_id, client->pid));
2922                 /* failing this call will tell smbd to die */
2923                 return -1;
2924         }
2925
2926         ip = talloc(client, struct ctdb_client_ip);
2927         CTDB_NO_MEMORY(ctdb, ip);
2928
2929         ip->ctdb      = ctdb;
2930         ip->addr      = addr;
2931         ip->client_id = client_id;
2932         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2933         DLIST_ADD(ctdb->client_ip_list, ip);
2934
2935         tcp = talloc(client, struct ctdb_tcp_list);
2936         CTDB_NO_MEMORY(ctdb, tcp);
2937
2938         tcp->connection.src = tcp_sock->src;
2939         tcp->connection.dst = tcp_sock->dst;
2940
2941         DLIST_ADD(client->tcp_list, tcp);
2942
2943         t.src = tcp_sock->src;
2944         t.dst = tcp_sock->dst;
2945
2946         data.dptr = (uint8_t *)&t;
2947         data.dsize = sizeof(t);
2948
2949         switch (addr.sa.sa_family) {
2950         case AF_INET:
2951                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2952                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2953                         ctdb_addr_to_str(&tcp_sock->src),
2954                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2955                 break;
2956         case AF_INET6:
2957                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2958                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2959                         ctdb_addr_to_str(&tcp_sock->src),
2960                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2961                 break;
2962         default:
2963                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2964         }
2965
2966
2967         /* tell all nodes about this tcp connection */
2968         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2969                                        CTDB_CONTROL_TCP_ADD,
2970                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2971         if (ret != 0) {
2972                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2973                 return -1;
2974         }
2975
2976         return 0;
2977 }
2978
2979 /*
2980   find a tcp address on a list
2981  */
2982 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2983                                            struct ctdb_connection *tcp)
2984 {
2985         int i;
2986
2987         if (array == NULL) {
2988                 return NULL;
2989         }
2990
2991         for (i=0;i<array->num;i++) {
2992                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2993                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2994                         return &array->connections[i];
2995                 }
2996         }
2997         return NULL;
2998 }
2999
3000
3001
3002 /*
3003   called by a daemon to inform us of a TCP connection that one of its
3004   clients managing that should tickled with an ACK when IP takeover is
3005   done
3006  */
3007 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3008 {
3009         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
3010         struct ctdb_tcp_array *tcparray;
3011         struct ctdb_connection tcp;
3012         struct ctdb_vnn *vnn;
3013
3014         /* If we don't have public IPs, tickles are useless */
3015         if (ctdb->vnn == NULL) {
3016                 return 0;
3017         }
3018
3019         vnn = find_public_ip_vnn(ctdb, &p->dst);
3020         if (vnn == NULL) {
3021                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3022                         ctdb_addr_to_str(&p->dst)));
3023
3024                 return -1;
3025         }
3026
3027
3028         tcparray = vnn->tcp_array;
3029
3030         /* If this is the first tickle */
3031         if (tcparray == NULL) {
3032                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3033                 CTDB_NO_MEMORY(ctdb, tcparray);
3034                 vnn->tcp_array = tcparray;
3035
3036                 tcparray->num = 0;
3037                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3038                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3039
3040                 tcparray->connections[tcparray->num].src = p->src;
3041                 tcparray->connections[tcparray->num].dst = p->dst;
3042                 tcparray->num++;
3043
3044                 if (tcp_update_needed) {
3045                         vnn->tcp_update_needed = true;
3046                 }
3047                 return 0;
3048         }
3049
3050
3051         /* Do we already have this tickle ?*/
3052         tcp.src = p->src;
3053         tcp.dst = p->dst;
3054         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3055                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3056                         ctdb_addr_to_str(&tcp.dst),
3057                         ntohs(tcp.dst.ip.sin_port),
3058                         vnn->pnn));
3059                 return 0;
3060         }
3061
3062         /* A new tickle, we must add it to the array */
3063         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3064                                         struct ctdb_connection,
3065                                         tcparray->num+1);
3066         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3067
3068         tcparray->connections[tcparray->num].src = p->src;
3069         tcparray->connections[tcparray->num].dst = p->dst;
3070         tcparray->num++;
3071
3072         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3073                 ctdb_addr_to_str(&tcp.dst),
3074                 ntohs(tcp.dst.ip.sin_port),
3075                 vnn->pnn));
3076
3077         if (tcp_update_needed) {
3078                 vnn->tcp_update_needed = true;
3079         }
3080
3081         return 0;
3082 }
3083
3084
3085 /*
3086   called by a daemon to inform us of a TCP connection that one of its
3087   clients managing that should tickled with an ACK when IP takeover is
3088   done
3089  */
3090 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3091 {
3092         struct ctdb_connection *tcpp;
3093         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3094
3095         if (vnn == NULL) {
3096                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3097                         ctdb_addr_to_str(&conn->dst)));
3098                 return;
3099         }
3100
3101         /* if the array is empty we cant remove it
3102            and we don't need to do anything
3103          */
3104         if (vnn->tcp_array == NULL) {
3105                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3106                         ctdb_addr_to_str(&conn->dst),
3107                         ntohs(conn->dst.ip.sin_port)));
3108                 return;
3109         }
3110
3111
3112         /* See if we know this connection
3113            if we don't know this connection  then we dont need to do anything
3114          */
3115         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3116         if (tcpp == NULL) {
3117                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3118                         ctdb_addr_to_str(&conn->dst),
3119                         ntohs(conn->dst.ip.sin_port)));
3120                 return;
3121         }
3122
3123
3124         /* We need to remove this entry from the array.
3125            Instead of allocating a new array and copying data to it
3126            we cheat and just copy the last entry in the existing array
3127            to the entry that is to be removed and just shring the 
3128            ->num field
3129          */
3130         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3131         vnn->tcp_array->num--;
3132
3133         /* If we deleted the last entry we also need to remove the entire array
3134          */
3135         if (vnn->tcp_array->num == 0) {
3136                 talloc_free(vnn->tcp_array);
3137                 vnn->tcp_array = NULL;
3138         }               
3139
3140         vnn->tcp_update_needed = true;
3141
3142         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3143                 ctdb_addr_to_str(&conn->src),
3144                 ntohs(conn->src.ip.sin_port)));
3145 }
3146
3147
3148 /*
3149   called by a daemon to inform us of a TCP connection that one of its
3150   clients used are no longer needed in the tickle database
3151  */
3152 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3153 {
3154         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3155
3156         /* If we don't have public IPs, tickles are useless */
3157         if (ctdb->vnn == NULL) {
3158                 return 0;
3159         }
3160
3161         ctdb_remove_connection(ctdb, conn);
3162
3163         return 0;
3164 }
3165
3166
3167 /*
3168   Called when another daemon starts - causes all tickles for all
3169   public addresses we are serving to be sent to the new node on the
3170   next check.  This actually causes the next scheduled call to
3171   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3172   doesn't require careful error handling.
3173  */
3174 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3175 {
3176         struct ctdb_vnn *vnn;
3177
3178         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3179                            (unsigned long) pnn));
3180
3181         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3182                 vnn->tcp_update_needed = true;
3183         }
3184
3185         return 0;
3186 }
3187
3188
3189 /*
3190   called when a client structure goes away - hook to remove
3191   elements from the tcp_list in all daemons
3192  */
3193 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3194 {
3195         while (client->tcp_list) {
3196                 struct ctdb_tcp_list *tcp = client->tcp_list;
3197                 DLIST_REMOVE(client->tcp_list, tcp);
3198                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3199         }
3200 }
3201
3202
3203 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3204 {
3205         struct ctdb_vnn *vnn;
3206         int count = 0;
3207
3208         if (ctdb->tunable.disable_ip_failover == 1) {
3209                 return;
3210         }
3211
3212         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3213                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3214                         ctdb_vnn_unassign_iface(ctdb, vnn);
3215                         continue;
3216                 }
3217                 if (!vnn->iface) {
3218                         continue;
3219                 }
3220
3221                 /* Don't allow multiple releases at once.  Some code,
3222                  * particularly ctdb_tickle_sentenced_connections() is
3223                  * not re-entrant */
3224                 if (vnn->update_in_flight) {
3225                         DEBUG(DEBUG_WARNING,
3226                               (__location__
3227                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3228                                     ctdb_addr_to_str(&vnn->public_address),
3229                                     vnn->public_netmask_bits,
3230                                     ctdb_vnn_iface_string(vnn)));
3231                         continue;
3232                 }
3233                 vnn->update_in_flight = true;
3234
3235                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3236                                     ctdb_addr_to_str(&vnn->public_address),
3237                                     vnn->public_netmask_bits,
3238                                     ctdb_vnn_iface_string(vnn)));
3239
3240                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3241                                   ctdb_vnn_iface_string(vnn),
3242                                   ctdb_addr_to_str(&vnn->public_address),
3243                                   vnn->public_netmask_bits);
3244                 release_kill_clients(ctdb, &vnn->public_address);
3245                 ctdb_vnn_unassign_iface(ctdb, vnn);
3246                 vnn->update_in_flight = false;
3247                 count++;
3248         }
3249
3250         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3251 }
3252
3253
3254 /*
3255   get list of public IPs
3256  */
3257 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3258                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3259 {
3260         int i, num, len;
3261         struct ctdb_public_ip_list_old *ips;
3262         struct ctdb_vnn *vnn;
3263         bool only_available = false;
3264
3265         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3266                 only_available = true;
3267         }
3268
3269         /* count how many public ip structures we have */
3270         num = 0;
3271         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3272                 num++;
3273         }
3274
3275         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3276                 num*sizeof(struct ctdb_public_ip);
3277         ips = talloc_zero_size(outdata, len);
3278         CTDB_NO_MEMORY(ctdb, ips);
3279
3280         i = 0;
3281         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3282                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3283                         continue;
3284                 }
3285                 ips->ips[i].pnn  = vnn->pnn;
3286                 ips->ips[i].addr = vnn->public_address;
3287                 i++;
3288         }
3289         ips->num = i;
3290         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3291                 i*sizeof(struct ctdb_public_ip);
3292
3293         outdata->dsize = len;
3294         outdata->dptr  = (uint8_t *)ips;
3295
3296         return 0;
3297 }
3298
3299
3300 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3301                                         struct ctdb_req_control_old *c,
3302                                         TDB_DATA indata,
3303                                         TDB_DATA *outdata)
3304 {
3305         int i, num, len;
3306         ctdb_sock_addr *addr;
3307         struct ctdb_public_ip_info_old *info;
3308         struct ctdb_vnn *vnn;
3309
3310         addr = (ctdb_sock_addr *)indata.dptr;
3311
3312         vnn = find_public_ip_vnn(ctdb, addr);
3313         if (vnn == NULL) {
3314                 /* if it is not a public ip   it could be our 'single ip' */
3315                 if (ctdb->single_ip_vnn) {
3316                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3317                                 vnn = ctdb->single_ip_vnn;
3318                         }
3319                 }
3320         }
3321         if (vnn == NULL) {
3322                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3323                                  "'%s'not a public address\n",
3324                                  ctdb_addr_to_str(addr)));
3325                 return -1;
3326         }
3327
3328         /* count how many public ip structures we have */
3329         num = 0;
3330         for (;vnn->ifaces[num];) {
3331                 num++;
3332         }
3333
3334         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3335                 num*sizeof(struct ctdb_iface);
3336         info = talloc_zero_size(outdata, len);
3337         CTDB_NO_MEMORY(ctdb, info);
3338
3339         info->ip.addr = vnn->public_address;
3340         info->ip.pnn = vnn->pnn;
3341         info->active_idx = 0xFFFFFFFF;
3342
3343         for (i=0; vnn->ifaces[i]; i++) {
3344                 struct ctdb_interface *cur;
3345
3346                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3347                 if (cur == NULL) {
3348                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3349                                            vnn->ifaces[i]));
3350                         return -1;
3351                 }
3352                 if (vnn->iface == cur) {
3353                         info->active_idx = i;
3354                 }
3355                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3356                 info->ifaces[i].link_state = cur->link_up;
3357                 info->ifaces[i].references = cur->references;
3358         }
3359         info->num = i;
3360         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3361                 i*sizeof(struct ctdb_iface);
3362
3363         outdata->dsize = len;
3364         outdata->dptr  = (uint8_t *)info;
3365
3366         return 0;
3367 }
3368
3369 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3370                                 struct ctdb_req_control_old *c,
3371                                 TDB_DATA *outdata)
3372 {
3373         int i, num, len;
3374         struct ctdb_iface_list_old *ifaces;
3375         struct ctdb_interface *cur;
3376
3377         /* count how many public ip structures we have */
3378         num = 0;
3379         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3380                 num++;
3381         }
3382
3383         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3384                 num*sizeof(struct ctdb_iface);
3385         ifaces = talloc_zero_size(outdata, len);
3386         CTDB_NO_MEMORY(ctdb, ifaces);
3387
3388         i = 0;
3389         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3390                 strcpy(ifaces->ifaces[i].name, cur->name);
3391                 ifaces->ifaces[i].link_state = cur->link_up;
3392                 ifaces->ifaces[i].references = cur->references;
3393                 i++;
3394         }
3395         ifaces->num = i;
3396         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3397                 i*sizeof(struct ctdb_iface);
3398
3399         outdata->dsize = len;
3400         outdata->dptr  = (uint8_t *)ifaces;
3401
3402         return 0;
3403 }
3404
3405 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3406                                     struct ctdb_req_control_old *c,
3407                                     TDB_DATA indata)
3408 {
3409         struct ctdb_iface *info;
3410         struct ctdb_interface *iface;
3411         bool link_up = false;
3412
3413         info = (struct ctdb_iface *)indata.dptr;
3414
3415         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3416                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3417                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3418                                   len, len, info->name));
3419                 return -1;
3420         }
3421
3422         switch (info->link_state) {
3423         case 0:
3424                 link_up = false;
3425                 break;
3426         case 1:
3427                 link_up = true;
3428                 break;
3429         default:
3430                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3431                                   (unsigned int)info->link_state));
3432                 return -1;
3433         }
3434
3435         if (info->references != 0) {
3436                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3437                                   (unsigned int)info->references));
3438                 return -1;
3439         }
3440
3441         iface = ctdb_find_iface(ctdb, info->name);
3442         if (iface == NULL) {
3443                 return -1;
3444         }
3445
3446         if (link_up == iface->link_up) {
3447                 return 0;
3448         }
3449
3450         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3451               ("iface[%s] has changed it's link status %s => %s\n",
3452                iface->name,
3453                iface->link_up?"up":"down",
3454                link_up?"up":"down"));
3455
3456         iface->link_up = link_up;
3457         return 0;
3458 }
3459
3460
3461 /* 
3462    structure containing the listening socket and the list of tcp connections
3463    that the ctdb daemon is to kill
3464 */
3465 struct ctdb_kill_tcp {
3466         struct ctdb_vnn *vnn;
3467         struct ctdb_context *ctdb;
3468         int capture_fd;
3469         struct tevent_fd *fde;
3470         trbt_tree_t *connections;
3471         void *private_data;
3472 };
3473
3474 /*
3475   a tcp connection that is to be killed
3476  */
3477 struct ctdb_killtcp_con {
3478         ctdb_sock_addr src_addr;
3479         ctdb_sock_addr dst_addr;
3480         int count;
3481         struct ctdb_kill_tcp *killtcp;
3482 };
3483
3484 /* this function is used to create a key to represent this socketpair
3485    in the killtcp tree.
3486    this key is used to insert and lookup matching socketpairs that are
3487    to be tickled and RST
3488 */
3489 #define KILLTCP_KEYLEN  10
3490 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3491 {
3492         static uint32_t key[KILLTCP_KEYLEN];
3493
3494         bzero(key, sizeof(key));
3495
3496         if (src->sa.sa_family != dst->sa.sa_family) {
3497                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3498                 return key;
3499         }
3500         
3501         switch (src->sa.sa_family) {
3502         case AF_INET:
3503                 key[0]  = dst->ip.sin_addr.s_addr;
3504                 key[1]  = src->ip.sin_addr.s_addr;
3505                 key[2]  = dst->ip.sin_port;
3506                 key[3]  = src->ip.sin_port;
3507                 break;
3508         case AF_INET6: {
3509                 uint32_t *dst6_addr32 =
3510                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3511                 uint32_t *src6_addr32 =
3512                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3513                 key[0]  = dst6_addr32[3];
3514                 key[1]  = src6_addr32[3];
3515                 key[2]  = dst6_addr32[2];
3516                 key[3]  = src6_addr32[2];
3517                 key[4]  = dst6_addr32[1];
3518                 key[5]  = src6_addr32[1];
3519                 key[6]  = dst6_addr32[0];
3520                 key[7]  = src6_addr32[0];
3521                 key[8]  = dst->ip6.sin6_port;
3522                 key[9]  = src->ip6.sin6_port;
3523                 break;
3524         }
3525         default:
3526                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3527                 return key;
3528         }
3529
3530         return key;
3531 }
3532
3533 /*
3534   called when we get a read event on the raw socket
3535  */
3536 static void capture_tcp_handler(struct tevent_context *ev,
3537                                 struct tevent_fd *fde,
3538                                 uint16_t flags, void *private_data)
3539 {
3540         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3541         struct ctdb_killtcp_con *con;
3542         ctdb_sock_addr src, dst;
3543         uint32_t ack_seq, seq;
3544
3545         if (!(flags & TEVENT_FD_READ)) {
3546                 return;
3547         }
3548
3549         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3550                                 killtcp->private_data,
3551                                 &src, &dst,
3552                                 &ack_seq, &seq) != 0) {
3553                 /* probably a non-tcp ACK packet */
3554                 return;
3555         }
3556
3557         /* check if we have this guy in our list of connections
3558            to kill
3559         */
3560         con = trbt_lookuparray32(killtcp->connections, 
3561                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3562         if (con == NULL) {
3563                 /* no this was some other packet we can just ignore */
3564                 return;
3565         }
3566
3567         /* This one has been tickled !
3568            now reset him and remove him from the list.
3569          */
3570         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3571                 ntohs(con->dst_addr.ip.sin_port),
3572                 ctdb_addr_to_str(&con->src_addr),
3573                 ntohs(con->src_addr.ip.sin_port)));
3574
3575         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3576         talloc_free(con);
3577 }
3578
3579
3580 /* when traversing the list of all tcp connections to send tickle acks to
3581    (so that we can capture the ack coming back and kill the connection
3582     by a RST)
3583    this callback is called for each connection we are currently trying to kill
3584 */
3585 static int tickle_connection_traverse(void *param, void *data)
3586 {
3587         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3588
3589         /* have tried too many times, just give up */
3590         if (con->count >= 5) {
3591                 /* can't delete in traverse: reparent to delete_cons */
3592                 talloc_steal(param, con);
3593                 return 0;
3594         }
3595
3596         /* othervise, try tickling it again */
3597         con->count++;
3598         ctdb_sys_send_tcp(
3599                 (ctdb_sock_addr *)&con->dst_addr,
3600                 (ctdb_sock_addr *)&con->src_addr,
3601                 0, 0, 0);
3602         return 0;
3603 }
3604
3605
3606 /* 
3607    called every second until all sentenced connections have been reset
3608  */
3609 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3610                                               struct tevent_timer *te,
3611                                               struct timeval t, void *private_data)
3612 {
3613         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3614         void *delete_cons = talloc_new(NULL);
3615
3616         /* loop over all connections sending tickle ACKs */
3617         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3618
3619         /* now we've finished traverse, it's safe to do deletion. */
3620         talloc_free(delete_cons);
3621
3622         /* If there are no more connections to kill we can remove the
3623            entire killtcp structure
3624          */
3625         if ( (killtcp->connections == NULL) || 
3626              (killtcp->connections->root == NULL) ) {
3627                 talloc_free(killtcp);
3628                 return;
3629         }
3630
3631         /* try tickling them again in a seconds time
3632          */
3633         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3634                          timeval_current_ofs(1, 0),
3635                          ctdb_tickle_sentenced_connections, killtcp);
3636 }
3637
3638 /*
3639   destroy the killtcp structure
3640  */
3641 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3642 {
3643         struct ctdb_vnn *tmpvnn;
3644
3645         /* verify that this vnn is still active */
3646         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3647                 if (tmpvnn == killtcp->vnn) {
3648                         break;
3649                 }
3650         }
3651
3652         if (tmpvnn == NULL) {
3653                 return 0;
3654         }
3655
3656         if (killtcp->vnn->killtcp != killtcp) {
3657                 return 0;
3658         }
3659
3660         killtcp->vnn->killtcp = NULL;
3661
3662         return 0;
3663 }
3664
3665
/* Insert callback for the killtcp tree: the new connection structure
   unconditionally replaces whatever was stored before.

   An old structure is deliberately not freed here, that one is
   talloc_stolen by the same node in the tree anyway and will be
   deleted when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3677
3678 /*
3679   add a tcp socket to the list of connections we want to RST
3680  */
3681 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3682                                        ctdb_sock_addr *s,
3683                                        ctdb_sock_addr *d)
3684 {
3685         ctdb_sock_addr src, dst;
3686         struct ctdb_kill_tcp *killtcp;
3687         struct ctdb_killtcp_con *con;
3688         struct ctdb_vnn *vnn;
3689
3690         ctdb_canonicalize_ip(s, &src);
3691         ctdb_canonicalize_ip(d, &dst);
3692
3693         vnn = find_public_ip_vnn(ctdb, &dst);
3694         if (vnn == NULL) {
3695                 vnn = find_public_ip_vnn(ctdb, &src);
3696         }
3697         if (vnn == NULL) {
3698                 /* if it is not a public ip   it could be our 'single ip' */
3699                 if (ctdb->single_ip_vnn) {
3700                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3701                                 vnn = ctdb->single_ip_vnn;
3702                         }
3703                 }
3704         }
3705         if (vnn == NULL) {
3706                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3707                 return -1;
3708         }
3709
3710         killtcp = vnn->killtcp;
3711         
3712         /* If this is the first connection to kill we must allocate
3713            a new structure
3714          */
3715         if (killtcp == NULL) {
3716                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3717                 CTDB_NO_MEMORY(ctdb, killtcp);
3718
3719                 killtcp->vnn         = vnn;
3720                 killtcp->ctdb        = ctdb;
3721                 killtcp->capture_fd  = -1;
3722                 killtcp->connections = trbt_create(killtcp, 0);
3723
3724                 vnn->killtcp         = killtcp;
3725                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3726         }
3727
3728
3729
3730         /* create a structure that describes this connection we want to
3731            RST and store it in killtcp->connections
3732         */
3733         con = talloc(killtcp, struct ctdb_killtcp_con);
3734         CTDB_NO_MEMORY(ctdb, con);
3735         con->src_addr = src;
3736         con->dst_addr = dst;
3737         con->count    = 0;
3738         con->killtcp  = killtcp;
3739
3740
3741         trbt_insertarray32_callback(killtcp->connections,
3742                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3743                         add_killtcp_callback, con);
3744
3745         /* 
3746            If we don't have a socket to listen on yet we must create it
3747          */
3748         if (killtcp->capture_fd == -1) {
3749                 const char *iface = ctdb_vnn_iface_string(vnn);
3750                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3751                 if (killtcp->capture_fd == -1) {
3752                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3753                                           "socket on iface '%s' for killtcp (%s)\n",
3754                                           iface, strerror(errno)));
3755                         goto failed;
3756                 }
3757         }
3758
3759
3760         if (killtcp->fde == NULL) {
3761                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3762                                              killtcp->capture_fd,
3763                                              TEVENT_FD_READ,
3764                                              capture_tcp_handler, killtcp);
3765                 tevent_fd_set_auto_close(killtcp->fde);
3766
3767                 /* We also need to set up some events to tickle all these connections
3768                    until they are all reset
3769                 */
3770                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3771                                  ctdb_tickle_sentenced_connections, killtcp);
3772         }
3773
3774         /* tickle him once now */
3775         ctdb_sys_send_tcp(
3776                 &con->dst_addr,
3777                 &con->src_addr,
3778                 0, 0, 0);
3779
3780         return 0;
3781
3782 failed:
3783         talloc_free(vnn->killtcp);
3784         vnn->killtcp = NULL;
3785         return -1;
3786 }
3787
3788 /*
3789   kill a TCP connection.
3790  */
3791 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3792 {
3793         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3794
3795         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3796 }
3797
3798 /*
3799   called by a daemon to inform us of the entire list of TCP tickles for
3800   a particular public address.
3801   this control should only be sent by the node that is currently serving
3802   that public address.
3803  */
3804 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3805 {
3806         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3807         struct ctdb_tcp_array *tcparray;
3808         struct ctdb_vnn *vnn;
3809
3810         /* We must at least have tickles.num or else we cant verify the size
3811            of the received data blob
3812          */
3813         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3814                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3815                 return -1;
3816         }
3817
3818         /* verify that the size of data matches what we expect */
3819         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3820                          + sizeof(struct ctdb_connection) * list->num) {
3821                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3822                 return -1;
3823         }
3824
3825         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3826                            ctdb_addr_to_str(&list->addr)));
3827
3828         vnn = find_public_ip_vnn(ctdb, &list->addr);
3829         if (vnn == NULL) {
3830                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3831                         ctdb_addr_to_str(&list->addr)));
3832
3833                 return 1;
3834         }
3835
3836         /* remove any old ticklelist we might have */
3837         talloc_free(vnn->tcp_array);
3838         vnn->tcp_array = NULL;
3839
3840         tcparray = talloc(vnn, struct ctdb_tcp_array);
3841         CTDB_NO_MEMORY(ctdb, tcparray);
3842
3843         tcparray->num = list->num;
3844
3845         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3846         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3847
3848         memcpy(tcparray->connections, &list->connections[0],
3849                sizeof(struct ctdb_connection)*tcparray->num);
3850
3851         /* We now have a new fresh tickle list array for this vnn */
3852         vnn->tcp_array = tcparray;
3853
3854         return 0;
3855 }
3856
3857 /*
3858   called to return the full list of tickles for the puclic address associated 
3859   with the provided vnn
3860  */
3861 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3862 {
3863         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3864         struct ctdb_tickle_list_old *list;
3865         struct ctdb_tcp_array *tcparray;
3866         int num;
3867         struct ctdb_vnn *vnn;
3868
3869         vnn = find_public_ip_vnn(ctdb, addr);
3870         if (vnn == NULL) {
3871                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3872                         ctdb_addr_to_str(addr)));
3873
3874                 return 1;
3875         }
3876
3877         tcparray = vnn->tcp_array;
3878         if (tcparray) {
3879                 num = tcparray->num;
3880         } else {
3881                 num = 0;
3882         }
3883
3884         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3885                         + sizeof(struct ctdb_connection) * num;
3886
3887         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3888         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3889         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3890
3891         list->addr = *addr;
3892         list->num = num;
3893         if (num) {
3894                 memcpy(&list->connections[0], tcparray->connections,
3895                         sizeof(struct ctdb_connection) * num);
3896         }
3897
3898         return 0;
3899 }
3900
3901
3902 /*
3903   set the list of all tcp tickles for a public address
3904  */
3905 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3906                                             ctdb_sock_addr *addr,
3907                                             struct ctdb_tcp_array *tcparray)
3908 {
3909         int ret, num;
3910         TDB_DATA data;
3911         struct ctdb_tickle_list_old *list;
3912
3913         if (tcparray) {
3914                 num = tcparray->num;
3915         } else {
3916                 num = 0;
3917         }
3918
3919         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3920                         sizeof(struct ctdb_connection) * num;
3921         data.dptr = talloc_size(ctdb, data.dsize);
3922         CTDB_NO_MEMORY(ctdb, data.dptr);
3923
3924         list = (struct ctdb_tickle_list_old *)data.dptr;
3925         list->addr = *addr;
3926         list->num = num;
3927         if (tcparray) {
3928                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3929         }
3930
3931         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3932                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3933                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3934         if (ret != 0) {
3935                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3936                 return -1;
3937         }
3938
3939         talloc_free(data.dptr);
3940
3941         return ret;
3942 }
3943
3944
3945 /*
3946   perform tickle updates if required
3947  */
3948 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3949                                     struct tevent_timer *te,
3950                                     struct timeval t, void *private_data)
3951 {
3952         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3953         int ret;
3954         struct ctdb_vnn *vnn;
3955
3956         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3957                 /* we only send out updates for public addresses that 
3958                    we have taken over
3959                  */
3960                 if (ctdb->pnn != vnn->pnn) {
3961                         continue;
3962                 }
3963                 /* We only send out the updates if we need to */
3964                 if (!vnn->tcp_update_needed) {
3965                         continue;
3966                 }
3967                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3968                                                        &vnn->public_address,
3969                                                        vnn->tcp_array);
3970                 if (ret != 0) {
3971                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3972                                 ctdb_addr_to_str(&vnn->public_address)));
3973                 } else {
3974                         DEBUG(DEBUG_INFO,
3975                               ("Sent tickle update for public address %s\n",
3976                                ctdb_addr_to_str(&vnn->public_address)));
3977                         vnn->tcp_update_needed = false;
3978                 }
3979         }
3980
3981         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3982                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3983                          ctdb_update_tcp_tickles, ctdb);
3984 }
3985
3986 /*
3987   start periodic update of tcp tickles
3988  */
3989 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3990 {
3991         ctdb->tickle_update_context = talloc_new(ctdb);
3992
3993         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3994                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3995                          ctdb_update_tcp_tickles, ctdb);
3996 }
3997
3998
3999
4000
4001 struct control_gratious_arp {
4002         struct ctdb_context *ctdb;
4003         ctdb_sock_addr addr;
4004         const char *iface;
4005         int count;
4006 };
4007
4008 /*
4009   send a control_gratuitous arp
4010  */
4011 static void send_gratious_arp(struct tevent_context *ev,
4012                               struct tevent_timer *te,
4013                               struct timeval t, void *private_data)
4014 {
4015         int ret;
4016         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4017                                                         struct control_gratious_arp);
4018
4019         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4020         if (ret != 0) {
4021                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4022                                  arp->iface, strerror(errno)));
4023         }
4024
4025
4026         arp->count++;
4027         if (arp->count == CTDB_ARP_REPEAT) {
4028                 talloc_free(arp);
4029                 return;
4030         }
4031
4032         tevent_add_timer(arp->ctdb->ev, arp,
4033                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4034                          send_gratious_arp, arp);
4035 }
4036
4037
4038 /*
4039   send a gratious arp 
4040  */
4041 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4042 {
4043         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4044         struct control_gratious_arp *arp;
4045
4046         /* verify the size of indata */
4047         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4048                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4049                                  (unsigned)indata.dsize, 
4050                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4051                 return -1;
4052         }
4053         if (indata.dsize != 
4054                 ( offsetof(struct ctdb_addr_info_old, iface)
4055                 + gratious_arp->len ) ){
4056
4057                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4058                         "but should be %u bytes\n", 
4059                          (unsigned)indata.dsize, 
4060                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4061                 return -1;
4062         }
4063
4064
4065         arp = talloc(ctdb, struct control_gratious_arp);
4066         CTDB_NO_MEMORY(ctdb, arp);
4067
4068         arp->ctdb  = ctdb;
4069         arp->addr   = gratious_arp->addr;
4070         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4071         CTDB_NO_MEMORY(ctdb, arp->iface);
4072         arp->count = 0;
4073
4074         tevent_add_timer(arp->ctdb->ev, arp,
4075                          timeval_zero(), send_gratious_arp, arp);
4076
4077         return 0;
4078 }
4079
4080 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4081 {
4082         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4083         int ret;
4084
4085         /* verify the size of indata */
4086         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4087                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4088                 return -1;
4089         }
4090         if (indata.dsize != 
4091                 ( offsetof(struct ctdb_addr_info_old, iface)
4092                 + pub->len ) ){
4093
4094                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4095                         "but should be %u bytes\n", 
4096                          (unsigned)indata.dsize, 
4097                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4098                 return -1;
4099         }
4100
4101         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4102
4103         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4104
4105         if (ret != 0) {
4106                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4107                 return -1;
4108         }
4109
4110         return 0;
4111 }
4112
/* State handed to delete_ip_callback() */
struct delete_ip_callback_state {
	/* the delete request that delete_ip_callback() replies to */
	struct ctdb_req_control_old *c;
};
4116
4117 /*
4118   called when releaseip event finishes for del_public_address
4119  */
4120 static void delete_ip_callback(struct ctdb_context *ctdb,
4121                                int32_t status, TDB_DATA data,
4122                                const char *errormsg,
4123                                void *private_data)
4124 {
4125         struct delete_ip_callback_state *state =
4126                 talloc_get_type(private_data, struct delete_ip_callback_state);
4127
4128         /* If release failed then fail. */
4129         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4130         talloc_free(private_data);
4131 }
4132
4133 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4134                                         struct ctdb_req_control_old *c,
4135                                         TDB_DATA indata, bool *async_reply)
4136 {
4137         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4138         struct ctdb_vnn *vnn;
4139
4140         /* verify the size of indata */
4141         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4142                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4143                 return -1;
4144         }
4145         if (indata.dsize != 
4146                 ( offsetof(struct ctdb_addr_info_old, iface)
4147                 + pub->len ) ){
4148
4149                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4150                         "but should be %u bytes\n", 
4151                          (unsigned)indata.dsize, 
4152                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4153                 return -1;
4154         }
4155
4156         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4157
4158         /* walk over all public addresses until we find a match */
4159         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4160                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4161                         if (vnn->pnn == ctdb->pnn) {
4162                                 struct delete_ip_callback_state *state;
4163                                 struct ctdb_public_ip *ip;
4164                                 TDB_DATA data;
4165                                 int ret;
4166
4167                                 vnn->delete_pending = true;
4168
4169                                 state = talloc(ctdb,
4170                                                struct delete_ip_callback_state);
4171                                 CTDB_NO_MEMORY(ctdb, state);
4172                                 state->c = c;
4173
4174                                 ip = talloc(state, struct ctdb_public_ip);
4175                                 if (ip == NULL) {
4176                                         DEBUG(DEBUG_ERR,
4177                                               (__location__ " Out of memory\n"));
4178                                         talloc_free(state);
4179                                         return -1;
4180                                 }
4181                                 ip->pnn = -1;
4182                                 ip->addr = pub->addr;
4183
4184                                 data.dsize = sizeof(struct ctdb_public_ip);
4185                                 data.dptr = (unsigned char *)ip;
4186
4187                                 ret = ctdb_daemon_send_control(ctdb,
4188                                                                ctdb_get_pnn(ctdb),
4189                                                                0,
4190                                                                CTDB_CONTROL_RELEASE_IP,
4191                                                                0, 0,
4192                                                                data,
4193                                                                delete_ip_callback,
4194                                                                state);
4195                                 if (ret == -1) {
4196                                         DEBUG(DEBUG_ERR,
4197                                               (__location__ "Unable to send "
4198                                                "CTDB_CONTROL_RELEASE_IP\n"));
4199                                         talloc_free(state);
4200                                         return -1;
4201                                 }
4202
4203                                 state->c = talloc_steal(state, c);
4204                                 *async_reply = true;
4205                         } else {
4206                                 /* This IP is not hosted on the
4207                                  * current node so just delete it
4208                                  * now. */
4209                                 do_delete_ip(ctdb, vnn);
4210                         }
4211
4212                         return 0;
4213                 }
4214         }
4215
4216         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4217                          ctdb_addr_to_str(&pub->addr)));
4218         return -1;
4219 }
4220
4221
/* State handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* the request that ctdb_ipreallocated_callback() replies to */
	struct ctdb_req_control_old *c;
};
4225
4226 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4227                                         int status, void *p)
4228 {
4229         struct ipreallocated_callback_state *state =
4230                 talloc_get_type(p, struct ipreallocated_callback_state);
4231
4232         if (status != 0) {
4233                 DEBUG(DEBUG_ERR,
4234                       (" \"ipreallocated\" event script failed (status %d)\n",
4235                        status));
4236                 if (status == -ETIME) {
4237                         ctdb_ban_self(ctdb);
4238                 }
4239         }
4240
4241         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4242         talloc_free(state);
4243 }
4244
4245 /* A control to run the ipreallocated event */
4246 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4247                                    struct ctdb_req_control_old *c,
4248                                    bool *async_reply)
4249 {
4250         int ret;
4251         struct ipreallocated_callback_state *state;
4252
4253         state = talloc(ctdb, struct ipreallocated_callback_state);
4254         CTDB_NO_MEMORY(ctdb, state);
4255
4256         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4257
4258         ret = ctdb_event_script_callback(ctdb, state,
4259                                          ctdb_ipreallocated_callback, state,
4260                                          CTDB_EVENT_IPREALLOCATED,
4261                                          "%s", "");
4262
4263         if (ret != 0) {
4264                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4265                 talloc_free(state);
4266                 return -1;
4267         }
4268
4269         /* tell the control that we will be reply asynchronously */
4270         state->c    = talloc_steal(state, c);
4271         *async_reply = true;
4272
4273         return 0;
4274 }
4275
4276
4277 /* This function is called from the recovery daemon to verify that a remote
4278    node has the expected ip allocation.
4279    This is verified against ctdb->ip_tree
4280 */
4281 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4282                                        struct ctdb_public_ip_list_old *ips,
4283                                        uint32_t pnn)
4284 {
4285         struct public_ip_list *tmp_ip;
4286         int i;
4287
4288         if (ctdb->ip_tree == NULL) {
4289                 /* don't know the expected allocation yet, assume remote node
4290                    is correct. */
4291                 return 0;
4292         }
4293
4294         if (ips == NULL) {
4295                 return 0;
4296         }
4297
4298         for (i=0; i<ips->num; i++) {
4299                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4300                 if (tmp_ip == NULL) {
4301                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4302                         return -1;
4303                 }
4304
4305                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4306                         continue;
4307                 }
4308
4309                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4310                         DEBUG(DEBUG_ERR,
4311                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4312                                pnn,
4313                                ctdb_addr_to_str(&ips->ips[i].addr),
4314                                ips->ips[i].pnn, tmp_ip->pnn));
4315                         return -1;
4316                 }
4317         }
4318
4319         return 0;
4320 }
4321
4322 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4323 {
4324         struct public_ip_list *tmp_ip;
4325
4326         /* IP tree is never built if DisableIPFailover is set */
4327         if (ctdb->tunable.disable_ip_failover != 0) {
4328                 return 0;
4329         }
4330
4331         if (ctdb->ip_tree == NULL) {
4332                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4333                 return -1;
4334         }
4335
4336         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4337         if (tmp_ip == NULL) {
4338                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4339                 return -1;
4340         }
4341
4342         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4343         tmp_ip->pnn = ip->pnn;
4344
4345         return 0;
4346 }
4347
4348 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4349 {
4350         TALLOC_FREE(ctdb->ip_tree);
4351 }
4352
/* State of one "reload public IPs" run that is carried out by a child process */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* request to answer from the destructor; NULL once answered */
	struct ctdb_req_control_old *c;
	/* status for the reply; -1 until the child has reported */
	int status;
	/* pipe: the child writes its result to fd[1], the parent reads fd[0] */
	int fd[2];
	/* pid of the child, killed by the destructor */
	pid_t child;
	/* fd event watching fd[0] */
	struct tevent_fd *fde;
};
4361
4362 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4363 {
4364         if (h == h->ctdb->reload_ips) {
4365                 h->ctdb->reload_ips = NULL;
4366         }
4367         if (h->c != NULL) {
4368                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4369                 h->c = NULL;
4370         }
4371         ctdb_kill(h->ctdb, h->child, SIGKILL);
4372         return 0;
4373 }
4374
4375 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4376                                          struct tevent_timer *te,
4377                                          struct timeval t, void *private_data)
4378 {
4379         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4380
4381         talloc_free(h);
4382 }
4383
4384 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4385                                          struct tevent_fd *fde,
4386                                          uint16_t flags, void *private_data)
4387 {
4388         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4389
4390         char res;
4391         int ret;
4392
4393         ret = sys_read(h->fd[0], &res, 1);
4394         if (ret < 1 || res != 0) {
4395                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4396                 res = 1;
4397         }
4398         h->status = res;
4399
4400         talloc_free(h);
4401 }
4402
4403 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4404 {
4405         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4406         struct ctdb_public_ip_list_old *ips;
4407         struct ctdb_vnn *vnn;
4408         struct client_async_data *async_data;
4409         struct timeval timeout;
4410         TDB_DATA data;
4411         struct ctdb_client_control_state *state;
4412         bool first_add;
4413         int i, ret;
4414
4415         CTDB_NO_MEMORY(ctdb, mem_ctx);
4416
4417         /* Read IPs from local node */
4418         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4419                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4420         if (ret != 0) {
4421                 DEBUG(DEBUG_ERR,
4422                       ("Unable to fetch public IPs from local node\n"));
4423                 talloc_free(mem_ctx);
4424                 return -1;
4425         }
4426
4427         /* Read IPs file - this is safe since this is a child process */
4428         ctdb->vnn = NULL;
4429         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4430                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4431                 talloc_free(mem_ctx);
4432                 return -1;
4433         }
4434
4435         async_data = talloc_zero(mem_ctx, struct client_async_data);
4436         CTDB_NO_MEMORY(ctdb, async_data);
4437
4438         /* Compare IPs between node and file for IPs to be deleted */
4439         for (i = 0; i < ips->num; i++) {
4440                 /* */
4441                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4442                         if (ctdb_same_ip(&vnn->public_address,
4443                                          &ips->ips[i].addr)) {
4444                                 /* IP is still in file */
4445                                 break;
4446                         }
4447                 }
4448
4449                 if (vnn == NULL) {
4450                         /* Delete IP ips->ips[i] */
4451                         struct ctdb_addr_info_old *pub;
4452
4453                         DEBUG(DEBUG_NOTICE,
4454                               ("IP %s no longer configured, deleting it\n",
4455                                ctdb_addr_to_str(&ips->ips[i].addr)));
4456
4457                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4458                         CTDB_NO_MEMORY(ctdb, pub);
4459
4460                         pub->addr  = ips->ips[i].addr;
4461                         pub->mask  = 0;
4462                         pub->len   = 0;
4463
4464                         timeout = TAKEOVER_TIMEOUT();
4465
4466                         data.dsize = offsetof(struct ctdb_addr_info_old,
4467                                               iface) + pub->len;
4468                         data.dptr = (uint8_t *)pub;
4469
4470                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4471                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4472                                                   0, data, async_data,
4473                                                   &timeout, NULL);
4474                         if (state == NULL) {
4475                                 DEBUG(DEBUG_ERR,
4476                                       (__location__
4477                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4478                                 goto failed;
4479                         }
4480
4481                         ctdb_client_async_add(async_data, state);
4482                 }
4483         }
4484
4485         /* Compare IPs between node and file for IPs to be added */
4486         first_add = true;
4487         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4488                 for (i = 0; i < ips->num; i++) {
4489                         if (ctdb_same_ip(&vnn->public_address,
4490                                          &ips->ips[i].addr)) {
4491                                 /* IP already on node */
4492                                 break;
4493                         }
4494                 }
4495                 if (i == ips->num) {
4496                         /* Add IP ips->ips[i] */
4497                         struct ctdb_addr_info_old *pub;
4498                         const char *ifaces = NULL;
4499                         uint32_t len;
4500                         int iface = 0;
4501
4502                         DEBUG(DEBUG_NOTICE,
4503                               ("New IP %s configured, adding it\n",
4504                                ctdb_addr_to_str(&vnn->public_address)));
4505                         if (first_add) {
4506                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4507
4508                                 data.dsize = sizeof(pnn);
4509                                 data.dptr  = (uint8_t *)&pnn;
4510
4511                                 ret = ctdb_client_send_message(
4512                                         ctdb,
4513                                         CTDB_BROADCAST_CONNECTED,
4514                                         CTDB_SRVID_REBALANCE_NODE,
4515                                         data);
4516                                 if (ret != 0) {
4517                                         DEBUG(DEBUG_WARNING,
4518                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4519                                 }
4520
4521                                 first_add = false;
4522                         }
4523
4524                         ifaces = vnn->ifaces[0];
4525                         iface = 1;
4526                         while (vnn->ifaces[iface] != NULL) {
4527                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4528                                                          vnn->ifaces[iface]);
4529                                 iface++;
4530                         }
4531
4532                         len   = strlen(ifaces) + 1;
4533                         pub = talloc_zero_size(mem_ctx,
4534                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4535                         CTDB_NO_MEMORY(ctdb, pub);
4536
4537                         pub->addr  = vnn->public_address;
4538                         pub->mask  = vnn->public_netmask_bits;
4539                         pub->len   = len;
4540                         memcpy(&pub->iface[0], ifaces, pub->len);
4541
4542                         timeout = TAKEOVER_TIMEOUT();
4543
4544                         data.dsize = offsetof(struct ctdb_addr_info_old,
4545                                               iface) + pub->len;
4546                         data.dptr = (uint8_t *)pub;
4547
4548                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4549                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4550                                                   0, data, async_data,
4551                                                   &timeout, NULL);
4552                         if (state == NULL) {
4553                                 DEBUG(DEBUG_ERR,
4554                                       (__location__
4555                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4556                                 goto failed;
4557                         }
4558
4559                         ctdb_client_async_add(async_data, state);
4560                 }
4561         }
4562
4563         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4564                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4565                 goto failed;
4566         }
4567
4568         talloc_free(mem_ctx);
4569         return 0;
4570
4571 failed:
4572         talloc_free(mem_ctx);
4573         return -1;
4574 }
4575
4576 /* This control is sent to force the node to re-read the public addresses file
4577    and drop any addresses we should nnot longer host, and add new addresses
4578    that we are now able to host
4579 */
4580 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4581 {
4582         struct ctdb_reloadips_handle *h;
4583         pid_t parent = getpid();
4584
4585         if (ctdb->reload_ips != NULL) {
4586                 talloc_free(ctdb->reload_ips);
4587                 ctdb->reload_ips = NULL;
4588         }
4589
4590         h = talloc(ctdb, struct ctdb_reloadips_handle);
4591         CTDB_NO_MEMORY(ctdb, h);
4592         h->ctdb     = ctdb;
4593         h->c        = NULL;
4594         h->status   = -1;
4595         
4596         if (pipe(h->fd) == -1) {
4597                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4598                 talloc_free(h);
4599                 return -1;
4600         }
4601
4602         h->child = ctdb_fork(ctdb);
4603         if (h->child == (pid_t)-1) {
4604                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4605                 close(h->fd[0]);
4606                 close(h->fd[1]);
4607                 talloc_free(h);
4608                 return -1;
4609         }
4610
4611         /* child process */
4612         if (h->child == 0) {
4613                 signed char res = 0;
4614
4615                 close(h->fd[0]);
4616                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4617
4618                 prctl_set_comment("ctdb_reloadips");
4619                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4620                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4621                         res = -1;
4622                 } else {
4623                         res = ctdb_reloadips_child(ctdb);
4624                         if (res != 0) {
4625                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4626                         }
4627                 }
4628
4629                 sys_write(h->fd[1], &res, 1);
4630                 /* make sure we die when our parent dies */
4631                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4632                         sleep(5);
4633                 }
4634                 _exit(0);
4635         }
4636
4637         h->c             = talloc_steal(h, c);
4638
4639         close(h->fd[1]);
4640         set_close_on_exec(h->fd[0]);
4641
4642         talloc_set_destructor(h, ctdb_reloadips_destructor);
4643
4644
4645         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4646                                ctdb_reloadips_child_handler, (void *)h);
4647         tevent_fd_set_auto_close(h->fde);
4648
4649         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4650                          ctdb_reloadips_timeout_event, h);
4651
4652         /* we reply later */
4653         *async_reply = true;
4654         return 0;
4655 }