ctdb-ipalloc: Use number of nodes from IP allocation state
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/*
 * Timeout for takeover events, built from the takeover_timeout tunable
 * via timeval_current_ofs().  The expansion refers to a variable named
 * `ctdb`, which must be in scope wherever the macro is used.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
 * Gratuitous ARP schedule used by ctdb_control_send_arp():
 * CTDB_ARP_INTERVAL is the seconds part of the delay between rounds,
 * CTDB_ARP_REPEAT is the number of rounds after which sending stops.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
49
/*
 * Flags used in IP allocation algorithms.
 *
 * NOTE(review): the exact meaning of each flag is defined by the
 * allocation code that consumes it, which is outside this section.
 */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
};
55
/* Identifies the IP allocation algorithm (see ipalloc_state.algorithm). */
enum ipalloc_algorithm {
        IPALLOC_DETERMINISTIC,
        IPALLOC_NONDETERMINISTIC,
        IPALLOC_LCP2,
};
61
62 struct ipalloc_state {
63         uint32_t num;
64
65         /* Arrays with data for each node */
66         struct ctdb_public_ip_list_old **known_public_ips;
67         struct ctdb_public_ip_list_old **available_public_ips;
68
69         enum ipalloc_algorithm algorithm;
70         uint32_t no_ip_failback;
71 };
72
/*
 * One local network interface, kept on the ctdb->ifaces doubly linked
 * list.
 */
struct ctdb_interface {
        /* list linkage (DLIST_ADD / DLIST_REMOVE) */
        struct ctdb_interface *prev;
        struct ctdb_interface *next;
        /* interface name; allocated as a talloc child of this struct */
        const char *name;
        /* set to true when the entry is created */
        bool link_up;
        /* number of VNNs currently assigned to this interface */
        uint32_t references;
};
79
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
81 {
82         if (vnn->iface) {
83                 return vnn->iface->name;
84         }
85
86         return "__none__";
87 }
88
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
90 {
91         struct ctdb_interface *i;
92
93         /* Verify that we don't have an entry for this ip yet */
94         for (i=ctdb->ifaces;i;i=i->next) {
95                 if (strcmp(i->name, iface) == 0) {
96                         return 0;
97                 }
98         }
99
100         /* create a new structure for this interface */
101         i = talloc_zero(ctdb, struct ctdb_interface);
102         CTDB_NO_MEMORY_FATAL(ctdb, i);
103         i->name = talloc_strdup(i, iface);
104         CTDB_NO_MEMORY(ctdb, i->name);
105
106         i->link_up = true;
107
108         DLIST_ADD(ctdb->ifaces, i);
109
110         return 0;
111 }
112
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
114                                         const char *name)
115 {
116         int n;
117
118         for (n = 0; vnn->ifaces[n] != NULL; n++) {
119                 if (strcmp(name, vnn->ifaces[n]) == 0) {
120                         return true;
121                 }
122         }
123
124         return false;
125 }
126
127 /* If any interfaces now have no possible IPs then delete them.  This
128  * implementation is naive (i.e. simple) rather than clever
129  * (i.e. complex).  Given that this is run on delip and that operation
130  * is rare, this doesn't need to be efficient - it needs to be
131  * foolproof.  One alternative is reference counting, where the logic
132  * is distributed and can, therefore, be broken in multiple places.
133  * Another alternative is to build a red-black tree of interfaces that
134  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135  * once) and then walking ctdb->ifaces once and deleting those not in
136  * the tree.  Let's go to one of those if the naive implementation
137  * causes problems...  :-)
138  */
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140                                         struct ctdb_vnn *vnn)
141 {
142         struct ctdb_interface *i, *next;
143
144         /* For each interface, check if there's an IP using it. */
145         for (i = ctdb->ifaces; i != NULL; i = next) {
146                 struct ctdb_vnn *tv;
147                 bool found;
148                 next = i->next;
149
150                 /* Only consider interfaces named in the given VNN. */
151                 if (!vnn_has_interface_with_name(vnn, i->name)) {
152                         continue;
153                 }
154
155                 /* Is the "single IP" on this interface? */
156                 if ((ctdb->single_ip_vnn != NULL) &&
157                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159                         /* Found, next interface please... */
160                         continue;
161                 }
162                 /* Search for a vnn with this interface. */
163                 found = false;
164                 for (tv=ctdb->vnn; tv; tv=tv->next) {
165                         if (vnn_has_interface_with_name(tv, i->name)) {
166                                 found = true;
167                                 break;
168                         }
169                 }
170
171                 if (!found) {
172                         /* None of the VNNs are using this interface. */
173                         DLIST_REMOVE(ctdb->ifaces, i);
174                         talloc_free(i);
175                 }
176         }
177 }
178
179
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
181                                               const char *iface)
182 {
183         struct ctdb_interface *i;
184
185         for (i=ctdb->ifaces;i;i=i->next) {
186                 if (strcmp(i->name, iface) == 0) {
187                         return i;
188                 }
189         }
190
191         return NULL;
192 }
193
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195                                                   struct ctdb_vnn *vnn)
196 {
197         int i;
198         struct ctdb_interface *cur = NULL;
199         struct ctdb_interface *best = NULL;
200
201         for (i=0; vnn->ifaces[i]; i++) {
202
203                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
204                 if (cur == NULL) {
205                         continue;
206                 }
207
208                 if (!cur->link_up) {
209                         continue;
210                 }
211
212                 if (best == NULL) {
213                         best = cur;
214                         continue;
215                 }
216
217                 if (cur->references < best->references) {
218                         best = cur;
219                         continue;
220                 }
221         }
222
223         return best;
224 }
225
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227                                      struct ctdb_vnn *vnn)
228 {
229         struct ctdb_interface *best = NULL;
230
231         if (vnn->iface) {
232                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                                    "still assigned to iface '%s'\n",
234                                    ctdb_addr_to_str(&vnn->public_address),
235                                    ctdb_vnn_iface_string(vnn)));
236                 return 0;
237         }
238
239         best = ctdb_vnn_best_iface(ctdb, vnn);
240         if (best == NULL) {
241                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242                                   "cannot assign to iface any iface\n",
243                                   ctdb_addr_to_str(&vnn->public_address)));
244                 return -1;
245         }
246
247         vnn->iface = best;
248         best->references++;
249         vnn->pnn = ctdb->pnn;
250
251         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252                            "now assigned to iface '%s' refs[%d]\n",
253                            ctdb_addr_to_str(&vnn->public_address),
254                            ctdb_vnn_iface_string(vnn),
255                            best->references));
256         return 0;
257 }
258
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260                                     struct ctdb_vnn *vnn)
261 {
262         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263                            "now unassigned (old iface '%s' refs[%d])\n",
264                            ctdb_addr_to_str(&vnn->public_address),
265                            ctdb_vnn_iface_string(vnn),
266                            vnn->iface?vnn->iface->references:0));
267         if (vnn->iface) {
268                 vnn->iface->references--;
269         }
270         vnn->iface = NULL;
271         if (vnn->pnn == ctdb->pnn) {
272                 vnn->pnn = -1;
273         }
274 }
275
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277                                struct ctdb_vnn *vnn)
278 {
279         int i;
280
281         /* Nodes that are not RUNNING can not host IPs */
282         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
283                 return false;
284         }
285
286         if (vnn->delete_pending) {
287                 return false;
288         }
289
290         if (vnn->iface && vnn->iface->link_up) {
291                 return true;
292         }
293
294         for (i=0; vnn->ifaces[i]; i++) {
295                 struct ctdb_interface *cur;
296
297                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
298                 if (cur == NULL) {
299                         continue;
300                 }
301
302                 if (cur->link_up) {
303                         return true;
304                 }
305         }
306
307         return false;
308 }
309
310 struct ctdb_takeover_arp {
311         struct ctdb_context *ctdb;
312         uint32_t count;
313         ctdb_sock_addr addr;
314         struct ctdb_tcp_array *tcparray;
315         struct ctdb_vnn *vnn;
316 };
317
318
319 /*
320   lists of tcp endpoints
321  */
322 struct ctdb_tcp_list {
323         struct ctdb_tcp_list *prev, *next;
324         struct ctdb_connection connection;
325 };
326
327 /*
328   list of clients to kill on IP release
329  */
330 struct ctdb_client_ip {
331         struct ctdb_client_ip *prev, *next;
332         struct ctdb_context *ctdb;
333         ctdb_sock_addr addr;
334         uint32_t client_id;
335 };
336
337
338 /*
339   send a gratuitous arp
340  */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342                                   struct tevent_timer *te,
343                                   struct timeval t, void *private_data)
344 {
345         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
346                                                         struct ctdb_takeover_arp);
347         int i, ret;
348         struct ctdb_tcp_array *tcparray;
349         const char *iface = ctdb_vnn_iface_string(arp->vnn);
350
351         ret = ctdb_sys_send_arp(&arp->addr, iface);
352         if (ret != 0) {
353                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354                                   iface, strerror(errno)));
355         }
356
357         tcparray = arp->tcparray;
358         if (tcparray) {
359                 for (i=0;i<tcparray->num;i++) {
360                         struct ctdb_connection *tcon;
361
362                         tcon = &tcparray->connections[i];
363                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
365                                 ctdb_addr_to_str(&tcon->src),
366                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
367                         ret = ctdb_sys_send_tcp(
368                                 &tcon->src,
369                                 &tcon->dst,
370                                 0, 0, 0);
371                         if (ret != 0) {
372                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373                                         ctdb_addr_to_str(&tcon->src)));
374                         }
375                 }
376         }
377
378         arp->count++;
379
380         if (arp->count == CTDB_ARP_REPEAT) {
381                 talloc_free(arp);
382                 return;
383         }
384
385         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387                          ctdb_control_send_arp, arp);
388 }
389
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391                                        struct ctdb_vnn *vnn)
392 {
393         struct ctdb_takeover_arp *arp;
394         struct ctdb_tcp_array *tcparray;
395
396         if (!vnn->takeover_ctx) {
397                 vnn->takeover_ctx = talloc_new(vnn);
398                 if (!vnn->takeover_ctx) {
399                         return -1;
400                 }
401         }
402
403         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
404         if (!arp) {
405                 return -1;
406         }
407
408         arp->ctdb = ctdb;
409         arp->addr = vnn->public_address;
410         arp->vnn  = vnn;
411
412         tcparray = vnn->tcp_array;
413         if (tcparray) {
414                 /* add all of the known tcp connections for this IP to the
415                    list of tcp connections to send tickle acks for */
416                 arp->tcparray = talloc_steal(arp, tcparray);
417
418                 vnn->tcp_array = NULL;
419                 vnn->tcp_update_needed = true;
420         }
421
422         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423                          timeval_zero(), ctdb_control_send_arp, arp);
424
425         return 0;
426 }
427
428 struct takeover_callback_state {
429         struct ctdb_req_control_old *c;
430         ctdb_sock_addr *addr;
431         struct ctdb_vnn *vnn;
432 };
433
/*
 * Private data for a "takeip" event: created in ctdb_do_takeip() and
 * consumed by ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
        /* control request to reply to when the event finishes */
        struct ctdb_req_control_old *c;
        /* the public address being taken */
        struct ctdb_vnn *vnn;
};
438
439 /*
440   called when takeip event finishes
441  */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
443                                     void *private_data)
444 {
445         struct ctdb_do_takeip_state *state =
446                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
447         int32_t ret;
448         TDB_DATA data;
449
450         if (status != 0) {
451                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
452         
453                 if (status == -ETIME) {
454                         ctdb_ban_self(ctdb);
455                 }
456                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457                                  ctdb_addr_to_str(&state->vnn->public_address),
458                                  ctdb_vnn_iface_string(state->vnn)));
459                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
460
461                 node->flags |= NODE_FLAGS_UNHEALTHY;
462                 talloc_free(state);
463                 return;
464         }
465
466         if (ctdb->do_checkpublicip) {
467
468         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469         if (ret != 0) {
470                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471                 talloc_free(state);
472                 return;
473         }
474
475         }
476
477         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478         data.dsize = strlen((char *)data.dptr) + 1;
479         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
480
481         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
482
483
484         /* the control succeeded */
485         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
486         talloc_free(state);
487         return;
488 }
489
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
491 {
492         state->vnn->update_in_flight = false;
493         return 0;
494 }
495
496 /*
497   take over an ip address
498  */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500                               struct ctdb_req_control_old *c,
501                               struct ctdb_vnn *vnn)
502 {
503         int ret;
504         struct ctdb_do_takeip_state *state;
505
506         if (vnn->update_in_flight) {
507                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508                                     "update for this IP already in flight\n",
509                                     ctdb_addr_to_str(&vnn->public_address),
510                                     vnn->public_netmask_bits));
511                 return -1;
512         }
513
514         ret = ctdb_vnn_assign_iface(ctdb, vnn);
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517                                  "assign a usable interface\n",
518                                  ctdb_addr_to_str(&vnn->public_address),
519                                  vnn->public_netmask_bits));
520                 return -1;
521         }
522
523         state = talloc(vnn, struct ctdb_do_takeip_state);
524         CTDB_NO_MEMORY(ctdb, state);
525
526         state->c = talloc_steal(ctdb, c);
527         state->vnn   = vnn;
528
529         vnn->update_in_flight = true;
530         talloc_set_destructor(state, ctdb_takeip_destructor);
531
532         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533                             ctdb_addr_to_str(&vnn->public_address),
534                             vnn->public_netmask_bits,
535                             ctdb_vnn_iface_string(vnn)));
536
537         ret = ctdb_event_script_callback(ctdb,
538                                          state,
539                                          ctdb_do_takeip_callback,
540                                          state,
541                                          CTDB_EVENT_TAKE_IP,
542                                          "%s %s %u",
543                                          ctdb_vnn_iface_string(vnn),
544                                          ctdb_addr_to_str(&vnn->public_address),
545                                          vnn->public_netmask_bits);
546
547         if (ret != 0) {
548                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549                         ctdb_addr_to_str(&vnn->public_address),
550                         ctdb_vnn_iface_string(vnn)));
551                 talloc_free(state);
552                 return -1;
553         }
554
555         return 0;
556 }
557
/*
 * Private data for an "updateip" event: created in ctdb_do_updateip()
 * and consumed by ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
        /* control request to reply to when the event finishes */
        struct ctdb_req_control_old *c;
        /* interface the address was on before the move */
        struct ctdb_interface *old;
        /* the public address being moved */
        struct ctdb_vnn *vnn;
};
563
564 /*
565   called when updateip event finishes
566  */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
568                                       void *private_data)
569 {
570         struct ctdb_do_updateip_state *state =
571                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
572         int32_t ret;
573
574         if (status != 0) {
575                 if (status == -ETIME) {
576                         ctdb_ban_self(ctdb);
577                 }
578                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579                         ctdb_addr_to_str(&state->vnn->public_address),
580                         state->old->name,
581                         ctdb_vnn_iface_string(state->vnn)));
582
583                 /*
584                  * All we can do is reset the old interface
585                  * and let the next run fix it
586                  */
587                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588                 state->vnn->iface = state->old;
589                 state->vnn->iface->references++;
590
591                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
592                 talloc_free(state);
593                 return;
594         }
595
596         if (ctdb->do_checkpublicip) {
597
598         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
599         if (ret != 0) {
600                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
601                 talloc_free(state);
602                 return;
603         }
604
605         }
606
607         /* the control succeeded */
608         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
609         talloc_free(state);
610         return;
611 }
612
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
614 {
615         state->vnn->update_in_flight = false;
616         return 0;
617 }
618
619 /*
620   update (move) an ip address
621  */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623                                 struct ctdb_req_control_old *c,
624                                 struct ctdb_vnn *vnn)
625 {
626         int ret;
627         struct ctdb_do_updateip_state *state;
628         struct ctdb_interface *old = vnn->iface;
629         const char *new_name;
630
631         if (vnn->update_in_flight) {
632                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633                                     "update for this IP already in flight\n",
634                                     ctdb_addr_to_str(&vnn->public_address),
635                                     vnn->public_netmask_bits));
636                 return -1;
637         }
638
639         ctdb_vnn_unassign_iface(ctdb, vnn);
640         ret = ctdb_vnn_assign_iface(ctdb, vnn);
641         if (ret != 0) {
642                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643                                  "assin a usable interface (old iface '%s')\n",
644                                  ctdb_addr_to_str(&vnn->public_address),
645                                  vnn->public_netmask_bits,
646                                  old->name));
647                 return -1;
648         }
649
650         new_name = ctdb_vnn_iface_string(vnn);
651         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652                 /* A benign update from one interface onto itself.
653                  * no need to run the eventscripts in this case, just return
654                  * success.
655                  */
656                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
657                 return 0;
658         }
659
660         state = talloc(vnn, struct ctdb_do_updateip_state);
661         CTDB_NO_MEMORY(ctdb, state);
662
663         state->c = talloc_steal(ctdb, c);
664         state->old = old;
665         state->vnn = vnn;
666
667         vnn->update_in_flight = true;
668         talloc_set_destructor(state, ctdb_updateip_destructor);
669
670         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671                             "interface %s to %s\n",
672                             ctdb_addr_to_str(&vnn->public_address),
673                             vnn->public_netmask_bits,
674                             old->name,
675                             new_name));
676
677         ret = ctdb_event_script_callback(ctdb,
678                                          state,
679                                          ctdb_do_updateip_callback,
680                                          state,
681                                          CTDB_EVENT_UPDATE_IP,
682                                          "%s %s %s %u",
683                                          state->old->name,
684                                          new_name,
685                                          ctdb_addr_to_str(&vnn->public_address),
686                                          vnn->public_netmask_bits);
687         if (ret != 0) {
688                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689                                  ctdb_addr_to_str(&vnn->public_address),
690                                  old->name, new_name));
691                 talloc_free(state);
692                 return -1;
693         }
694
695         return 0;
696 }
697
698 /*
699   Find the vnn of the node that has a public ip address
700   returns -1 if the address is not known as a public address
701  */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
703 {
704         struct ctdb_vnn *vnn;
705
706         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707                 if (ctdb_same_ip(&vnn->public_address, addr)) {
708                         return vnn;
709                 }
710         }
711
712         return NULL;
713 }
714
715 /*
716   take over an ip address
717  */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719                                  struct ctdb_req_control_old *c,
720                                  TDB_DATA indata,
721                                  bool *async_reply)
722 {
723         int ret;
724         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725         struct ctdb_vnn *vnn;
726         bool have_ip = false;
727         bool do_updateip = false;
728         bool do_takeip = false;
729         struct ctdb_interface *best_iface = NULL;
730
731         if (pip->pnn != ctdb->pnn) {
732                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733                                  "with pnn %d, but we're node %d\n",
734                                  ctdb_addr_to_str(&pip->addr),
735                                  pip->pnn, ctdb->pnn));
736                 return -1;
737         }
738
739         /* update out vnn list */
740         vnn = find_public_ip_vnn(ctdb, &pip->addr);
741         if (vnn == NULL) {
742                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743                         ctdb_addr_to_str(&pip->addr)));
744                 return 0;
745         }
746
747         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748                 have_ip = ctdb_sys_have_ip(&pip->addr);
749         }
750         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751         if (best_iface == NULL) {
752                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753                                  "a usable interface (old %s, have_ip %d)\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  vnn->public_netmask_bits,
756                                  ctdb_vnn_iface_string(vnn),
757                                  have_ip));
758                 return -1;
759         }
760
761         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
763                 have_ip = false;
764         }
765
766
767         if (vnn->iface == NULL && have_ip) {
768                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770                                  ctdb_addr_to_str(&vnn->public_address)));
771                 return 0;
772         }
773
774         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776                                   "and we have it on iface[%s], but it was assigned to node %d"
777                                   "and we are node %d, banning ourself\n",
778                                  ctdb_addr_to_str(&vnn->public_address),
779                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
780                 ctdb_ban_self(ctdb);
781                 return -1;
782         }
783
784         if (vnn->pnn == -1 && have_ip) {
785                 vnn->pnn = ctdb->pnn;
786                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787                                   "and we already have it on iface[%s], update local daemon\n",
788                                  ctdb_addr_to_str(&vnn->public_address),
789                                   ctdb_vnn_iface_string(vnn)));
790                 return 0;
791         }
792
793         if (vnn->iface) {
794                 if (vnn->iface != best_iface) {
795                         if (!vnn->iface->link_up) {
796                                 do_updateip = true;
797                         } else if (vnn->iface->references > (best_iface->references + 1)) {
798                                 /* only move when the rebalance gains something */
799                                         do_updateip = true;
800                         }
801                 }
802         }
803
804         if (!have_ip) {
805                 if (do_updateip) {
806                         ctdb_vnn_unassign_iface(ctdb, vnn);
807                         do_updateip = false;
808                 }
809                 do_takeip = true;
810         }
811
812         if (do_takeip) {
813                 ret = ctdb_do_takeip(ctdb, c, vnn);
814                 if (ret != 0) {
815                         return -1;
816                 }
817         } else if (do_updateip) {
818                 ret = ctdb_do_updateip(ctdb, c, vnn);
819                 if (ret != 0) {
820                         return -1;
821                 }
822         } else {
823                 /*
824                  * The interface is up and the kernel known the ip
825                  * => do nothing
826                  */
827                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828                         ctdb_addr_to_str(&pip->addr),
829                         vnn->public_netmask_bits,
830                         ctdb_vnn_iface_string(vnn)));
831                 return 0;
832         }
833
834         /* tell ctdb_control.c that we will be replying asynchronously */
835         *async_reply = true;
836
837         return 0;
838 }
839
840 /*
841   kill any clients that are registered with a IP that is being released
842  */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
844 {
845         struct ctdb_client_ip *ip;
846
847         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848                 ctdb_addr_to_str(addr)));
849
850         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851                 ctdb_sock_addr tmp_addr;
852
853                 tmp_addr = ip->addr;
854                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
855                         ip->client_id,
856                         ctdb_addr_to_str(&ip->addr)));
857
858                 if (ctdb_same_ip(&tmp_addr, addr)) {
859                         struct ctdb_client *client = reqid_find(ctdb->idr,
860                                                                 ip->client_id,
861                                                                 struct ctdb_client);
862                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
863                                 ip->client_id,
864                                 ctdb_addr_to_str(&ip->addr),
865                                 client->pid));
866
867                         if (client->pid != 0) {
868                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869                                         (unsigned)client->pid,
870                                         ctdb_addr_to_str(addr),
871                                         ip->client_id));
872                                 kill(client->pid, SIGKILL);
873                         }
874                 }
875         }
876 }
877
/*
  Drop a public address from this node: unlink vnn from the ctdb->vnn
  list, unassign its interface and free it.

  The statement order matters: vnn is taken off the list before the
  orphaned-interface cleanup runs and is only freed at the very end.
  NOTE(review): ctdb_remove_orphaned_ifaces() presumably removes
  interfaces that no remaining vnn references - confirm against its
  definition.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        /* vnn must not be touched by the caller after this point */
        talloc_free(vnn);
}
885
886 /*
887   called when releaseip event finishes
888  */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
890                                 void *private_data)
891 {
892         struct takeover_callback_state *state = 
893                 talloc_get_type(private_data, struct takeover_callback_state);
894         TDB_DATA data;
895
896         if (status == -ETIME) {
897                 ctdb_ban_self(ctdb);
898         }
899
900         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901                 if  (ctdb_sys_have_ip(state->addr)) {
902                         DEBUG(DEBUG_ERR,
903                               ("IP %s still hosted during release IP callback, failing\n",
904                                ctdb_addr_to_str(state->addr)));
905                         ctdb_request_control_reply(ctdb, state->c,
906                                                    NULL, -1, NULL);
907                         talloc_free(state);
908                         return;
909                 }
910         }
911
912         /* send a message to all clients of this node telling them
913            that the cluster has been reconfigured and they should
914            release any sockets on this IP */
915         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917         data.dsize = strlen((char *)data.dptr)+1;
918
919         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
920
921         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
922
923         /* kill clients that have registered with this IP */
924         release_kill_clients(ctdb, state->addr);
925
926         ctdb_vnn_unassign_iface(ctdb, state->vnn);
927
928         /* Process the IP if it has been marked for deletion */
929         if (state->vnn->delete_pending) {
930                 do_delete_ip(ctdb, state->vnn);
931                 state->vnn = NULL;
932         }
933
934         /* the control succeeded */
935         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
936         talloc_free(state);
937 }
938
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
940 {
941         if (state->vnn != NULL) {
942                 state->vnn->update_in_flight = false;
943         }
944         return 0;
945 }
946
947 /*
948   release an ip address
949  */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
951                                 struct ctdb_req_control_old *c,
952                                 TDB_DATA indata, 
953                                 bool *async_reply)
954 {
955         int ret;
956         struct takeover_callback_state *state;
957         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958         struct ctdb_vnn *vnn;
959         char *iface;
960
961         /* update our vnn list */
962         vnn = find_public_ip_vnn(ctdb, &pip->addr);
963         if (vnn == NULL) {
964                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965                         ctdb_addr_to_str(&pip->addr)));
966                 return 0;
967         }
968         vnn->pnn = pip->pnn;
969
970         /* stop any previous arps */
971         talloc_free(vnn->takeover_ctx);
972         vnn->takeover_ctx = NULL;
973
974         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975          * lazy multicast to drop an IP from any node that isn't the
976          * intended new node.  The following causes makes ctdbd ignore
977          * a release for any address it doesn't host.
978          */
979         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980                 if (!ctdb_sys_have_ip(&pip->addr)) {
981                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982                                 ctdb_addr_to_str(&pip->addr),
983                                 vnn->public_netmask_bits,
984                                 ctdb_vnn_iface_string(vnn)));
985                         ctdb_vnn_unassign_iface(ctdb, vnn);
986                         return 0;
987                 }
988         } else {
989                 if (vnn->iface == NULL) {
990                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991                                            ctdb_addr_to_str(&pip->addr),
992                                            vnn->public_netmask_bits));
993                         return 0;
994                 }
995         }
996
997         /* There is a potential race between take_ip and us because we
998          * update the VNN via a callback that run when the
999          * eventscripts have been run.  Avoid the race by allowing one
1000          * update to be in flight at a time.
1001          */
1002         if (vnn->update_in_flight) {
1003                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004                                     "update for this IP already in flight\n",
1005                                     ctdb_addr_to_str(&vnn->public_address),
1006                                     vnn->public_netmask_bits));
1007                 return -1;
1008         }
1009
1010         iface = strdup(ctdb_vnn_iface_string(vnn));
1011
1012         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1013                 ctdb_addr_to_str(&pip->addr),
1014                 vnn->public_netmask_bits,
1015                 iface,
1016                 pip->pnn));
1017
1018         state = talloc(ctdb, struct takeover_callback_state);
1019         if (state == NULL) {
1020                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021                                __FILE__, __LINE__);
1022                 free(iface);
1023                 return -1;
1024         }
1025
1026         state->c = talloc_steal(state, c);
1027         state->addr = talloc(state, ctdb_sock_addr);       
1028         if (state->addr == NULL) {
1029                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030                                __FILE__, __LINE__);
1031                 free(iface);
1032                 talloc_free(state);
1033                 return -1;
1034         }
1035         *state->addr = pip->addr;
1036         state->vnn   = vnn;
1037
1038         vnn->update_in_flight = true;
1039         talloc_set_destructor(state, ctdb_releaseip_destructor);
1040
1041         ret = ctdb_event_script_callback(ctdb, 
1042                                          state, release_ip_callback, state,
1043                                          CTDB_EVENT_RELEASE_IP,
1044                                          "%s %s %u",
1045                                          iface,
1046                                          ctdb_addr_to_str(&pip->addr),
1047                                          vnn->public_netmask_bits);
1048         free(iface);
1049         if (ret != 0) {
1050                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051                         ctdb_addr_to_str(&pip->addr),
1052                         ctdb_vnn_iface_string(vnn)));
1053                 talloc_free(state);
1054                 return -1;
1055         }
1056
1057         /* tell the control that we will be reply asynchronously */
1058         *async_reply = true;
1059         return 0;
1060 }
1061
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063                                    ctdb_sock_addr *addr,
1064                                    unsigned mask, const char *ifaces,
1065                                    bool check_address)
1066 {
1067         struct ctdb_vnn      *vnn;
1068         uint32_t num = 0;
1069         char *tmp;
1070         const char *iface;
1071         int i;
1072         int ret;
1073
1074         tmp = strdup(ifaces);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 if (!ctdb_sys_check_iface_exists(iface)) {
1077                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1078                         free(tmp);
1079                         return -1;
1080                 }
1081         }
1082         free(tmp);
1083
1084         /* Verify that we don't have an entry for this ip yet */
1085         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1088                                 ctdb_addr_to_str(addr)));
1089                         return -1;
1090                 }               
1091         }
1092
1093         /* create a new vnn structure for this ip address */
1094         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097         tmp = talloc_strdup(vnn, ifaces);
1098         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1104                 num++;
1105         }
1106         talloc_free(tmp);
1107         vnn->ifaces[num] = NULL;
1108         vnn->public_address      = *addr;
1109         vnn->public_netmask_bits = mask;
1110         vnn->pnn                 = -1;
1111         if (check_address) {
1112                 if (ctdb_sys_have_ip(addr)) {
1113                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114                         vnn->pnn = ctdb->pnn;
1115                 }
1116         }
1117
1118         for (i=0; vnn->ifaces[i]; i++) {
1119                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1120                 if (ret != 0) {
1121                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122                                            "for public_address[%s]\n",
1123                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1124                         talloc_free(vnn);
1125                         return -1;
1126                 }
1127         }
1128
1129         DLIST_ADD(ctdb->vnn, vnn);
1130
1131         return 0;
1132 }
1133
1134 /*
1135   setup the public address lists from a file
1136 */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1138 {
1139         char **lines;
1140         int nlines;
1141         int i;
1142
1143         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144         if (lines == NULL) {
1145                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1146                 return -1;
1147         }
1148         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1149                 nlines--;
1150         }
1151
1152         for (i=0;i<nlines;i++) {
1153                 unsigned mask;
1154                 ctdb_sock_addr addr;
1155                 const char *addrstr;
1156                 const char *ifaces;
1157                 char *tok, *line;
1158
1159                 line = lines[i];
1160                 while ((*line == ' ') || (*line == '\t')) {
1161                         line++;
1162                 }
1163                 if (*line == '#') {
1164                         continue;
1165                 }
1166                 if (strcmp(line, "") == 0) {
1167                         continue;
1168                 }
1169                 tok = strtok(line, " \t");
1170                 addrstr = tok;
1171                 tok = strtok(NULL, " \t");
1172                 if (tok == NULL) {
1173                         if (NULL == ctdb->default_public_interface) {
1174                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1175                                          i+1));
1176                                 talloc_free(lines);
1177                                 return -1;
1178                         }
1179                         ifaces = ctdb->default_public_interface;
1180                 } else {
1181                         ifaces = tok;
1182                 }
1183
1184                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1186                         talloc_free(lines);
1187                         return -1;
1188                 }
1189                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1191                         talloc_free(lines);
1192                         return -1;
1193                 }
1194         }
1195
1196
1197         talloc_free(lines);
1198         return 0;
1199 }
1200
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1202                               const char *iface,
1203                               const char *ip)
1204 {
1205         struct ctdb_vnn *svnn;
1206         struct ctdb_interface *cur = NULL;
1207         bool ok;
1208         int ret;
1209
1210         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211         CTDB_NO_MEMORY(ctdb, svnn);
1212
1213         svnn->ifaces = talloc_array(svnn, const char *, 2);
1214         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217         svnn->ifaces[1] = NULL;
1218
1219         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1220         if (!ok) {
1221                 talloc_free(svnn);
1222                 return -1;
1223         }
1224
1225         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1226         if (ret != 0) {
1227                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228                                    "for single_ip[%s]\n",
1229                                    svnn->ifaces[0],
1230                                    ctdb_addr_to_str(&svnn->public_address)));
1231                 talloc_free(svnn);
1232                 return -1;
1233         }
1234
1235         /* assume the single public ip interface is initially "good" */
1236         cur = ctdb_find_iface(ctdb, iface);
1237         if (cur == NULL) {
1238                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1239                 return -1;
1240         }
1241         cur->link_up = true;
1242
1243         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1244         if (ret != 0) {
1245                 talloc_free(svnn);
1246                 return -1;
1247         }
1248
1249         ctdb->single_ip_vnn = svnn;
1250         return 0;
1251 }
1252
/* One public IP address in the singly linked list that the IP
 * allocation code works on. */
struct public_ip_list {
        /* next address in the list, NULL at the end */
        struct public_ip_list *next;
        /* node the address is assigned to; set to -1 while the address
         * is unassigned */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1258
1259 /* Given a physical node, return the number of
1260    public addresses that is currently assigned to this node.
1261 */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1263 {
1264         int num=0;
1265
1266         for (;ips;ips=ips->next) {
1267                 if (ips->pnn == pnn) {
1268                         num++;
1269                 }
1270         }
1271         return num;
1272 }
1273
1274
1275 /* Can the given node host the given IP: is the public IP known to the
1276  * node and is NOIPHOST unset?
1277 */
1278 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1279                              int32_t pnn,
1280                              struct ctdb_ipflags ipflags,
1281                              struct public_ip_list *ip)
1282 {
1283         struct ctdb_public_ip_list_old *public_ips;
1284         int i;
1285
1286         if (ipflags.noiphost) {
1287                 return false;
1288         }
1289
1290         public_ips = ipalloc_state->available_public_ips[pnn];
1291
1292         if (public_ips == NULL) {
1293                 return false;
1294         }
1295
1296         for (i=0; i<public_ips->num; i++) {
1297                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1298                         /* yes, this node can serve this public ip */
1299                         return true;
1300                 }
1301         }
1302
1303         return false;
1304 }
1305
1306 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1307                                  int32_t pnn,
1308                                  struct ctdb_ipflags ipflags,
1309                                  struct public_ip_list *ip)
1310 {
1311         if (ipflags.noiptakeover) {
1312                 return false;
1313         }
1314
1315         return can_node_host_ip(ipalloc_state, pnn, ipflags, ip);
1316 }
1317
1318 /* search the node lists list for a node to takeover this ip.
1319    pick the node that currently are serving the least number of ips
1320    so that the ips get spread out evenly.
1321 */
1322 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1323                               struct ctdb_ipflags *ipflags,
1324                               struct public_ip_list *ip,
1325                               struct public_ip_list *all_ips)
1326 {
1327         int pnn, min=0, num;
1328         int i, numnodes;
1329
1330         numnodes = ipalloc_state->num;
1331         pnn    = -1;
1332         for (i=0; i<numnodes; i++) {
1333                 /* verify that this node can serve this ip */
1334                 if (!can_node_takeover_ip(ipalloc_state, i, ipflags[i], ip)) {
1335                         /* no it couldnt   so skip to the next node */
1336                         continue;
1337                 }
1338
1339                 num = node_ip_coverage(i, all_ips);
1340                 /* was this the first node we checked ? */
1341                 if (pnn == -1) {
1342                         pnn = i;
1343                         min  = num;
1344                 } else {
1345                         if (num < min) {
1346                                 pnn = i;
1347                                 min  = num;
1348                         }
1349                 }
1350         }
1351         if (pnn == -1) {
1352                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1353                         ctdb_addr_to_str(&ip->addr)));
1354
1355                 return -1;
1356         }
1357
1358         ip->pnn = pnn;
1359         return 0;
1360 }
1361
1362 #define IP_KEYLEN       4
1363 static uint32_t *ip_key(ctdb_sock_addr *ip)
1364 {
1365         static uint32_t key[IP_KEYLEN];
1366
1367         bzero(key, sizeof(key));
1368
1369         switch (ip->sa.sa_family) {
1370         case AF_INET:
1371                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1372                 break;
1373         case AF_INET6: {
1374                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1375                 key[0]  = htonl(s6_a32[0]);
1376                 key[1]  = htonl(s6_a32[1]);
1377                 key[2]  = htonl(s6_a32[2]);
1378                 key[3]  = htonl(s6_a32[3]);
1379                 break;
1380         }
1381         default:
1382                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1383                 return key;
1384         }
1385
1386         return key;
1387 }
1388
1389 static void *add_ip_callback(void *parm, void *data)
1390 {
1391         struct public_ip_list *this_ip = parm;
1392         struct public_ip_list *prev_ip = data;
1393
1394         if (prev_ip == NULL) {
1395                 return parm;
1396         }
1397         if (this_ip->pnn == -1) {
1398                 this_ip->pnn = prev_ip->pnn;
1399         }
1400
1401         return parm;
1402 }
1403
1404 static int getips_count_callback(void *param, void *data)
1405 {
1406         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1407         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1408
1409         new_ip->next = *ip_list;
1410         *ip_list     = new_ip;
1411         return 0;
1412 }
1413
1414 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1415                                        struct ctdb_public_ip_list_old *ips,
1416                                        uint32_t pnn);
1417
1418 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1419                                          struct ipalloc_state *ipalloc_state,
1420                                          struct ctdb_node_map_old *nodemap)
1421 {
1422         int j;
1423         int ret;
1424
1425         if (ipalloc_state->num != nodemap->num) {
1426                 DEBUG(DEBUG_ERR,
1427                       (__location__
1428                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1429                        ipalloc_state->num, nodemap->num));
1430                 return -1;
1431         }
1432
1433         for (j=0; j<nodemap->num; j++) {
1434                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1435                         continue;
1436                 }
1437
1438                 /* Retrieve the list of known public IPs from the node */
1439                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1440                                         TAKEOVER_TIMEOUT(),
1441                                         j,
1442                                         ctdb->nodes,
1443                                         0,
1444                                         &ipalloc_state->known_public_ips[j]);
1445                 if (ret != 0) {
1446                         DEBUG(DEBUG_ERR,
1447                               ("Failed to read known public IPs from node: %u\n",
1448                                j));
1449                         return -1;
1450                 }
1451
1452                 if (ctdb->do_checkpublicip) {
1453                         verify_remote_ip_allocation(ctdb,
1454                                                     ipalloc_state->known_public_ips[j],
1455                                                     j);
1456                 }
1457
1458                 /* Retrieve the list of available public IPs from the node */
1459                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1460                                         TAKEOVER_TIMEOUT(),
1461                                         j,
1462                                         ctdb->nodes,
1463                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1464                                         &ipalloc_state->available_public_ips[j]);
1465                 if (ret != 0) {
1466                         DEBUG(DEBUG_ERR,
1467                               ("Failed to read available public IPs from node: %u\n",
1468                                j));
1469                         return -1;
1470                 }
1471         }
1472
1473         return 0;
1474 }
1475
1476 static struct public_ip_list *
1477 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1478 {
1479         int i, j;
1480         struct public_ip_list *ip_list;
1481         struct ctdb_public_ip_list_old *public_ips;
1482
1483         TALLOC_FREE(ctdb->ip_tree);
1484         ctdb->ip_tree = trbt_create(ctdb, 0);
1485
1486         for (i=0; i < ctdb->num_nodes; i++) {
1487                 public_ips = ipalloc_state->known_public_ips[i];
1488
1489                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1490                         continue;
1491                 }
1492
1493                 /* there were no public ips for this node */
1494                 if (public_ips == NULL) {
1495                         continue;
1496                 }
1497
1498                 for (j=0; j < public_ips->num; j++) {
1499                         struct public_ip_list *tmp_ip;
1500
1501                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1502                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503                         /* Do not use information about IP addresses hosted
1504                          * on other nodes, it may not be accurate */
1505                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1507                         } else {
1508                                 tmp_ip->pnn = -1;
1509                         }
1510                         tmp_ip->addr = public_ips->ips[j].addr;
1511                         tmp_ip->next = NULL;
1512
1513                         trbt_insertarray32_callback(ctdb->ip_tree,
1514                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1515                                 add_ip_callback,
1516                                 tmp_ip);
1517                 }
1518         }
1519
1520         ip_list = NULL;
1521         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1522
1523         return ip_list;
1524 }
1525
1526 /* 
1527  * This is the length of the longtest common prefix between the IPs.
1528  * It is calculated by XOR-ing the 2 IPs together and counting the
1529  * number of leading zeroes.  The implementation means that all
1530  * addresses end up being 128 bits long.
1531  *
1532  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534  * lots of nodes and IP addresses?
1535  */
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1537 {
1538         uint32_t ip1_k[IP_KEYLEN];
1539         uint32_t *t;
1540         int i;
1541         uint32_t x;
1542
1543         uint32_t distance = 0;
1544
1545         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1546         t = ip_key(ip2);
1547         for (i=0; i<IP_KEYLEN; i++) {
1548                 x = ip1_k[i] ^ t[i];
1549                 if (x == 0) {
1550                         distance += 32;
1551                 } else {
1552                         /* Count number of leading zeroes. 
1553                          * FIXME? This could be optimised...
1554                          */
1555                         while ((x & (1 << 31)) == 0) {
1556                                 x <<= 1;
1557                                 distance += 1;
1558                         }
1559                 }
1560         }
1561
1562         return distance;
1563 }
1564
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566    given node.  The ips argument is generally the all_ips variable
1567    used in the main part of the algorithm.
1568  */
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570                                   struct public_ip_list *ips,
1571                                   int pnn)
1572 {
1573         struct public_ip_list *t;
1574         uint32_t d;
1575
1576         uint32_t sum = 0;
1577
1578         for (t=ips; t != NULL; t=t->next) {
1579                 if (t->pnn != pnn) {
1580                         continue;
1581                 }
1582
1583                 /* Optimisation: We never calculate the distance
1584                  * between an address and itself.  This allows us to
1585                  * calculate the effect of removing an address from a
1586                  * node by simply calculating the distance between
1587                  * that address and all of the exitsing addresses.
1588                  * Moreover, we assume that we're only ever dealing
1589                  * with addresses from all_ips so we can identify an
1590                  * address via a pointer rather than doing a more
1591                  * expensive address comparison. */
1592                 if (&(t->addr) == ip) {
1593                         continue;
1594                 }
1595
1596                 d = ip_distance(ip, &(t->addr));
1597                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1598         }
1599
1600         return sum;
1601 }
1602
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
1604    to the given node.
1605  */
1606 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1607 {
1608         struct public_ip_list *t;
1609
1610         uint32_t imbalance = 0;
1611
1612         for (t=all_ips; t!=NULL; t=t->next) {
1613                 if (t->pnn != pnn) {
1614                         continue;
1615                 }
1616                 /* Pass the rest of the IPs rather than the whole
1617                    all_ips input list.
1618                 */
1619                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1620         }
1621
1622         return imbalance;
1623 }
1624
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626  * finding the best node for each.
1627  */
1628 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1629                                       struct ctdb_ipflags *ipflags,
1630                                       struct public_ip_list *all_ips)
1631 {
1632         struct public_ip_list *tmp_ip;
1633
1634         /* loop over all ip's and find a physical node to cover for
1635            each unassigned ip.
1636         */
1637         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638                 if (tmp_ip->pnn == -1) {
1639                         if (find_takeover_node(ipalloc_state, ipflags,
1640                                                tmp_ip, all_ips)) {
1641                                 DEBUG(DEBUG_WARNING,
1642                                       ("Failed to find node to cover ip %s\n",
1643                                        ctdb_addr_to_str(&tmp_ip->addr)));
1644                         }
1645                 }
1646         }
1647 }
1648
1649 /* Basic non-deterministic rebalancing algorithm.
1650  */
1651 static void basic_failback(struct ipalloc_state *ipalloc_state,
1652                            struct ctdb_ipflags *ipflags,
1653                            struct public_ip_list *all_ips,
1654                            int num_ips)
1655 {
1656         int i, numnodes;
1657         int maxnode, maxnum, minnode, minnum, num, retries;
1658         struct public_ip_list *tmp_ip;
1659
1660         numnodes = ipalloc_state->num;
1661         retries = 0;
1662
1663 try_again:
1664         maxnum=0;
1665         minnum=0;
1666
1667         /* for each ip address, loop over all nodes that can serve
1668            this ip and make sure that the difference between the node
1669            serving the most and the node serving the least ip's are
1670            not greater than 1.
1671         */
1672         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1673                 if (tmp_ip->pnn == -1) {
1674                         continue;
1675                 }
1676
1677                 /* Get the highest and lowest number of ips's served by any 
1678                    valid node which can serve this ip.
1679                 */
1680                 maxnode = -1;
1681                 minnode = -1;
1682                 for (i=0; i<numnodes; i++) {
1683                         /* only check nodes that can actually serve this ip */
1684                         if (!can_node_takeover_ip(ipalloc_state, i,
1685                                                   ipflags[i], tmp_ip)) {
1686                                 /* no it couldnt   so skip to the next node */
1687                                 continue;
1688                         }
1689
1690                         num = node_ip_coverage(i, all_ips);
1691                         if (maxnode == -1) {
1692                                 maxnode = i;
1693                                 maxnum  = num;
1694                         } else {
1695                                 if (num > maxnum) {
1696                                         maxnode = i;
1697                                         maxnum  = num;
1698                                 }
1699                         }
1700                         if (minnode == -1) {
1701                                 minnode = i;
1702                                 minnum  = num;
1703                         } else {
1704                                 if (num < minnum) {
1705                                         minnode = i;
1706                                         minnum  = num;
1707                                 }
1708                         }
1709                 }
1710                 if (maxnode == -1) {
1711                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1712                                 ctdb_addr_to_str(&tmp_ip->addr)));
1713
1714                         continue;
1715                 }
1716
1717                 /* if the spread between the smallest and largest coverage by
1718                    a node is >=2 we steal one of the ips from the node with
1719                    most coverage to even things out a bit.
1720                    try to do this a limited number of times since we dont
1721                    want to spend too much time balancing the ip coverage.
1722                 */
1723                 if ( (maxnum > minnum+1)
1724                      && (retries < (num_ips + 5)) ){
1725                         struct public_ip_list *tmp;
1726
1727                         /* Reassign one of maxnode's VNNs */
1728                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1729                                 if (tmp->pnn == maxnode) {
1730                                         (void)find_takeover_node(ipalloc_state,
1731                                                                  ipflags,
1732                                                                  tmp,
1733                                                                  all_ips);
1734                                         retries++;
1735                                         goto try_again;;
1736                                 }
1737                         }
1738                 }
1739         }
1740 }
1741
1742 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
1743                       struct ctdb_ipflags *ipflags,
1744                       struct public_ip_list *all_ips,
1745                       uint32_t *force_rebalance_nodes,
1746                       uint32_t **lcp2_imbalances,
1747                       bool **rebalance_candidates)
1748 {
1749         int i, numnodes;
1750         struct public_ip_list *tmp_ip;
1751
1752         numnodes = ipalloc_state->num;
1753
1754         *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
1755         if (*rebalance_candidates == NULL) {
1756                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1757                 return false;
1758         }
1759         *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
1760         if (*lcp2_imbalances == NULL) {
1761                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1762                 return false;
1763         }
1764
1765         for (i=0; i<numnodes; i++) {
1766                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1767                 /* First step: assume all nodes are candidates */
1768                 (*rebalance_candidates)[i] = true;
1769         }
1770
1771         /* 2nd step: if a node has IPs assigned then it must have been
1772          * healthy before, so we remove it from consideration.  This
1773          * is overkill but is all we have because we don't maintain
1774          * state between takeover runs.  An alternative would be to
1775          * keep state and invalidate it every time the recovery master
1776          * changes.
1777          */
1778         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1779                 if (tmp_ip->pnn != -1) {
1780                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1781                 }
1782         }
1783
1784         /* 3rd step: if a node is forced to re-balance then
1785            we allow failback onto the node */
1786         if (force_rebalance_nodes == NULL) {
1787                 return true;
1788         }
1789         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1790                 uint32_t pnn = force_rebalance_nodes[i];
1791                 if (pnn >= numnodes) {
1792                         DEBUG(DEBUG_ERR,
1793                               (__location__ "unknown node %u\n", pnn));
1794                         continue;
1795                 }
1796
1797                 DEBUG(DEBUG_NOTICE,
1798                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1799                 (*rebalance_candidates)[pnn] = true;
1800         }
1801
1802         return true;
1803 }
1804
1805 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1806  * the IP/node combination that will cost the least.
1807  */
1808 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1809                                      struct ctdb_ipflags *ipflags,
1810                                      struct public_ip_list *all_ips,
1811                                      uint32_t *lcp2_imbalances)
1812 {
1813         struct public_ip_list *tmp_ip;
1814         int dstnode, numnodes;
1815
1816         int minnode;
1817         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1818         struct public_ip_list *minip;
1819
1820         bool should_loop = true;
1821         bool have_unassigned = true;
1822
1823         numnodes = ipalloc_state->num;
1824
1825         while (have_unassigned && should_loop) {
1826                 should_loop = false;
1827
1828                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1829                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1830
1831                 minnode = -1;
1832                 mindsum = 0;
1833                 minip = NULL;
1834
1835                 /* loop over each unassigned ip. */
1836                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1837                         if (tmp_ip->pnn != -1) {
1838                                 continue;
1839                         }
1840
1841                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1842                                 /* only check nodes that can actually takeover this ip */
1843                                 if (!can_node_takeover_ip(ipalloc_state,
1844                                                           dstnode,
1845                                                           ipflags[dstnode],
1846                                                           tmp_ip)) {
1847                                         /* no it couldnt   so skip to the next node */
1848                                         continue;
1849                                 }
1850
1851                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1852                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1853                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1854                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1855                                                    dstnode,
1856                                                    dstimbl - lcp2_imbalances[dstnode]));
1857
1858
1859                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1860                                         minnode = dstnode;
1861                                         minimbl = dstimbl;
1862                                         mindsum = dstdsum;
1863                                         minip = tmp_ip;
1864                                         should_loop = true;
1865                                 }
1866                         }
1867                 }
1868
1869                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1870
1871                 /* If we found one then assign it to the given node. */
1872                 if (minnode != -1) {
1873                         minip->pnn = minnode;
1874                         lcp2_imbalances[minnode] = minimbl;
1875                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1876                                           ctdb_addr_to_str(&(minip->addr)),
1877                                           minnode,
1878                                           mindsum));
1879                 }
1880
1881                 /* There might be a better way but at least this is clear. */
1882                 have_unassigned = false;
1883                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1884                         if (tmp_ip->pnn == -1) {
1885                                 have_unassigned = true;
1886                         }
1887                 }
1888         }
1889
1890         /* We know if we have an unassigned addresses so we might as
1891          * well optimise.
1892          */
1893         if (have_unassigned) {
1894                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1895                         if (tmp_ip->pnn == -1) {
1896                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1897                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1898                         }
1899                 }
1900         }
1901 }
1902
1903 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1904  * to move IPs from, determines the best IP/destination node
1905  * combination to move from the source node.
1906  */
1907 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1908                                     struct ctdb_ipflags *ipflags,
1909                                     struct public_ip_list *all_ips,
1910                                     int srcnode,
1911                                     uint32_t *lcp2_imbalances,
1912                                     bool *rebalance_candidates)
1913 {
1914         int dstnode, mindstnode, numnodes;
1915         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1916         uint32_t minsrcimbl, mindstimbl;
1917         struct public_ip_list *minip;
1918         struct public_ip_list *tmp_ip;
1919
1920         /* Find an IP and destination node that best reduces imbalance. */
1921         srcimbl = 0;
1922         minip = NULL;
1923         minsrcimbl = 0;
1924         mindstnode = -1;
1925         mindstimbl = 0;
1926
1927         numnodes = ipalloc_state->num;
1928
1929         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1930         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1931                            srcnode, lcp2_imbalances[srcnode]));
1932
1933         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1934                 /* Only consider addresses on srcnode. */
1935                 if (tmp_ip->pnn != srcnode) {
1936                         continue;
1937                 }
1938
1939                 /* What is this IP address costing the source node? */
1940                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1941                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1942
1943                 /* Consider this IP address would cost each potential
1944                  * destination node.  Destination nodes are limited to
1945                  * those that are newly healthy, since we don't want
1946                  * to do gratuitous failover of IPs just to make minor
1947                  * balance improvements.
1948                  */
1949                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1950                         if (!rebalance_candidates[dstnode]) {
1951                                 continue;
1952                         }
1953
1954                         /* only check nodes that can actually takeover this ip */
1955                         if (!can_node_takeover_ip(ipalloc_state, dstnode,
1956                                                   ipflags[dstnode], tmp_ip)) {
1957                                 /* no it couldnt   so skip to the next node */
1958                                 continue;
1959                         }
1960
1961                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1962                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1963                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1964                                            srcnode, -srcdsum,
1965                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1966                                            dstnode, dstdsum));
1967
1968                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1969                             (dstdsum < srcdsum) &&                      \
1970                             ((mindstnode == -1) ||                              \
1971                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1972
1973                                 minip = tmp_ip;
1974                                 minsrcimbl = srcimbl;
1975                                 mindstnode = dstnode;
1976                                 mindstimbl = dstimbl;
1977                         }
1978                 }
1979         }
1980         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1981
1982         if (mindstnode != -1) {
1983                 /* We found a move that makes things better... */
1984                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1985                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1986                                   ctdb_addr_to_str(&(minip->addr)),
1987                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1988
1989
1990                 lcp2_imbalances[srcnode] = minsrcimbl;
1991                 lcp2_imbalances[mindstnode] = mindstimbl;
1992                 minip->pnn = mindstnode;
1993
1994                 return true;
1995         }
1996
1997         return false;
1998         
1999 }
2000
/* An LCP2 imbalance value together with the node number it belongs
 * to, so that nodes can be sorted by imbalance.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison function producing descending order of
 * imbalance: -1 if a's imbalance is larger than b's, 1 if it is
 * smaller, 0 if they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        return (lhs->imbalance < rhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
2019
2020 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2021  * node with the highest LCP2 imbalance, and then determines the best
2022  * IP/destination node combination to move from the source node.
2023  */
2024 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2025                           struct ctdb_ipflags *ipflags,
2026                           struct public_ip_list *all_ips,
2027                           uint32_t *lcp2_imbalances,
2028                           bool *rebalance_candidates)
2029 {
2030         int i, numnodes;
2031         struct lcp2_imbalance_pnn * lips;
2032         bool again;
2033
2034         numnodes = ipalloc_state->num;
2035
2036 try_again:
2037         /* Put the imbalances and nodes into an array, sort them and
2038          * iterate through candidates.  Usually the 1st one will be
2039          * used, so this doesn't cost much...
2040          */
2041         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2042         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2043         lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2044         for (i=0; i<numnodes; i++) {
2045                 lips[i].imbalance = lcp2_imbalances[i];
2046                 lips[i].pnn = i;
2047                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2048         }
2049         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2050               lcp2_cmp_imbalance_pnn);
2051
2052         again = false;
2053         for (i=0; i<numnodes; i++) {
2054                 /* This means that all nodes had 0 or 1 addresses, so
2055                  * can't be imbalanced.
2056                  */
2057                 if (lips[i].imbalance == 0) {
2058                         break;
2059                 }
2060
2061                 if (lcp2_failback_candidate(ipalloc_state,
2062                                             ipflags,
2063                                             all_ips,
2064                                             lips[i].pnn,
2065                                             lcp2_imbalances,
2066                                             rebalance_candidates)) {
2067                         again = true;
2068                         break;
2069                 }
2070         }
2071
2072         talloc_free(lips);
2073         if (again) {
2074                 goto try_again;
2075         }
2076 }
2077
2078 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
2079                                     struct ctdb_ipflags *ipflags,
2080                                     struct public_ip_list *all_ips)
2081 {
2082         struct public_ip_list *tmp_ip;
2083
2084         /* verify that the assigned nodes can serve that public ip
2085            and set it to -1 if not
2086         */
2087         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2088                 if (tmp_ip->pnn == -1) {
2089                         continue;
2090                 }
2091                 if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
2092                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2093                         /* this node can not serve this ip. */
2094                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2095                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2096                                            tmp_ip->pnn));
2097                         tmp_ip->pnn = -1;
2098                 }
2099         }
2100 }
2101
2102 static bool ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
2103                                        struct ctdb_ipflags *ipflags,
2104                                        struct public_ip_list *all_ips)
2105 {
2106         struct public_ip_list *tmp_ip;
2107         int i, numnodes;
2108
2109         numnodes = ipalloc_state->num;
2110
2111         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2112        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2113         *  always be allocated the same way for a specific set of
2114         *  available/unavailable nodes.
2115         */
2116
2117         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2118                 tmp_ip->pnn = i % numnodes;
2119         }
2120
2121         /* IP failback doesn't make sense with deterministic
2122          * IPs, since the modulo step above implicitly fails
2123          * back IPs to their "home" node.
2124          */
2125         if (1 == ipalloc_state->no_ip_failback) {
2126                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2127         }
2128
2129         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2130
2131         basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2132
2133         /* No failback here! */
2134
2135         return true;
2136 }
2137
2138 static bool ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
2139                                           struct ctdb_ipflags *ipflags,
2140                                           struct public_ip_list *all_ips)
2141 {
2142         /* This should be pushed down into basic_failback. */
2143         struct public_ip_list *tmp_ip;
2144         int num_ips = 0;
2145         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2146                 num_ips++;
2147         }
2148
2149         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2150
2151         basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2152
2153         /* If we don't want IPs to fail back then don't rebalance IPs. */
2154         if (1 == ipalloc_state->no_ip_failback) {
2155                 return true;
2156         }
2157
2158         /* Now, try to make sure the ip adresses are evenly distributed
2159            across the nodes.
2160         */
2161         basic_failback(ipalloc_state, ipflags, all_ips, num_ips);
2162
2163         return true;
2164 }
2165
2166 static bool ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
2167                           struct ctdb_ipflags *ipflags,
2168                           struct public_ip_list *all_ips,
2169                           uint32_t *force_rebalance_nodes)
2170 {
2171         uint32_t *lcp2_imbalances;
2172         bool *rebalance_candidates;
2173         int numnodes, num_rebalance_candidates, i;
2174         bool ret = true;
2175
2176         unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2177
2178         if (!lcp2_init(ipalloc_state, ipflags, all_ips,force_rebalance_nodes,
2179                        &lcp2_imbalances, &rebalance_candidates)) {
2180                 ret = false;
2181                 goto finished;
2182         }
2183
2184         lcp2_allocate_unassigned(ipalloc_state, ipflags, all_ips, lcp2_imbalances);
2185
2186         /* If we don't want IPs to fail back then don't rebalance IPs. */
2187         if (1 == ipalloc_state->no_ip_failback) {
2188                 goto finished;
2189         }
2190
2191         /* It is only worth continuing if we have suitable target
2192          * nodes to transfer IPs to.  This check is much cheaper than
2193          * continuing on...
2194          */
2195         numnodes = ipalloc_state->num;
2196         num_rebalance_candidates = 0;
2197         for (i=0; i<numnodes; i++) {
2198                 if (rebalance_candidates[i]) {
2199                         num_rebalance_candidates++;
2200                 }
2201         }
2202         if (num_rebalance_candidates == 0) {
2203                 goto finished;
2204         }
2205
2206         /* Now, try to make sure the ip adresses are evenly distributed
2207            across the nodes.
2208         */
2209         lcp2_failback(ipalloc_state, ipflags, all_ips,
2210                       lcp2_imbalances, rebalance_candidates);
2211
2212 finished:
2213         return ret;
2214 }
2215
2216 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2217 {
2218         int i;
2219
2220         for (i=0;i<nodemap->num;i++) {
2221                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2222                         /* Found one completely healthy node */
2223                         return false;
2224                 }
2225         }
2226
2227         return true;
2228 }
2229
2230 /* The calculation part of the IP allocation algorithm. */
2231 static bool ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
2232                                    struct ctdb_ipflags *ipflags,
2233                                    struct public_ip_list *all_ips,
2234                                    uint32_t *force_rebalance_nodes)
2235 {
2236         bool ret;
2237
2238         switch (ipalloc_state->algorithm) {
2239         case IPALLOC_LCP2:
2240                 ret = ip_alloc_lcp2(ipalloc_state, ipflags, all_ips,
2241                                     force_rebalance_nodes);
2242                 break;
2243         case IPALLOC_DETERMINISTIC:
2244                 ret = ip_alloc_deterministic_ips(ipalloc_state, ipflags, all_ips);
2245                 break;
2246         case IPALLOC_NONDETERMINISTIC:
2247                 ret = ip_alloc_nondeterministic_ips(ipalloc_state, ipflags, all_ips);
2248                break;
2249         }
2250
2251         /* at this point ->pnn is the node which will own each IP
2252            or -1 if there is no node that can cover this ip
2253         */
2254
2255         return ret;
2256 }
2257
/* State shared between get_tunable_from_nodes() and its reply and
 * failure callbacks.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages */
        const char *tunable;
        /* talloc array, indexed by PNN, receiving each node's value */
        uint32_t *out;
        /* Set by the callbacks when the results must not be used */
        bool fatal;
};
2263
2264 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2265                                  int32_t res, TDB_DATA outdata,
2266                                  void *callback)
2267 {
2268         struct get_tunable_callback_data *cd =
2269                 (struct get_tunable_callback_data *)callback;
2270         int size;
2271
2272         if (res != 0) {
2273                 /* Already handled in fail callback */
2274                 return;
2275         }
2276
2277         if (outdata.dsize != sizeof(uint32_t)) {
2278                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2279                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2280                                  (int)outdata.dsize));
2281                 cd->fatal = true;
2282                 return;
2283         }
2284
2285         size = talloc_array_length(cd->out);
2286         if (pnn >= size) {
2287                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2288                                  cd->tunable, pnn, size));
2289                 return;
2290         }
2291
2292                 
2293         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2294 }
2295
2296 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2297                                        int32_t res, TDB_DATA outdata,
2298                                        void *callback)
2299 {
2300         struct get_tunable_callback_data *cd =
2301                 (struct get_tunable_callback_data *)callback;
2302
2303         switch (res) {
2304         case -ETIME:
2305                 DEBUG(DEBUG_ERR,
2306                       ("Timed out getting tunable \"%s\" from node %d\n",
2307                        cd->tunable, pnn));
2308                 cd->fatal = true;
2309                 break;
2310         case -EINVAL:
2311         case -1:
2312                 DEBUG(DEBUG_WARNING,
2313                       ("Tunable \"%s\" not implemented on node %d\n",
2314                        cd->tunable, pnn));
2315                 break;
2316         default:
2317                 DEBUG(DEBUG_ERR,
2318                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2319                        cd->tunable, pnn));
2320                 cd->fatal = true;
2321         }
2322 }
2323
2324 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2325                                         TALLOC_CTX *tmp_ctx,
2326                                         struct ctdb_node_map_old *nodemap,
2327                                         const char *tunable,
2328                                         uint32_t default_value)
2329 {
2330         TDB_DATA data;
2331         struct ctdb_control_get_tunable *t;
2332         uint32_t *nodes;
2333         uint32_t *tvals;
2334         struct get_tunable_callback_data callback_data;
2335         int i;
2336
2337         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2338         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2339         for (i=0; i<nodemap->num; i++) {
2340                 tvals[i] = default_value;
2341         }
2342                 
2343         callback_data.out = tvals;
2344         callback_data.tunable = tunable;
2345         callback_data.fatal = false;
2346
2347         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2348         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2349         t = (struct ctdb_control_get_tunable *)data.dptr;
2350         t->length = strlen(tunable)+1;
2351         memcpy(t->name, tunable, t->length);
2352         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2353         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2354                                       nodes, 0, TAKEOVER_TIMEOUT(),
2355                                       false, data,
2356                                       get_tunable_callback,
2357                                       get_tunable_fail_callback,
2358                                       &callback_data) != 0) {
2359                 if (callback_data.fatal) {
2360                         talloc_free(tvals);
2361                         tvals = NULL;
2362                 }
2363         }
2364         talloc_free(nodes);
2365         talloc_free(data.dptr);
2366
2367         return tvals;
2368 }
2369
2370 /* Set internal flags for IP allocation:
2371  *   Clear ip flags
2372  *   Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2373  *   Set NOIPHOST ip flag for each INACTIVE node
2374  *   if all nodes are disabled:
2375  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2376  *   else
2377  *     Set NOIPHOST ip flags for disabled nodes
2378  */
2379 static struct ctdb_ipflags *
2380 set_ipflags_internal(struct ipalloc_state *ipalloc_state,
2381                      struct ctdb_node_map_old *nodemap,
2382                      uint32_t *tval_noiptakeover,
2383                      uint32_t *tval_noiphostonalldisabled)
2384 {
2385         int i;
2386         struct ctdb_ipflags *ipflags;
2387
2388         /* Clear IP flags - implicit due to talloc_zero */
2389         ipflags = talloc_zero_array(ipalloc_state, struct ctdb_ipflags, nodemap->num);
2390         if (ipflags == NULL) {
2391                 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
2392                 return NULL;
2393         }
2394
2395         for (i=0;i<nodemap->num;i++) {
2396                 /* Can not take IPs on node with NoIPTakeover set */
2397                 if (tval_noiptakeover[i] != 0) {
2398                         ipflags[i].noiptakeover = true;
2399                 }
2400
2401                 /* Can not host IPs on INACTIVE node */
2402                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2403                         ipflags[i].noiphost = true;
2404                 }
2405         }
2406
2407         if (all_nodes_are_disabled(nodemap)) {
2408                 /* If all nodes are disabled, can not host IPs on node
2409                  * with NoIPHostOnAllDisabled set
2410                  */
2411                 for (i=0;i<nodemap->num;i++) {
2412                         if (tval_noiphostonalldisabled[i] != 0) {
2413                                 ipflags[i].noiphost = true;
2414                         }
2415                 }
2416         } else {
2417                 /* If some nodes are not disabled, then can not host
2418                  * IPs on DISABLED node
2419                  */
2420                 for (i=0;i<nodemap->num;i++) {
2421                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2422                                 ipflags[i].noiphost = true;
2423                         }
2424                 }
2425         }
2426
2427         return ipflags;
2428 }
2429
/*
 * Fetch the NoIPTakeover and NoIPHostOnAllDisabled tunables from the
 * nodes and turn them into the per-node IP flags array.
 *
 * The temporary tunable arrays are allocated on ipalloc_state; they
 * are released here on success.  On failure NULL is returned and any
 * array already fetched is left attached to ipalloc_state for the
 * caller to release along with it.
 */
static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
                                        struct ipalloc_state *ipalloc_state,
                                        struct ctdb_node_map_old *nodemap)
{
        struct ctdb_ipflags *result = NULL;
        uint32_t *no_takeover;
        uint32_t *no_host_all_disabled;

        no_takeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
                                             "NoIPTakeover", 0);
        if (no_takeover != NULL) {
                no_host_all_disabled =
                        get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
                                               "NoIPHostOnAllDisabled", 0);
                if (no_host_all_disabled != NULL) {
                        result = set_ipflags_internal(ipalloc_state, nodemap,
                                                      no_takeover,
                                                      no_host_all_disabled);

                        talloc_free(no_takeover);
                        talloc_free(no_host_all_disabled);
                }
        }

        return result;
}
2462
2463 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2464                                                  TALLOC_CTX *mem_ctx)
2465 {
2466         struct ipalloc_state *ipalloc_state =
2467                 talloc_zero(mem_ctx, struct ipalloc_state);
2468         if (ipalloc_state == NULL) {
2469                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2470                 return NULL;
2471         }
2472
2473         ipalloc_state->num = ctdb->num_nodes;
2474         ipalloc_state->known_public_ips =
2475                 talloc_zero_array(ipalloc_state,
2476                                   struct ctdb_public_ip_list_old *,
2477                                   ipalloc_state->num);
2478         if (ipalloc_state->known_public_ips == NULL) {
2479                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2480                 talloc_free(ipalloc_state);
2481                 return NULL;
2482         }
2483         ipalloc_state->available_public_ips =
2484                 talloc_zero_array(ipalloc_state,
2485                                   struct ctdb_public_ip_list_old *,
2486                                   ipalloc_state->num);
2487         if (ipalloc_state->available_public_ips == NULL) {
2488                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2489                 talloc_free(ipalloc_state);
2490                 return NULL;
2491         }
2492
2493         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2494                 ipalloc_state->algorithm = IPALLOC_LCP2;
2495         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2496                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2497         } else {
2498                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2499         }
2500
2501         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2502
2503         return ipalloc_state;
2504 }
2505
2506 struct iprealloc_callback_data {
2507         bool *retry_nodes;
2508         int retry_count;
2509         client_async_callback fail_callback;
2510         void *fail_callback_data;
2511         struct ctdb_node_map_old *nodemap;
2512 };
2513
2514 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2515                                         int32_t res, TDB_DATA outdata,
2516                                         void *callback)
2517 {
2518         int numnodes;
2519         struct iprealloc_callback_data *cd =
2520                 (struct iprealloc_callback_data *)callback;
2521
2522         numnodes = talloc_array_length(cd->retry_nodes);
2523         if (pnn > numnodes) {
2524                 DEBUG(DEBUG_ERR,
2525                       ("ipreallocated failure from node %d, "
2526                        "but only %d nodes in nodemap\n",
2527                        pnn, numnodes));
2528                 return;
2529         }
2530
2531         /* Can't run the "ipreallocated" event on a INACTIVE node */
2532         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2533                 DEBUG(DEBUG_WARNING,
2534                       ("ipreallocated failed on inactive node %d, ignoring\n",
2535                        pnn));
2536                 return;
2537         }
2538
2539         switch (res) {
2540         case -ETIME:
2541                 /* If the control timed out then that's a real error,
2542                  * so call the real fail callback
2543                  */
2544                 if (cd->fail_callback) {
2545                         cd->fail_callback(ctdb, pnn, res, outdata,
2546                                           cd->fail_callback_data);
2547                 } else {
2548                         DEBUG(DEBUG_WARNING,
2549                               ("iprealloc timed out but no callback registered\n"));
2550                 }
2551                 break;
2552         default:
2553                 /* If not a timeout then either the ipreallocated
2554                  * eventscript (or some setup) failed.  This might
2555                  * have failed because the IPREALLOCATED control isn't
2556                  * implemented - right now there is no way of knowing
2557                  * because the error codes are all folded down to -1.
2558                  * Consider retrying using EVENTSCRIPT control...
2559                  */
2560                 DEBUG(DEBUG_WARNING,
2561                       ("ipreallocated failure from node %d, flagging retry\n",
2562                        pnn));
2563                 cd->retry_nodes[pnn] = true;
2564                 cd->retry_count++;
2565         }
2566 }
2567
2568 struct takeover_callback_data {
2569         bool *node_failed;
2570         client_async_callback fail_callback;
2571         void *fail_callback_data;
2572         struct ctdb_node_map_old *nodemap;
2573 };
2574
2575 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2576                                        uint32_t node_pnn, int32_t res,
2577                                        TDB_DATA outdata, void *callback_data)
2578 {
2579         struct takeover_callback_data *cd =
2580                 talloc_get_type_abort(callback_data,
2581                                       struct takeover_callback_data);
2582         int i;
2583
2584         for (i = 0; i < cd->nodemap->num; i++) {
2585                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2586                         break;
2587                 }
2588         }
2589
2590         if (i == cd->nodemap->num) {
2591                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2592                 return;
2593         }
2594
2595         if (!cd->node_failed[i]) {
2596                 cd->node_failed[i] = true;
2597                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2598                                   cd->fail_callback_data);
2599         }
2600 }
2601
2602 /*
2603   make any IP alias changes for public addresses that are necessary 
2604  */
2605 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2606                       uint32_t *force_rebalance_nodes,
2607                       client_async_callback fail_callback, void *callback_data)
2608 {
2609         int i, j, ret;
2610         struct ctdb_public_ip ip;
2611         uint32_t *nodes;
2612         struct public_ip_list *all_ips, *tmp_ip;
2613         TDB_DATA data;
2614         struct timeval timeout;
2615         struct client_async_data *async_data;
2616         struct ctdb_client_control_state *state;
2617         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2618         struct ctdb_ipflags *ipflags;
2619         struct ipalloc_state *ipalloc_state;
2620         struct takeover_callback_data *takeover_data;
2621         struct iprealloc_callback_data iprealloc_data;
2622         bool *retry_data;
2623         bool can_host_ips;
2624
2625         /*
2626          * ip failover is completely disabled, just send out the 
2627          * ipreallocated event.
2628          */
2629         if (ctdb->tunable.disable_ip_failover != 0) {
2630                 goto ipreallocated;
2631         }
2632
2633         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2634         if (ipalloc_state == NULL) {
2635                 talloc_free(tmp_ctx);
2636                 return -1;
2637         }
2638
2639         ipflags = set_ipflags(ctdb, ipalloc_state, nodemap);
2640         if (ipflags == NULL) {
2641                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2642                 talloc_free(tmp_ctx);
2643                 return -1;
2644         }
2645
2646         /* Fetch known/available public IPs from each active node */
2647         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2648         if (ret != 0) {
2649                 talloc_free(tmp_ctx);
2650                 return -1;
2651         }
2652
2653         /* Short-circuit IP allocation if no node has available IPs */
2654         can_host_ips = false;
2655         for (i=0; i < ipalloc_state->num; i++) {
2656                 if (ipalloc_state->available_public_ips[i] != NULL) {
2657                         can_host_ips = true;
2658                 }
2659         }
2660         if (!can_host_ips) {
2661                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2662                 return 0;
2663         }
2664
2665         /* since nodes only know about those public addresses that
2666            can be served by that particular node, no single node has
2667            a full list of all public addresses that exist in the cluster.
2668            Walk over all node structures and create a merged list of
2669            all public addresses that exist in the cluster.
2670
2671            keep the tree of ips around as ctdb->ip_tree
2672         */
2673         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2674
2675         /* Do the IP reassignment calculations */
2676         ctdb_takeover_run_core(ipalloc_state, ipflags,
2677                                all_ips, force_rebalance_nodes);
2678
2679         /* Now tell all nodes to release any public IPs should not
2680          * host.  This will be a NOOP on nodes that don't currently
2681          * hold the given IP.
2682          */
2683         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2684         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2685
2686         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2687                                                        bool, nodemap->num);
2688         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2689         takeover_data->fail_callback = fail_callback;
2690         takeover_data->fail_callback_data = callback_data;
2691         takeover_data->nodemap = nodemap;
2692
2693         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2694         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2695
2696         async_data->fail_callback = takeover_run_fail_callback;
2697         async_data->callback_data = takeover_data;
2698
2699         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2700
2701         /* Send a RELEASE_IP to all nodes that should not be hosting
2702          * each IP.  For each IP, all but one of these will be
2703          * redundant.  However, the redundant ones are used to tell
2704          * nodes which node should be hosting the IP so that commands
2705          * like "ctdb ip" can display a particular nodes idea of who
2706          * is hosting what. */
2707         for (i=0;i<nodemap->num;i++) {
2708                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2709                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2710                         continue;
2711                 }
2712
2713                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2714                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2715                                 /* This node should be serving this
2716                                    vnn so don't tell it to release the ip
2717                                 */
2718                                 continue;
2719                         }
2720                         ip.pnn  = tmp_ip->pnn;
2721                         ip.addr = tmp_ip->addr;
2722
2723                         timeout = TAKEOVER_TIMEOUT();
2724                         data.dsize = sizeof(ip);
2725                         data.dptr  = (uint8_t *)&ip;
2726                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2727                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2728                                                   data, async_data,
2729                                                   &timeout, NULL);
2730                         if (state == NULL) {
2731                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2732                                 talloc_free(tmp_ctx);
2733                                 return -1;
2734                         }
2735
2736                         ctdb_client_async_add(async_data, state);
2737                 }
2738         }
2739         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2740                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2741                 talloc_free(tmp_ctx);
2742                 return -1;
2743         }
2744         talloc_free(async_data);
2745
2746
2747         /* For each IP, send a TAKOVER_IP to the node that should be
2748          * hosting it.  Many of these will often be redundant (since
2749          * the allocation won't have changed) but they can be useful
2750          * to recover from inconsistencies. */
2751         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2752         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2753
2754         async_data->fail_callback = fail_callback;
2755         async_data->callback_data = callback_data;
2756
2757         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2758                 if (tmp_ip->pnn == -1) {
2759                         /* this IP won't be taken over */
2760                         continue;
2761                 }
2762
2763                 ip.pnn  = tmp_ip->pnn;
2764                 ip.addr = tmp_ip->addr;
2765
2766                 timeout = TAKEOVER_TIMEOUT();
2767                 data.dsize = sizeof(ip);
2768                 data.dptr  = (uint8_t *)&ip;
2769                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2770                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2771                                           data, async_data, &timeout, NULL);
2772                 if (state == NULL) {
2773                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2774                         talloc_free(tmp_ctx);
2775                         return -1;
2776                 }
2777
2778                 ctdb_client_async_add(async_data, state);
2779         }
2780         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2781                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2782                 talloc_free(tmp_ctx);
2783                 return -1;
2784         }
2785
2786 ipreallocated:
2787         /*
2788          * Tell all nodes to run eventscripts to process the
2789          * "ipreallocated" event.  This can do a lot of things,
2790          * including restarting services to reconfigure them if public
2791          * IPs have moved.  Once upon a time this event only used to
2792          * update natgw.
2793          */
2794         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2795         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2796         iprealloc_data.retry_nodes = retry_data;
2797         iprealloc_data.retry_count = 0;
2798         iprealloc_data.fail_callback = fail_callback;
2799         iprealloc_data.fail_callback_data = callback_data;
2800         iprealloc_data.nodemap = nodemap;
2801
2802         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2803         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2804                                         nodes, 0, TAKEOVER_TIMEOUT(),
2805                                         false, tdb_null,
2806                                         NULL, iprealloc_fail_callback,
2807                                         &iprealloc_data);
2808         if (ret != 0) {
2809                 /* If the control failed then we should retry to any
2810                  * nodes flagged by iprealloc_fail_callback using the
2811                  * EVENTSCRIPT control.  This is a best-effort at
2812                  * backward compatiblity when running a mixed cluster
2813                  * where some nodes have not yet been upgraded to
2814                  * support the IPREALLOCATED control.
2815                  */
2816                 DEBUG(DEBUG_WARNING,
2817                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2818
2819                 nodes = talloc_array(tmp_ctx, uint32_t,
2820                                      iprealloc_data.retry_count);
2821                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2822
2823                 j = 0;
2824                 for (i=0; i<nodemap->num; i++) {
2825                         if (iprealloc_data.retry_nodes[i]) {
2826                                 nodes[j] = i;
2827                                 j++;
2828                         }
2829                 }
2830
2831                 data.dptr  = discard_const("ipreallocated");
2832                 data.dsize = strlen((char *)data.dptr) + 1; 
2833                 ret = ctdb_client_async_control(ctdb,
2834                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2835                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2836                                                 false, data,
2837                                                 NULL, fail_callback,
2838                                                 callback_data);
2839                 if (ret != 0) {
2840                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2841                 }
2842         }
2843
2844         talloc_free(tmp_ctx);
2845         return ret;
2846 }
2847
2848
2849 /*
2850   destroy a ctdb_client_ip structure
2851  */
2852 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2853 {
2854         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2855                 ctdb_addr_to_str(&ip->addr),
2856                 ntohs(ip->addr.ip.sin_port),
2857                 ip->client_id));
2858
2859         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2860         return 0;
2861 }
2862
2863 /*
2864   called by a client to inform us of a TCP connection that it is managing
2865   that should tickled with an ACK when IP takeover is done
2866  */
2867 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2868                                 TDB_DATA indata)
2869 {
2870         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2871         struct ctdb_connection *tcp_sock = NULL;
2872         struct ctdb_tcp_list *tcp;
2873         struct ctdb_connection t;
2874         int ret;
2875         TDB_DATA data;
2876         struct ctdb_client_ip *ip;
2877         struct ctdb_vnn *vnn;
2878         ctdb_sock_addr addr;
2879
2880         /* If we don't have public IPs, tickles are useless */
2881         if (ctdb->vnn == NULL) {
2882                 return 0;
2883         }
2884
2885         tcp_sock = (struct ctdb_connection *)indata.dptr;
2886
2887         addr = tcp_sock->src;
2888         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2889         addr = tcp_sock->dst;
2890         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2891
2892         ZERO_STRUCT(addr);
2893         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2894         vnn = find_public_ip_vnn(ctdb, &addr);
2895         if (vnn == NULL) {
2896                 switch (addr.sa.sa_family) {
2897                 case AF_INET:
2898                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2899                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2900                                         ctdb_addr_to_str(&addr)));
2901                         }
2902                         break;
2903                 case AF_INET6:
2904                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2905                                 ctdb_addr_to_str(&addr)));
2906                         break;
2907                 default:
2908                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2909                 }
2910
2911                 return 0;
2912         }
2913
2914         if (vnn->pnn != ctdb->pnn) {
2915                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2916                         ctdb_addr_to_str(&addr),
2917                         client_id, client->pid));
2918                 /* failing this call will tell smbd to die */
2919                 return -1;
2920         }
2921
2922         ip = talloc(client, struct ctdb_client_ip);
2923         CTDB_NO_MEMORY(ctdb, ip);
2924
2925         ip->ctdb      = ctdb;
2926         ip->addr      = addr;
2927         ip->client_id = client_id;
2928         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2929         DLIST_ADD(ctdb->client_ip_list, ip);
2930
2931         tcp = talloc(client, struct ctdb_tcp_list);
2932         CTDB_NO_MEMORY(ctdb, tcp);
2933
2934         tcp->connection.src = tcp_sock->src;
2935         tcp->connection.dst = tcp_sock->dst;
2936
2937         DLIST_ADD(client->tcp_list, tcp);
2938
2939         t.src = tcp_sock->src;
2940         t.dst = tcp_sock->dst;
2941
2942         data.dptr = (uint8_t *)&t;
2943         data.dsize = sizeof(t);
2944
2945         switch (addr.sa.sa_family) {
2946         case AF_INET:
2947                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2948                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2949                         ctdb_addr_to_str(&tcp_sock->src),
2950                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2951                 break;
2952         case AF_INET6:
2953                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2954                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2955                         ctdb_addr_to_str(&tcp_sock->src),
2956                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2957                 break;
2958         default:
2959                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2960         }
2961
2962
2963         /* tell all nodes about this tcp connection */
2964         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2965                                        CTDB_CONTROL_TCP_ADD,
2966                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2967         if (ret != 0) {
2968                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2969                 return -1;
2970         }
2971
2972         return 0;
2973 }
2974
2975 /*
2976   find a tcp address on a list
2977  */
2978 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2979                                            struct ctdb_connection *tcp)
2980 {
2981         int i;
2982
2983         if (array == NULL) {
2984                 return NULL;
2985         }
2986
2987         for (i=0;i<array->num;i++) {
2988                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2989                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2990                         return &array->connections[i];
2991                 }
2992         }
2993         return NULL;
2994 }
2995
2996
2997
2998 /*
2999   called by a daemon to inform us of a TCP connection that one of its
3000   clients managing that should tickled with an ACK when IP takeover is
3001   done
3002  */
3003 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3004 {
3005         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
3006         struct ctdb_tcp_array *tcparray;
3007         struct ctdb_connection tcp;
3008         struct ctdb_vnn *vnn;
3009
3010         /* If we don't have public IPs, tickles are useless */
3011         if (ctdb->vnn == NULL) {
3012                 return 0;
3013         }
3014
3015         vnn = find_public_ip_vnn(ctdb, &p->dst);
3016         if (vnn == NULL) {
3017                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3018                         ctdb_addr_to_str(&p->dst)));
3019
3020                 return -1;
3021         }
3022
3023
3024         tcparray = vnn->tcp_array;
3025
3026         /* If this is the first tickle */
3027         if (tcparray == NULL) {
3028                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3029                 CTDB_NO_MEMORY(ctdb, tcparray);
3030                 vnn->tcp_array = tcparray;
3031
3032                 tcparray->num = 0;
3033                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3034                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3035
3036                 tcparray->connections[tcparray->num].src = p->src;
3037                 tcparray->connections[tcparray->num].dst = p->dst;
3038                 tcparray->num++;
3039
3040                 if (tcp_update_needed) {
3041                         vnn->tcp_update_needed = true;
3042                 }
3043                 return 0;
3044         }
3045
3046
3047         /* Do we already have this tickle ?*/
3048         tcp.src = p->src;
3049         tcp.dst = p->dst;
3050         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3051                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3052                         ctdb_addr_to_str(&tcp.dst),
3053                         ntohs(tcp.dst.ip.sin_port),
3054                         vnn->pnn));
3055                 return 0;
3056         }
3057
3058         /* A new tickle, we must add it to the array */
3059         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3060                                         struct ctdb_connection,
3061                                         tcparray->num+1);
3062         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3063
3064         tcparray->connections[tcparray->num].src = p->src;
3065         tcparray->connections[tcparray->num].dst = p->dst;
3066         tcparray->num++;
3067
3068         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3069                 ctdb_addr_to_str(&tcp.dst),
3070                 ntohs(tcp.dst.ip.sin_port),
3071                 vnn->pnn));
3072
3073         if (tcp_update_needed) {
3074                 vnn->tcp_update_needed = true;
3075         }
3076
3077         return 0;
3078 }
3079
3080
3081 /*
3082   called by a daemon to inform us of a TCP connection that one of its
3083   clients managing that should tickled with an ACK when IP takeover is
3084   done
3085  */
3086 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3087 {
3088         struct ctdb_connection *tcpp;
3089         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3090
3091         if (vnn == NULL) {
3092                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3093                         ctdb_addr_to_str(&conn->dst)));
3094                 return;
3095         }
3096
3097         /* if the array is empty we cant remove it
3098            and we don't need to do anything
3099          */
3100         if (vnn->tcp_array == NULL) {
3101                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3102                         ctdb_addr_to_str(&conn->dst),
3103                         ntohs(conn->dst.ip.sin_port)));
3104                 return;
3105         }
3106
3107
3108         /* See if we know this connection
3109            if we don't know this connection  then we dont need to do anything
3110          */
3111         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3112         if (tcpp == NULL) {
3113                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3114                         ctdb_addr_to_str(&conn->dst),
3115                         ntohs(conn->dst.ip.sin_port)));
3116                 return;
3117         }
3118
3119
3120         /* We need to remove this entry from the array.
3121            Instead of allocating a new array and copying data to it
3122            we cheat and just copy the last entry in the existing array
3123            to the entry that is to be removed and just shring the 
3124            ->num field
3125          */
3126         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3127         vnn->tcp_array->num--;
3128
3129         /* If we deleted the last entry we also need to remove the entire array
3130          */
3131         if (vnn->tcp_array->num == 0) {
3132                 talloc_free(vnn->tcp_array);
3133                 vnn->tcp_array = NULL;
3134         }               
3135
3136         vnn->tcp_update_needed = true;
3137
3138         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3139                 ctdb_addr_to_str(&conn->src),
3140                 ntohs(conn->src.ip.sin_port)));
3141 }
3142
3143
3144 /*
3145   called by a daemon to inform us of a TCP connection that one of its
3146   clients used are no longer needed in the tickle database
3147  */
3148 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3149 {
3150         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3151
3152         /* If we don't have public IPs, tickles are useless */
3153         if (ctdb->vnn == NULL) {
3154                 return 0;
3155         }
3156
3157         ctdb_remove_connection(ctdb, conn);
3158
3159         return 0;
3160 }
3161
3162
3163 /*
3164   Called when another daemon starts - causes all tickles for all
3165   public addresses we are serving to be sent to the new node on the
3166   next check.  This actually causes the next scheduled call to
3167   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3168   doesn't require careful error handling.
3169  */
3170 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3171 {
3172         struct ctdb_vnn *vnn;
3173
3174         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3175                            (unsigned long) pnn));
3176
3177         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3178                 vnn->tcp_update_needed = true;
3179         }
3180
3181         return 0;
3182 }
3183
3184
3185 /*
3186   called when a client structure goes away - hook to remove
3187   elements from the tcp_list in all daemons
3188  */
3189 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3190 {
3191         while (client->tcp_list) {
3192                 struct ctdb_tcp_list *tcp = client->tcp_list;
3193                 DLIST_REMOVE(client->tcp_list, tcp);
3194                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3195         }
3196 }
3197
3198
3199 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3200 {
3201         struct ctdb_vnn *vnn;
3202         int count = 0;
3203
3204         if (ctdb->tunable.disable_ip_failover == 1) {
3205                 return;
3206         }
3207
3208         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3209                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3210                         ctdb_vnn_unassign_iface(ctdb, vnn);
3211                         continue;
3212                 }
3213                 if (!vnn->iface) {
3214                         continue;
3215                 }
3216
3217                 /* Don't allow multiple releases at once.  Some code,
3218                  * particularly ctdb_tickle_sentenced_connections() is
3219                  * not re-entrant */
3220                 if (vnn->update_in_flight) {
3221                         DEBUG(DEBUG_WARNING,
3222                               (__location__
3223                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3224                                     ctdb_addr_to_str(&vnn->public_address),
3225                                     vnn->public_netmask_bits,
3226                                     ctdb_vnn_iface_string(vnn)));
3227                         continue;
3228                 }
3229                 vnn->update_in_flight = true;
3230
3231                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3232                                     ctdb_addr_to_str(&vnn->public_address),
3233                                     vnn->public_netmask_bits,
3234                                     ctdb_vnn_iface_string(vnn)));
3235
3236                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3237                                   ctdb_vnn_iface_string(vnn),
3238                                   ctdb_addr_to_str(&vnn->public_address),
3239                                   vnn->public_netmask_bits);
3240                 release_kill_clients(ctdb, &vnn->public_address);
3241                 ctdb_vnn_unassign_iface(ctdb, vnn);
3242                 vnn->update_in_flight = false;
3243                 count++;
3244         }
3245
3246         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3247 }
3248
3249
3250 /*
3251   get list of public IPs
3252  */
3253 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3254                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3255 {
3256         int i, num, len;
3257         struct ctdb_public_ip_list_old *ips;
3258         struct ctdb_vnn *vnn;
3259         bool only_available = false;
3260
3261         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3262                 only_available = true;
3263         }
3264
3265         /* count how many public ip structures we have */
3266         num = 0;
3267         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3268                 num++;
3269         }
3270
3271         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3272                 num*sizeof(struct ctdb_public_ip);
3273         ips = talloc_zero_size(outdata, len);
3274         CTDB_NO_MEMORY(ctdb, ips);
3275
3276         i = 0;
3277         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3278                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3279                         continue;
3280                 }
3281                 ips->ips[i].pnn  = vnn->pnn;
3282                 ips->ips[i].addr = vnn->public_address;
3283                 i++;
3284         }
3285         ips->num = i;
3286         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3287                 i*sizeof(struct ctdb_public_ip);
3288
3289         outdata->dsize = len;
3290         outdata->dptr  = (uint8_t *)ips;
3291
3292         return 0;
3293 }
3294
3295
3296 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3297                                         struct ctdb_req_control_old *c,
3298                                         TDB_DATA indata,
3299                                         TDB_DATA *outdata)
3300 {
3301         int i, num, len;
3302         ctdb_sock_addr *addr;
3303         struct ctdb_public_ip_info_old *info;
3304         struct ctdb_vnn *vnn;
3305
3306         addr = (ctdb_sock_addr *)indata.dptr;
3307
3308         vnn = find_public_ip_vnn(ctdb, addr);
3309         if (vnn == NULL) {
3310                 /* if it is not a public ip   it could be our 'single ip' */
3311                 if (ctdb->single_ip_vnn) {
3312                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3313                                 vnn = ctdb->single_ip_vnn;
3314                         }
3315                 }
3316         }
3317         if (vnn == NULL) {
3318                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3319                                  "'%s'not a public address\n",
3320                                  ctdb_addr_to_str(addr)));
3321                 return -1;
3322         }
3323
3324         /* count how many public ip structures we have */
3325         num = 0;
3326         for (;vnn->ifaces[num];) {
3327                 num++;
3328         }
3329
3330         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3331                 num*sizeof(struct ctdb_iface);
3332         info = talloc_zero_size(outdata, len);
3333         CTDB_NO_MEMORY(ctdb, info);
3334
3335         info->ip.addr = vnn->public_address;
3336         info->ip.pnn = vnn->pnn;
3337         info->active_idx = 0xFFFFFFFF;
3338
3339         for (i=0; vnn->ifaces[i]; i++) {
3340                 struct ctdb_interface *cur;
3341
3342                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3343                 if (cur == NULL) {
3344                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3345                                            vnn->ifaces[i]));
3346                         return -1;
3347                 }
3348                 if (vnn->iface == cur) {
3349                         info->active_idx = i;
3350                 }
3351                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3352                 info->ifaces[i].link_state = cur->link_up;
3353                 info->ifaces[i].references = cur->references;
3354         }
3355         info->num = i;
3356         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3357                 i*sizeof(struct ctdb_iface);
3358
3359         outdata->dsize = len;
3360         outdata->dptr  = (uint8_t *)info;
3361
3362         return 0;
3363 }
3364
3365 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3366                                 struct ctdb_req_control_old *c,
3367                                 TDB_DATA *outdata)
3368 {
3369         int i, num, len;
3370         struct ctdb_iface_list_old *ifaces;
3371         struct ctdb_interface *cur;
3372
3373         /* count how many public ip structures we have */
3374         num = 0;
3375         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3376                 num++;
3377         }
3378
3379         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3380                 num*sizeof(struct ctdb_iface);
3381         ifaces = talloc_zero_size(outdata, len);
3382         CTDB_NO_MEMORY(ctdb, ifaces);
3383
3384         i = 0;
3385         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3386                 strcpy(ifaces->ifaces[i].name, cur->name);
3387                 ifaces->ifaces[i].link_state = cur->link_up;
3388                 ifaces->ifaces[i].references = cur->references;
3389                 i++;
3390         }
3391         ifaces->num = i;
3392         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3393                 i*sizeof(struct ctdb_iface);
3394
3395         outdata->dsize = len;
3396         outdata->dptr  = (uint8_t *)ifaces;
3397
3398         return 0;
3399 }
3400
3401 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3402                                     struct ctdb_req_control_old *c,
3403                                     TDB_DATA indata)
3404 {
3405         struct ctdb_iface *info;
3406         struct ctdb_interface *iface;
3407         bool link_up = false;
3408
3409         info = (struct ctdb_iface *)indata.dptr;
3410
3411         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3412                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3413                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3414                                   len, len, info->name));
3415                 return -1;
3416         }
3417
3418         switch (info->link_state) {
3419         case 0:
3420                 link_up = false;
3421                 break;
3422         case 1:
3423                 link_up = true;
3424                 break;
3425         default:
3426                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3427                                   (unsigned int)info->link_state));
3428                 return -1;
3429         }
3430
3431         if (info->references != 0) {
3432                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3433                                   (unsigned int)info->references));
3434                 return -1;
3435         }
3436
3437         iface = ctdb_find_iface(ctdb, info->name);
3438         if (iface == NULL) {
3439                 return -1;
3440         }
3441
3442         if (link_up == iface->link_up) {
3443                 return 0;
3444         }
3445
3446         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3447               ("iface[%s] has changed it's link status %s => %s\n",
3448                iface->name,
3449                iface->link_up?"up":"down",
3450                link_up?"up":"down"));
3451
3452         iface->link_up = link_up;
3453         return 0;
3454 }
3455
3456
3457 /* 
3458    structure containing the listening socket and the list of tcp connections
3459    that the ctdb daemon is to kill
3460 */
3461 struct ctdb_kill_tcp {
3462         struct ctdb_vnn *vnn;
3463         struct ctdb_context *ctdb;
3464         int capture_fd;
3465         struct tevent_fd *fde;
3466         trbt_tree_t *connections;
3467         void *private_data;
3468 };
3469
3470 /*
3471   a tcp connection that is to be killed
3472  */
3473 struct ctdb_killtcp_con {
3474         ctdb_sock_addr src_addr;
3475         ctdb_sock_addr dst_addr;
3476         int count;
3477         struct ctdb_kill_tcp *killtcp;
3478 };
3479
3480 /* this function is used to create a key to represent this socketpair
3481    in the killtcp tree.
3482    this key is used to insert and lookup matching socketpairs that are
3483    to be tickled and RST
3484 */
3485 #define KILLTCP_KEYLEN  10
3486 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3487 {
3488         static uint32_t key[KILLTCP_KEYLEN];
3489
3490         bzero(key, sizeof(key));
3491
3492         if (src->sa.sa_family != dst->sa.sa_family) {
3493                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3494                 return key;
3495         }
3496         
3497         switch (src->sa.sa_family) {
3498         case AF_INET:
3499                 key[0]  = dst->ip.sin_addr.s_addr;
3500                 key[1]  = src->ip.sin_addr.s_addr;
3501                 key[2]  = dst->ip.sin_port;
3502                 key[3]  = src->ip.sin_port;
3503                 break;
3504         case AF_INET6: {
3505                 uint32_t *dst6_addr32 =
3506                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3507                 uint32_t *src6_addr32 =
3508                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3509                 key[0]  = dst6_addr32[3];
3510                 key[1]  = src6_addr32[3];
3511                 key[2]  = dst6_addr32[2];
3512                 key[3]  = src6_addr32[2];
3513                 key[4]  = dst6_addr32[1];
3514                 key[5]  = src6_addr32[1];
3515                 key[6]  = dst6_addr32[0];
3516                 key[7]  = src6_addr32[0];
3517                 key[8]  = dst->ip6.sin6_port;
3518                 key[9]  = src->ip6.sin6_port;
3519                 break;
3520         }
3521         default:
3522                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3523                 return key;
3524         }
3525
3526         return key;
3527 }
3528
3529 /*
3530   called when we get a read event on the raw socket
3531  */
3532 static void capture_tcp_handler(struct tevent_context *ev,
3533                                 struct tevent_fd *fde,
3534                                 uint16_t flags, void *private_data)
3535 {
3536         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3537         struct ctdb_killtcp_con *con;
3538         ctdb_sock_addr src, dst;
3539         uint32_t ack_seq, seq;
3540
3541         if (!(flags & TEVENT_FD_READ)) {
3542                 return;
3543         }
3544
3545         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3546                                 killtcp->private_data,
3547                                 &src, &dst,
3548                                 &ack_seq, &seq) != 0) {
3549                 /* probably a non-tcp ACK packet */
3550                 return;
3551         }
3552
3553         /* check if we have this guy in our list of connections
3554            to kill
3555         */
3556         con = trbt_lookuparray32(killtcp->connections, 
3557                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3558         if (con == NULL) {
3559                 /* no this was some other packet we can just ignore */
3560                 return;
3561         }
3562
3563         /* This one has been tickled !
3564            now reset him and remove him from the list.
3565          */
3566         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3567                 ntohs(con->dst_addr.ip.sin_port),
3568                 ctdb_addr_to_str(&con->src_addr),
3569                 ntohs(con->src_addr.ip.sin_port)));
3570
3571         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3572         talloc_free(con);
3573 }
3574
3575
3576 /* when traversing the list of all tcp connections to send tickle acks to
3577    (so that we can capture the ack coming back and kill the connection
3578     by a RST)
3579    this callback is called for each connection we are currently trying to kill
3580 */
3581 static int tickle_connection_traverse(void *param, void *data)
3582 {
3583         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3584
3585         /* have tried too many times, just give up */
3586         if (con->count >= 5) {
3587                 /* can't delete in traverse: reparent to delete_cons */
3588                 talloc_steal(param, con);
3589                 return 0;
3590         }
3591
3592         /* othervise, try tickling it again */
3593         con->count++;
3594         ctdb_sys_send_tcp(
3595                 (ctdb_sock_addr *)&con->dst_addr,
3596                 (ctdb_sock_addr *)&con->src_addr,
3597                 0, 0, 0);
3598         return 0;
3599 }
3600
3601
3602 /* 
3603    called every second until all sentenced connections have been reset
3604  */
3605 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3606                                               struct tevent_timer *te,
3607                                               struct timeval t, void *private_data)
3608 {
3609         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3610         void *delete_cons = talloc_new(NULL);
3611
3612         /* loop over all connections sending tickle ACKs */
3613         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3614
3615         /* now we've finished traverse, it's safe to do deletion. */
3616         talloc_free(delete_cons);
3617
3618         /* If there are no more connections to kill we can remove the
3619            entire killtcp structure
3620          */
3621         if ( (killtcp->connections == NULL) || 
3622              (killtcp->connections->root == NULL) ) {
3623                 talloc_free(killtcp);
3624                 return;
3625         }
3626
3627         /* try tickling them again in a seconds time
3628          */
3629         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3630                          timeval_current_ofs(1, 0),
3631                          ctdb_tickle_sentenced_connections, killtcp);
3632 }
3633
3634 /*
3635   destroy the killtcp structure
3636  */
3637 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3638 {
3639         struct ctdb_vnn *tmpvnn;
3640
3641         /* verify that this vnn is still active */
3642         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3643                 if (tmpvnn == killtcp->vnn) {
3644                         break;
3645                 }
3646         }
3647
3648         if (tmpvnn == NULL) {
3649                 return 0;
3650         }
3651
3652         if (killtcp->vnn->killtcp != killtcp) {
3653                 return 0;
3654         }
3655
3656         killtcp->vnn->killtcp = NULL;
3657
3658         return 0;
3659 }
3660
3661
/* Insert callback for the killtcp tree: the new connection structure
   (parm) unconditionally replaces whatever was stored before (data).

   The old structure is deliberately not freed here; it is talloc
   stolen by the same tree node and goes away together with the new
   data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
3673
3674 /*
3675   add a tcp socket to the list of connections we want to RST
3676  */
3677 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3678                                        ctdb_sock_addr *s,
3679                                        ctdb_sock_addr *d)
3680 {
3681         ctdb_sock_addr src, dst;
3682         struct ctdb_kill_tcp *killtcp;
3683         struct ctdb_killtcp_con *con;
3684         struct ctdb_vnn *vnn;
3685
3686         ctdb_canonicalize_ip(s, &src);
3687         ctdb_canonicalize_ip(d, &dst);
3688
3689         vnn = find_public_ip_vnn(ctdb, &dst);
3690         if (vnn == NULL) {
3691                 vnn = find_public_ip_vnn(ctdb, &src);
3692         }
3693         if (vnn == NULL) {
3694                 /* if it is not a public ip   it could be our 'single ip' */
3695                 if (ctdb->single_ip_vnn) {
3696                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3697                                 vnn = ctdb->single_ip_vnn;
3698                         }
3699                 }
3700         }
3701         if (vnn == NULL) {
3702                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3703                 return -1;
3704         }
3705
3706         killtcp = vnn->killtcp;
3707         
3708         /* If this is the first connection to kill we must allocate
3709            a new structure
3710          */
3711         if (killtcp == NULL) {
3712                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3713                 CTDB_NO_MEMORY(ctdb, killtcp);
3714
3715                 killtcp->vnn         = vnn;
3716                 killtcp->ctdb        = ctdb;
3717                 killtcp->capture_fd  = -1;
3718                 killtcp->connections = trbt_create(killtcp, 0);
3719
3720                 vnn->killtcp         = killtcp;
3721                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3722         }
3723
3724
3725
3726         /* create a structure that describes this connection we want to
3727            RST and store it in killtcp->connections
3728         */
3729         con = talloc(killtcp, struct ctdb_killtcp_con);
3730         CTDB_NO_MEMORY(ctdb, con);
3731         con->src_addr = src;
3732         con->dst_addr = dst;
3733         con->count    = 0;
3734         con->killtcp  = killtcp;
3735
3736
3737         trbt_insertarray32_callback(killtcp->connections,
3738                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3739                         add_killtcp_callback, con);
3740
3741         /* 
3742            If we don't have a socket to listen on yet we must create it
3743          */
3744         if (killtcp->capture_fd == -1) {
3745                 const char *iface = ctdb_vnn_iface_string(vnn);
3746                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3747                 if (killtcp->capture_fd == -1) {
3748                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3749                                           "socket on iface '%s' for killtcp (%s)\n",
3750                                           iface, strerror(errno)));
3751                         goto failed;
3752                 }
3753         }
3754
3755
3756         if (killtcp->fde == NULL) {
3757                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3758                                              killtcp->capture_fd,
3759                                              TEVENT_FD_READ,
3760                                              capture_tcp_handler, killtcp);
3761                 tevent_fd_set_auto_close(killtcp->fde);
3762
3763                 /* We also need to set up some events to tickle all these connections
3764                    until they are all reset
3765                 */
3766                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3767                                  ctdb_tickle_sentenced_connections, killtcp);
3768         }
3769
3770         /* tickle him once now */
3771         ctdb_sys_send_tcp(
3772                 &con->dst_addr,
3773                 &con->src_addr,
3774                 0, 0, 0);
3775
3776         return 0;
3777
3778 failed:
3779         talloc_free(vnn->killtcp);
3780         vnn->killtcp = NULL;
3781         return -1;
3782 }
3783
3784 /*
3785   kill a TCP connection.
3786  */
3787 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3788 {
3789         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3790
3791         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3792 }
3793
3794 /*
3795   called by a daemon to inform us of the entire list of TCP tickles for
3796   a particular public address.
3797   this control should only be sent by the node that is currently serving
3798   that public address.
3799  */
3800 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3801 {
3802         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3803         struct ctdb_tcp_array *tcparray;
3804         struct ctdb_vnn *vnn;
3805
3806         /* We must at least have tickles.num or else we cant verify the size
3807            of the received data blob
3808          */
3809         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3810                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3811                 return -1;
3812         }
3813
3814         /* verify that the size of data matches what we expect */
3815         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3816                          + sizeof(struct ctdb_connection) * list->num) {
3817                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3818                 return -1;
3819         }
3820
3821         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3822                            ctdb_addr_to_str(&list->addr)));
3823
3824         vnn = find_public_ip_vnn(ctdb, &list->addr);
3825         if (vnn == NULL) {
3826                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3827                         ctdb_addr_to_str(&list->addr)));
3828
3829                 return 1;
3830         }
3831
3832         /* remove any old ticklelist we might have */
3833         talloc_free(vnn->tcp_array);
3834         vnn->tcp_array = NULL;
3835
3836         tcparray = talloc(vnn, struct ctdb_tcp_array);
3837         CTDB_NO_MEMORY(ctdb, tcparray);
3838
3839         tcparray->num = list->num;
3840
3841         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3842         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3843
3844         memcpy(tcparray->connections, &list->connections[0],
3845                sizeof(struct ctdb_connection)*tcparray->num);
3846
3847         /* We now have a new fresh tickle list array for this vnn */
3848         vnn->tcp_array = tcparray;
3849
3850         return 0;
3851 }
3852
3853 /*
3854   called to return the full list of tickles for the puclic address associated 
3855   with the provided vnn
3856  */
3857 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3858 {
3859         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3860         struct ctdb_tickle_list_old *list;
3861         struct ctdb_tcp_array *tcparray;
3862         int num;
3863         struct ctdb_vnn *vnn;
3864
3865         vnn = find_public_ip_vnn(ctdb, addr);
3866         if (vnn == NULL) {
3867                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3868                         ctdb_addr_to_str(addr)));
3869
3870                 return 1;
3871         }
3872
3873         tcparray = vnn->tcp_array;
3874         if (tcparray) {
3875                 num = tcparray->num;
3876         } else {
3877                 num = 0;
3878         }
3879
3880         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3881                         + sizeof(struct ctdb_connection) * num;
3882
3883         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3884         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3885         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3886
3887         list->addr = *addr;
3888         list->num = num;
3889         if (num) {
3890                 memcpy(&list->connections[0], tcparray->connections,
3891                         sizeof(struct ctdb_connection) * num);
3892         }
3893
3894         return 0;
3895 }
3896
3897
3898 /*
3899   set the list of all tcp tickles for a public address
3900  */
3901 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3902                                             ctdb_sock_addr *addr,
3903                                             struct ctdb_tcp_array *tcparray)
3904 {
3905         int ret, num;
3906         TDB_DATA data;
3907         struct ctdb_tickle_list_old *list;
3908
3909         if (tcparray) {
3910                 num = tcparray->num;
3911         } else {
3912                 num = 0;
3913         }
3914
3915         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3916                         sizeof(struct ctdb_connection) * num;
3917         data.dptr = talloc_size(ctdb, data.dsize);
3918         CTDB_NO_MEMORY(ctdb, data.dptr);
3919
3920         list = (struct ctdb_tickle_list_old *)data.dptr;
3921         list->addr = *addr;
3922         list->num = num;
3923         if (tcparray) {
3924                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3925         }
3926
3927         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3928                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3929                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3930         if (ret != 0) {
3931                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3932                 return -1;
3933         }
3934
3935         talloc_free(data.dptr);
3936
3937         return ret;
3938 }
3939
3940
3941 /*
3942   perform tickle updates if required
3943  */
3944 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3945                                     struct tevent_timer *te,
3946                                     struct timeval t, void *private_data)
3947 {
3948         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3949         int ret;
3950         struct ctdb_vnn *vnn;
3951
3952         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3953                 /* we only send out updates for public addresses that 
3954                    we have taken over
3955                  */
3956                 if (ctdb->pnn != vnn->pnn) {
3957                         continue;
3958                 }
3959                 /* We only send out the updates if we need to */
3960                 if (!vnn->tcp_update_needed) {
3961                         continue;
3962                 }
3963                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3964                                                        &vnn->public_address,
3965                                                        vnn->tcp_array);
3966                 if (ret != 0) {
3967                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3968                                 ctdb_addr_to_str(&vnn->public_address)));
3969                 } else {
3970                         DEBUG(DEBUG_INFO,
3971                               ("Sent tickle update for public address %s\n",
3972                                ctdb_addr_to_str(&vnn->public_address)));
3973                         vnn->tcp_update_needed = false;
3974                 }
3975         }
3976
3977         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3978                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3979                          ctdb_update_tcp_tickles, ctdb);
3980 }
3981
3982 /*
3983   start periodic update of tcp tickles
3984  */
3985 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3986 {
3987         ctdb->tickle_update_context = talloc_new(ctdb);
3988
3989         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3990                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3991                          ctdb_update_tcp_tickles, ctdb);
3992 }
3993
3994
3995
3996
3997 struct control_gratious_arp {
3998         struct ctdb_context *ctdb;
3999         ctdb_sock_addr addr;
4000         const char *iface;
4001         int count;
4002 };
4003
4004 /*
4005   send a control_gratuitous arp
4006  */
4007 static void send_gratious_arp(struct tevent_context *ev,
4008                               struct tevent_timer *te,
4009                               struct timeval t, void *private_data)
4010 {
4011         int ret;
4012         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4013                                                         struct control_gratious_arp);
4014
4015         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4016         if (ret != 0) {
4017                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4018                                  arp->iface, strerror(errno)));
4019         }
4020
4021
4022         arp->count++;
4023         if (arp->count == CTDB_ARP_REPEAT) {
4024                 talloc_free(arp);
4025                 return;
4026         }
4027
4028         tevent_add_timer(arp->ctdb->ev, arp,
4029                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4030                          send_gratious_arp, arp);
4031 }
4032
4033
4034 /*
4035   send a gratious arp 
4036  */
4037 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4038 {
4039         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4040         struct control_gratious_arp *arp;
4041
4042         /* verify the size of indata */
4043         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4044                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4045                                  (unsigned)indata.dsize, 
4046                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4047                 return -1;
4048         }
4049         if (indata.dsize != 
4050                 ( offsetof(struct ctdb_addr_info_old, iface)
4051                 + gratious_arp->len ) ){
4052
4053                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4054                         "but should be %u bytes\n", 
4055                          (unsigned)indata.dsize, 
4056                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4057                 return -1;
4058         }
4059
4060
4061         arp = talloc(ctdb, struct control_gratious_arp);
4062         CTDB_NO_MEMORY(ctdb, arp);
4063
4064         arp->ctdb  = ctdb;
4065         arp->addr   = gratious_arp->addr;
4066         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4067         CTDB_NO_MEMORY(ctdb, arp->iface);
4068         arp->count = 0;
4069
4070         tevent_add_timer(arp->ctdb->ev, arp,
4071                          timeval_zero(), send_gratious_arp, arp);
4072
4073         return 0;
4074 }
4075
4076 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4077 {
4078         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4079         int ret;
4080
4081         /* verify the size of indata */
4082         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4083                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4084                 return -1;
4085         }
4086         if (indata.dsize != 
4087                 ( offsetof(struct ctdb_addr_info_old, iface)
4088                 + pub->len ) ){
4089
4090                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4091                         "but should be %u bytes\n", 
4092                          (unsigned)indata.dsize, 
4093                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4094                 return -1;
4095         }
4096
4097         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4098
4099         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4100
4101         if (ret != 0) {
4102                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4103                 return -1;
4104         }
4105
4106         return 0;
4107 }
4108
/* Context handed to delete_ip_callback() for an asynchronous address delete. */
struct delete_ip_callback_state {
        /* The control request that is answered once the release completes. */
        struct ctdb_req_control_old *c;
};
4112
4113 /*
4114   called when releaseip event finishes for del_public_address
4115  */
4116 static void delete_ip_callback(struct ctdb_context *ctdb,
4117                                int32_t status, TDB_DATA data,
4118                                const char *errormsg,
4119                                void *private_data)
4120 {
4121         struct delete_ip_callback_state *state =
4122                 talloc_get_type(private_data, struct delete_ip_callback_state);
4123
4124         /* If release failed then fail. */
4125         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4126         talloc_free(private_data);
4127 }
4128
4129 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4130                                         struct ctdb_req_control_old *c,
4131                                         TDB_DATA indata, bool *async_reply)
4132 {
4133         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4134         struct ctdb_vnn *vnn;
4135
4136         /* verify the size of indata */
4137         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4138                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4139                 return -1;
4140         }
4141         if (indata.dsize != 
4142                 ( offsetof(struct ctdb_addr_info_old, iface)
4143                 + pub->len ) ){
4144
4145                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4146                         "but should be %u bytes\n", 
4147                          (unsigned)indata.dsize, 
4148                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4149                 return -1;
4150         }
4151
4152         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4153
4154         /* walk over all public addresses until we find a match */
4155         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4156                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4157                         if (vnn->pnn == ctdb->pnn) {
4158                                 struct delete_ip_callback_state *state;
4159                                 struct ctdb_public_ip *ip;
4160                                 TDB_DATA data;
4161                                 int ret;
4162
4163                                 vnn->delete_pending = true;
4164
4165                                 state = talloc(ctdb,
4166                                                struct delete_ip_callback_state);
4167                                 CTDB_NO_MEMORY(ctdb, state);
4168                                 state->c = c;
4169
4170                                 ip = talloc(state, struct ctdb_public_ip);
4171                                 if (ip == NULL) {
4172                                         DEBUG(DEBUG_ERR,
4173                                               (__location__ " Out of memory\n"));
4174                                         talloc_free(state);
4175                                         return -1;
4176                                 }
4177                                 ip->pnn = -1;
4178                                 ip->addr = pub->addr;
4179
4180                                 data.dsize = sizeof(struct ctdb_public_ip);
4181                                 data.dptr = (unsigned char *)ip;
4182
4183                                 ret = ctdb_daemon_send_control(ctdb,
4184                                                                ctdb_get_pnn(ctdb),
4185                                                                0,
4186                                                                CTDB_CONTROL_RELEASE_IP,
4187                                                                0, 0,
4188                                                                data,
4189                                                                delete_ip_callback,
4190                                                                state);
4191                                 if (ret == -1) {
4192                                         DEBUG(DEBUG_ERR,
4193                                               (__location__ "Unable to send "
4194                                                "CTDB_CONTROL_RELEASE_IP\n"));
4195                                         talloc_free(state);
4196                                         return -1;
4197                                 }
4198
4199                                 state->c = talloc_steal(state, c);
4200                                 *async_reply = true;
4201                         } else {
4202                                 /* This IP is not hosted on the
4203                                  * current node so just delete it
4204                                  * now. */
4205                                 do_delete_ip(ctdb, vnn);
4206                         }
4207
4208                         return 0;
4209                 }
4210         }
4211
4212         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4213                          ctdb_addr_to_str(&pub->addr)));
4214         return -1;
4215 }
4216
4217
/* Context handed to ctdb_ipreallocated_callback(). */
struct ipreallocated_callback_state {
        /* The control request answered once the event script has finished. */
        struct ctdb_req_control_old *c;
};
4221
4222 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4223                                         int status, void *p)
4224 {
4225         struct ipreallocated_callback_state *state =
4226                 talloc_get_type(p, struct ipreallocated_callback_state);
4227
4228         if (status != 0) {
4229                 DEBUG(DEBUG_ERR,
4230                       (" \"ipreallocated\" event script failed (status %d)\n",
4231                        status));
4232                 if (status == -ETIME) {
4233                         ctdb_ban_self(ctdb);
4234                 }
4235         }
4236
4237         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4238         talloc_free(state);
4239 }
4240
4241 /* A control to run the ipreallocated event */
4242 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4243                                    struct ctdb_req_control_old *c,
4244                                    bool *async_reply)
4245 {
4246         int ret;
4247         struct ipreallocated_callback_state *state;
4248
4249         state = talloc(ctdb, struct ipreallocated_callback_state);
4250         CTDB_NO_MEMORY(ctdb, state);
4251
4252         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4253
4254         ret = ctdb_event_script_callback(ctdb, state,
4255                                          ctdb_ipreallocated_callback, state,
4256                                          CTDB_EVENT_IPREALLOCATED,
4257                                          "%s", "");
4258
4259         if (ret != 0) {
4260                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4261                 talloc_free(state);
4262                 return -1;
4263         }
4264
4265         /* tell the control that we will be reply asynchronously */
4266         state->c    = talloc_steal(state, c);
4267         *async_reply = true;
4268
4269         return 0;
4270 }
4271
4272
4273 /* This function is called from the recovery daemon to verify that a remote
4274    node has the expected ip allocation.
4275    This is verified against ctdb->ip_tree
4276 */
4277 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4278                                        struct ctdb_public_ip_list_old *ips,
4279                                        uint32_t pnn)
4280 {
4281         struct public_ip_list *tmp_ip;
4282         int i;
4283
4284         if (ctdb->ip_tree == NULL) {
4285                 /* don't know the expected allocation yet, assume remote node
4286                    is correct. */
4287                 return 0;
4288         }
4289
4290         if (ips == NULL) {
4291                 return 0;
4292         }
4293
4294         for (i=0; i<ips->num; i++) {
4295                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4296                 if (tmp_ip == NULL) {
4297                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4298                         return -1;
4299                 }
4300
4301                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4302                         continue;
4303                 }
4304
4305                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4306                         DEBUG(DEBUG_ERR,
4307                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4308                                pnn,
4309                                ctdb_addr_to_str(&ips->ips[i].addr),
4310                                ips->ips[i].pnn, tmp_ip->pnn));
4311                         return -1;
4312                 }
4313         }
4314
4315         return 0;
4316 }
4317
4318 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4319 {
4320         struct public_ip_list *tmp_ip;
4321
4322         /* IP tree is never built if DisableIPFailover is set */
4323         if (ctdb->tunable.disable_ip_failover != 0) {
4324                 return 0;
4325         }
4326
4327         if (ctdb->ip_tree == NULL) {
4328                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4329                 return -1;
4330         }
4331
4332         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4333         if (tmp_ip == NULL) {
4334                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4335                 return -1;
4336         }
4337
4338         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4339         tmp_ip->pnn = ip->pnn;
4340
4341         return 0;
4342 }
4343
4344 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4345 {
4346         TALLOC_FREE(ctdb->ip_tree);
4347 }
4348
/* Tracks one reload of the public addresses file run in a child process. */
struct ctdb_reloadips_handle {
        /* Owning daemon context. */
        struct ctdb_context *ctdb;
        /* Control request answered when the handle is destroyed; may be NULL. */
        struct ctdb_req_control_old *c;
        /* Status sent in that reply. */
        int status;
        /* Pipe to the child: the parent reads fd[0], the child writes fd[1]. */
        int fd[2];
        /* Pid of the child; it is sent SIGKILL when the handle is destroyed. */
        pid_t child;
        /* Read event watching fd[0] for the child's one-byte result. */
        struct tevent_fd *fde;
};
4357
4358 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4359 {
4360         if (h == h->ctdb->reload_ips) {
4361                 h->ctdb->reload_ips = NULL;
4362         }
4363         if (h->c != NULL) {
4364                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4365                 h->c = NULL;
4366         }
4367         ctdb_kill(h->ctdb, h->child, SIGKILL);
4368         return 0;
4369 }
4370
4371 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4372                                          struct tevent_timer *te,
4373                                          struct timeval t, void *private_data)
4374 {
4375         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4376
4377         talloc_free(h);
4378 }
4379
4380 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4381                                          struct tevent_fd *fde,
4382                                          uint16_t flags, void *private_data)
4383 {
4384         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4385
4386         char res;
4387         int ret;
4388
4389         ret = sys_read(h->fd[0], &res, 1);
4390         if (ret < 1 || res != 0) {
4391                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4392                 res = 1;
4393         }
4394         h->status = res;
4395
4396         talloc_free(h);
4397 }
4398
4399 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4400 {
4401         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4402         struct ctdb_public_ip_list_old *ips;
4403         struct ctdb_vnn *vnn;
4404         struct client_async_data *async_data;
4405         struct timeval timeout;
4406         TDB_DATA data;
4407         struct ctdb_client_control_state *state;
4408         bool first_add;
4409         int i, ret;
4410
4411         CTDB_NO_MEMORY(ctdb, mem_ctx);
4412
4413         /* Read IPs from local node */
4414         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4415                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4416         if (ret != 0) {
4417                 DEBUG(DEBUG_ERR,
4418                       ("Unable to fetch public IPs from local node\n"));
4419                 talloc_free(mem_ctx);
4420                 return -1;
4421         }
4422
4423         /* Read IPs file - this is safe since this is a child process */
4424         ctdb->vnn = NULL;
4425         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4426                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4427                 talloc_free(mem_ctx);
4428                 return -1;
4429         }
4430
4431         async_data = talloc_zero(mem_ctx, struct client_async_data);
4432         CTDB_NO_MEMORY(ctdb, async_data);
4433
4434         /* Compare IPs between node and file for IPs to be deleted */
4435         for (i = 0; i < ips->num; i++) {
4436                 /* */
4437                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4438                         if (ctdb_same_ip(&vnn->public_address,
4439                                          &ips->ips[i].addr)) {
4440                                 /* IP is still in file */
4441                                 break;
4442                         }
4443                 }
4444
4445                 if (vnn == NULL) {
4446                         /* Delete IP ips->ips[i] */
4447                         struct ctdb_addr_info_old *pub;
4448
4449                         DEBUG(DEBUG_NOTICE,
4450                               ("IP %s no longer configured, deleting it\n",
4451                                ctdb_addr_to_str(&ips->ips[i].addr)));
4452
4453                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4454                         CTDB_NO_MEMORY(ctdb, pub);
4455
4456                         pub->addr  = ips->ips[i].addr;
4457                         pub->mask  = 0;
4458                         pub->len   = 0;
4459
4460                         timeout = TAKEOVER_TIMEOUT();
4461
4462                         data.dsize = offsetof(struct ctdb_addr_info_old,
4463                                               iface) + pub->len;
4464                         data.dptr = (uint8_t *)pub;
4465
4466                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4467                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4468                                                   0, data, async_data,
4469                                                   &timeout, NULL);
4470                         if (state == NULL) {
4471                                 DEBUG(DEBUG_ERR,
4472                                       (__location__
4473                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4474                                 goto failed;
4475                         }
4476
4477                         ctdb_client_async_add(async_data, state);
4478                 }
4479         }
4480
4481         /* Compare IPs between node and file for IPs to be added */
4482         first_add = true;
4483         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4484                 for (i = 0; i < ips->num; i++) {
4485                         if (ctdb_same_ip(&vnn->public_address,
4486                                          &ips->ips[i].addr)) {
4487                                 /* IP already on node */
4488                                 break;
4489                         }
4490                 }
4491                 if (i == ips->num) {
4492                         /* Add IP ips->ips[i] */
4493                         struct ctdb_addr_info_old *pub;
4494                         const char *ifaces = NULL;
4495                         uint32_t len;
4496                         int iface = 0;
4497
4498                         DEBUG(DEBUG_NOTICE,
4499                               ("New IP %s configured, adding it\n",
4500                                ctdb_addr_to_str(&vnn->public_address)));
4501                         if (first_add) {
4502                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4503
4504                                 data.dsize = sizeof(pnn);
4505                                 data.dptr  = (uint8_t *)&pnn;
4506
4507                                 ret = ctdb_client_send_message(
4508                                         ctdb,
4509                                         CTDB_BROADCAST_CONNECTED,
4510                                         CTDB_SRVID_REBALANCE_NODE,
4511                                         data);
4512                                 if (ret != 0) {
4513                                         DEBUG(DEBUG_WARNING,
4514                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4515                                 }
4516
4517                                 first_add = false;
4518                         }
4519
4520                         ifaces = vnn->ifaces[0];
4521                         iface = 1;
4522                         while (vnn->ifaces[iface] != NULL) {
4523                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4524                                                          vnn->ifaces[iface]);
4525                                 iface++;
4526                         }
4527
4528                         len   = strlen(ifaces) + 1;
4529                         pub = talloc_zero_size(mem_ctx,
4530                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4531                         CTDB_NO_MEMORY(ctdb, pub);
4532
4533                         pub->addr  = vnn->public_address;
4534                         pub->mask  = vnn->public_netmask_bits;
4535                         pub->len   = len;
4536                         memcpy(&pub->iface[0], ifaces, pub->len);
4537
4538                         timeout = TAKEOVER_TIMEOUT();
4539
4540                         data.dsize = offsetof(struct ctdb_addr_info_old,
4541                                               iface) + pub->len;
4542                         data.dptr = (uint8_t *)pub;
4543
4544                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4545                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4546                                                   0, data, async_data,
4547                                                   &timeout, NULL);
4548                         if (state == NULL) {
4549                                 DEBUG(DEBUG_ERR,
4550                                       (__location__
4551                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4552                                 goto failed;
4553                         }
4554
4555                         ctdb_client_async_add(async_data, state);
4556                 }
4557         }
4558
4559         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4560                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4561                 goto failed;
4562         }
4563
4564         talloc_free(mem_ctx);
4565         return 0;
4566
4567 failed:
4568         talloc_free(mem_ctx);
4569         return -1;
4570 }
4571
4572 /* This control is sent to force the node to re-read the public addresses file
4573    and drop any addresses we should nnot longer host, and add new addresses
4574    that we are now able to host
4575 */
4576 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4577 {
4578         struct ctdb_reloadips_handle *h;
4579         pid_t parent = getpid();
4580
4581         if (ctdb->reload_ips != NULL) {
4582                 talloc_free(ctdb->reload_ips);
4583                 ctdb->reload_ips = NULL;
4584         }
4585
4586         h = talloc(ctdb, struct ctdb_reloadips_handle);
4587         CTDB_NO_MEMORY(ctdb, h);
4588         h->ctdb     = ctdb;
4589         h->c        = NULL;
4590         h->status   = -1;
4591         
4592         if (pipe(h->fd) == -1) {
4593                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4594                 talloc_free(h);
4595                 return -1;
4596         }
4597
4598         h->child = ctdb_fork(ctdb);
4599         if (h->child == (pid_t)-1) {
4600                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4601                 close(h->fd[0]);
4602                 close(h->fd[1]);
4603                 talloc_free(h);
4604                 return -1;
4605         }
4606
4607         /* child process */
4608         if (h->child == 0) {
4609                 signed char res = 0;
4610
4611                 close(h->fd[0]);
4612                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4613
4614                 prctl_set_comment("ctdb_reloadips");
4615                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4616                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4617                         res = -1;
4618                 } else {
4619                         res = ctdb_reloadips_child(ctdb);
4620                         if (res != 0) {
4621                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4622                         }
4623                 }
4624
4625                 sys_write(h->fd[1], &res, 1);
4626                 /* make sure we die when our parent dies */
4627                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4628                         sleep(5);
4629                 }
4630                 _exit(0);
4631         }
4632
4633         h->c             = talloc_steal(h, c);
4634
4635         close(h->fd[1]);
4636         set_close_on_exec(h->fd[0]);
4637
4638         talloc_set_destructor(h, ctdb_reloadips_destructor);
4639
4640
4641         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4642                                ctdb_reloadips_child_handler, (void *)h);
4643         tevent_fd_set_auto_close(h->fde);
4644
4645         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4646                          ctdb_reloadips_timeout_event, h);
4647
4648         /* we reply later */
4649         *async_reply = true;
4650         return 0;
4651 }