ctdb-ipalloc: Add no_ip_failback to ipalloc_state
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44
/* Timeout value for takeover operations, built by timeval_current_ofs()
 * from the takeover_timeout tunable.  Expands to an expression that
 * uses a variable named "ctdb", which must be in scope at the point of
 * use. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument passed to timeval_current_ofs() when
 * ctdb_control_send_arp() re-arms its timer between announcement rounds. */
#define CTDB_ARP_INTERVAL 1
/* Number of announcement rounds ctdb_control_send_arp() performs before
 * it frees its state and stops re-arming the timer. */
#define CTDB_ARP_REPEAT   3
49
/* Flags used in IP allocation algorithms. */
struct ctdb_ipflags {
        /* NOTE(review): presumably "this node must not take over IPs" -
         * confirm against the allocation code that reads it */
        bool noiptakeover;
        /* NOTE(review): presumably "this node must not host any IPs" -
         * confirm against the allocation code that reads it */
        bool noiphost;
};
55
/* Identifies an IP allocation algorithm; stored in
 * struct ipalloc_state.algorithm. */
enum ipalloc_algorithm {
        IPALLOC_DETERMINISTIC,
        IPALLOC_NONDETERMINISTIC,
        IPALLOC_LCP2,
};
61
/* State carried through one run of IP allocation. */
struct ipalloc_state {
        /* NOTE(review): presumably the number of nodes, i.e. the length
         * of the per-node arrays below - confirm against the code that
         * fills this in */
        uint32_t num;

        /* Arrays with data for each node */
        struct ctdb_public_ip_list_old **known_public_ips;
        struct ctdb_public_ip_list_old **available_public_ips;

        /* Which allocation algorithm to run */
        enum ipalloc_algorithm algorithm;
        /* NOTE(review): presumably a copy of the "no IP failback"
         * setting consulted by the allocation algorithms - confirm */
        uint32_t no_ip_failback;
};
72
/* One network interface known to the daemon; entries live on the
 * ctdb->ifaces list. */
struct ctdb_interface {
        /* List linkage for ctdb->ifaces (DLIST_ADD / DLIST_REMOVE) */
        struct ctdb_interface *prev, *next;
        /* Interface name; allocated with talloc_strdup() as a child of
         * this structure and compared with strcmp() on lookup */
        const char *name;
        /* Set to true when the entry is created; interfaces with this
         * flag clear are skipped when choosing an interface for an IP */
        bool link_up;
        /* Incremented when a vnn is assigned to this interface and
         * decremented when it is unassigned */
        uint32_t references;
};
79
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
81 {
82         if (vnn->iface) {
83                 return vnn->iface->name;
84         }
85
86         return "__none__";
87 }
88
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
90 {
91         struct ctdb_interface *i;
92
93         /* Verify that we don't have an entry for this ip yet */
94         for (i=ctdb->ifaces;i;i=i->next) {
95                 if (strcmp(i->name, iface) == 0) {
96                         return 0;
97                 }
98         }
99
100         /* create a new structure for this interface */
101         i = talloc_zero(ctdb, struct ctdb_interface);
102         CTDB_NO_MEMORY_FATAL(ctdb, i);
103         i->name = talloc_strdup(i, iface);
104         CTDB_NO_MEMORY(ctdb, i->name);
105
106         i->link_up = true;
107
108         DLIST_ADD(ctdb->ifaces, i);
109
110         return 0;
111 }
112
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
114                                         const char *name)
115 {
116         int n;
117
118         for (n = 0; vnn->ifaces[n] != NULL; n++) {
119                 if (strcmp(name, vnn->ifaces[n]) == 0) {
120                         return true;
121                 }
122         }
123
124         return false;
125 }
126
127 /* If any interfaces now have no possible IPs then delete them.  This
128  * implementation is naive (i.e. simple) rather than clever
129  * (i.e. complex).  Given that this is run on delip and that operation
130  * is rare, this doesn't need to be efficient - it needs to be
131  * foolproof.  One alternative is reference counting, where the logic
132  * is distributed and can, therefore, be broken in multiple places.
133  * Another alternative is to build a red-black tree of interfaces that
134  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135  * once) and then walking ctdb->ifaces once and deleting those not in
136  * the tree.  Let's go to one of those if the naive implementation
137  * causes problems...  :-)
138  */
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140                                         struct ctdb_vnn *vnn)
141 {
142         struct ctdb_interface *i, *next;
143
144         /* For each interface, check if there's an IP using it. */
145         for (i = ctdb->ifaces; i != NULL; i = next) {
146                 struct ctdb_vnn *tv;
147                 bool found;
148                 next = i->next;
149
150                 /* Only consider interfaces named in the given VNN. */
151                 if (!vnn_has_interface_with_name(vnn, i->name)) {
152                         continue;
153                 }
154
155                 /* Is the "single IP" on this interface? */
156                 if ((ctdb->single_ip_vnn != NULL) &&
157                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159                         /* Found, next interface please... */
160                         continue;
161                 }
162                 /* Search for a vnn with this interface. */
163                 found = false;
164                 for (tv=ctdb->vnn; tv; tv=tv->next) {
165                         if (vnn_has_interface_with_name(tv, i->name)) {
166                                 found = true;
167                                 break;
168                         }
169                 }
170
171                 if (!found) {
172                         /* None of the VNNs are using this interface. */
173                         DLIST_REMOVE(ctdb->ifaces, i);
174                         talloc_free(i);
175                 }
176         }
177 }
178
179
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
181                                               const char *iface)
182 {
183         struct ctdb_interface *i;
184
185         for (i=ctdb->ifaces;i;i=i->next) {
186                 if (strcmp(i->name, iface) == 0) {
187                         return i;
188                 }
189         }
190
191         return NULL;
192 }
193
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195                                                   struct ctdb_vnn *vnn)
196 {
197         int i;
198         struct ctdb_interface *cur = NULL;
199         struct ctdb_interface *best = NULL;
200
201         for (i=0; vnn->ifaces[i]; i++) {
202
203                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
204                 if (cur == NULL) {
205                         continue;
206                 }
207
208                 if (!cur->link_up) {
209                         continue;
210                 }
211
212                 if (best == NULL) {
213                         best = cur;
214                         continue;
215                 }
216
217                 if (cur->references < best->references) {
218                         best = cur;
219                         continue;
220                 }
221         }
222
223         return best;
224 }
225
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227                                      struct ctdb_vnn *vnn)
228 {
229         struct ctdb_interface *best = NULL;
230
231         if (vnn->iface) {
232                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                                    "still assigned to iface '%s'\n",
234                                    ctdb_addr_to_str(&vnn->public_address),
235                                    ctdb_vnn_iface_string(vnn)));
236                 return 0;
237         }
238
239         best = ctdb_vnn_best_iface(ctdb, vnn);
240         if (best == NULL) {
241                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242                                   "cannot assign to iface any iface\n",
243                                   ctdb_addr_to_str(&vnn->public_address)));
244                 return -1;
245         }
246
247         vnn->iface = best;
248         best->references++;
249         vnn->pnn = ctdb->pnn;
250
251         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252                            "now assigned to iface '%s' refs[%d]\n",
253                            ctdb_addr_to_str(&vnn->public_address),
254                            ctdb_vnn_iface_string(vnn),
255                            best->references));
256         return 0;
257 }
258
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260                                     struct ctdb_vnn *vnn)
261 {
262         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263                            "now unassigned (old iface '%s' refs[%d])\n",
264                            ctdb_addr_to_str(&vnn->public_address),
265                            ctdb_vnn_iface_string(vnn),
266                            vnn->iface?vnn->iface->references:0));
267         if (vnn->iface) {
268                 vnn->iface->references--;
269         }
270         vnn->iface = NULL;
271         if (vnn->pnn == ctdb->pnn) {
272                 vnn->pnn = -1;
273         }
274 }
275
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277                                struct ctdb_vnn *vnn)
278 {
279         int i;
280
281         /* Nodes that are not RUNNING can not host IPs */
282         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
283                 return false;
284         }
285
286         if (vnn->delete_pending) {
287                 return false;
288         }
289
290         if (vnn->iface && vnn->iface->link_up) {
291                 return true;
292         }
293
294         for (i=0; vnn->ifaces[i]; i++) {
295                 struct ctdb_interface *cur;
296
297                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
298                 if (cur == NULL) {
299                         continue;
300                 }
301
302                 if (cur->link_up) {
303                         return true;
304                 }
305         }
306
307         return false;
308 }
309
/* State for the repeated announcement of a public address done by
 * ctdb_control_send_arp(). */
struct ctdb_takeover_arp {
        /* Daemon context; supplies the event context for the timer */
        struct ctdb_context *ctdb;
        /* Number of announcement rounds completed so far */
        uint32_t count;
        /* Address being announced (copied from vnn->public_address) */
        ctdb_sock_addr addr;
        /* Connections to send tickle acks for; taken over from
         * vnn->tcp_array with talloc_steal(), may be NULL */
        struct ctdb_tcp_array *tcparray;
        /* vnn being announced; supplies the interface name and the
         * talloc context the timer is created on */
        struct ctdb_vnn *vnn;
};
317
318
/*
  lists of tcp endpoints
 */
struct ctdb_tcp_list {
        /* NOTE(review): prev/next look like DLIST linkage - confirm
         * against the code that builds these lists */
        struct ctdb_tcp_list *prev, *next;
        /* The tcp connection recorded by this list entry */
        struct ctdb_connection connection;
};
326
/*
  list of clients to kill on IP release

  Entries are walked from ctdb->client_ip_list by release_kill_clients().
 */
struct ctdb_client_ip {
        struct ctdb_client_ip *prev, *next;
        struct ctdb_context *ctdb;
        /* Address the client is registered with; compared against the
         * address being released */
        ctdb_sock_addr addr;
        /* Request id used with reqid_find() to locate the
         * struct ctdb_client */
        uint32_t client_id;
};
336
337
338 /*
339   send a gratuitous arp
340  */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342                                   struct tevent_timer *te,
343                                   struct timeval t, void *private_data)
344 {
345         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
346                                                         struct ctdb_takeover_arp);
347         int i, ret;
348         struct ctdb_tcp_array *tcparray;
349         const char *iface = ctdb_vnn_iface_string(arp->vnn);
350
351         ret = ctdb_sys_send_arp(&arp->addr, iface);
352         if (ret != 0) {
353                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354                                   iface, strerror(errno)));
355         }
356
357         tcparray = arp->tcparray;
358         if (tcparray) {
359                 for (i=0;i<tcparray->num;i++) {
360                         struct ctdb_connection *tcon;
361
362                         tcon = &tcparray->connections[i];
363                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
365                                 ctdb_addr_to_str(&tcon->src),
366                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
367                         ret = ctdb_sys_send_tcp(
368                                 &tcon->src,
369                                 &tcon->dst,
370                                 0, 0, 0);
371                         if (ret != 0) {
372                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373                                         ctdb_addr_to_str(&tcon->src)));
374                         }
375                 }
376         }
377
378         arp->count++;
379
380         if (arp->count == CTDB_ARP_REPEAT) {
381                 talloc_free(arp);
382                 return;
383         }
384
385         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387                          ctdb_control_send_arp, arp);
388 }
389
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391                                        struct ctdb_vnn *vnn)
392 {
393         struct ctdb_takeover_arp *arp;
394         struct ctdb_tcp_array *tcparray;
395
396         if (!vnn->takeover_ctx) {
397                 vnn->takeover_ctx = talloc_new(vnn);
398                 if (!vnn->takeover_ctx) {
399                         return -1;
400                 }
401         }
402
403         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
404         if (!arp) {
405                 return -1;
406         }
407
408         arp->ctdb = ctdb;
409         arp->addr = vnn->public_address;
410         arp->vnn  = vnn;
411
412         tcparray = vnn->tcp_array;
413         if (tcparray) {
414                 /* add all of the known tcp connections for this IP to the
415                    list of tcp connections to send tickle acks for */
416                 arp->tcparray = talloc_steal(arp, tcparray);
417
418                 vnn->tcp_array = NULL;
419                 vnn->tcp_update_needed = true;
420         }
421
422         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423                          timeval_zero(), ctdb_control_send_arp, arp);
424
425         return 0;
426 }
427
/* Private data handed to release_ip_callback(). */
struct takeover_callback_state {
        /* NOTE(review): presumably the control request to reply to once
         * the event has finished - confirm in release_ip_callback() */
        struct ctdb_req_control_old *c;
        /* Address being released; checked with ctdb_sys_have_ip() */
        ctdb_sock_addr *addr;
        /* NOTE(review): presumably the vnn the address belongs to -
         * confirm in release_ip_callback() */
        struct ctdb_vnn *vnn;
};
433
/* Private data for a takeip event in flight; its talloc destructor
 * clears vnn->update_in_flight. */
struct ctdb_do_takeip_state {
        /* Control request that is replied to when the event finishes */
        struct ctdb_req_control_old *c;
        /* vnn whose address is being taken over */
        struct ctdb_vnn *vnn;
};
438
439 /*
440   called when takeip event finishes
441  */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
443                                     void *private_data)
444 {
445         struct ctdb_do_takeip_state *state =
446                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
447         int32_t ret;
448         TDB_DATA data;
449
450         if (status != 0) {
451                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
452         
453                 if (status == -ETIME) {
454                         ctdb_ban_self(ctdb);
455                 }
456                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457                                  ctdb_addr_to_str(&state->vnn->public_address),
458                                  ctdb_vnn_iface_string(state->vnn)));
459                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
460
461                 node->flags |= NODE_FLAGS_UNHEALTHY;
462                 talloc_free(state);
463                 return;
464         }
465
466         if (ctdb->do_checkpublicip) {
467
468         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469         if (ret != 0) {
470                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471                 talloc_free(state);
472                 return;
473         }
474
475         }
476
477         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478         data.dsize = strlen((char *)data.dptr) + 1;
479         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
480
481         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
482
483
484         /* the control succeeded */
485         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
486         talloc_free(state);
487         return;
488 }
489
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
491 {
492         state->vnn->update_in_flight = false;
493         return 0;
494 }
495
496 /*
497   take over an ip address
498  */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500                               struct ctdb_req_control_old *c,
501                               struct ctdb_vnn *vnn)
502 {
503         int ret;
504         struct ctdb_do_takeip_state *state;
505
506         if (vnn->update_in_flight) {
507                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508                                     "update for this IP already in flight\n",
509                                     ctdb_addr_to_str(&vnn->public_address),
510                                     vnn->public_netmask_bits));
511                 return -1;
512         }
513
514         ret = ctdb_vnn_assign_iface(ctdb, vnn);
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517                                  "assign a usable interface\n",
518                                  ctdb_addr_to_str(&vnn->public_address),
519                                  vnn->public_netmask_bits));
520                 return -1;
521         }
522
523         state = talloc(vnn, struct ctdb_do_takeip_state);
524         CTDB_NO_MEMORY(ctdb, state);
525
526         state->c = talloc_steal(ctdb, c);
527         state->vnn   = vnn;
528
529         vnn->update_in_flight = true;
530         talloc_set_destructor(state, ctdb_takeip_destructor);
531
532         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533                             ctdb_addr_to_str(&vnn->public_address),
534                             vnn->public_netmask_bits,
535                             ctdb_vnn_iface_string(vnn)));
536
537         ret = ctdb_event_script_callback(ctdb,
538                                          state,
539                                          ctdb_do_takeip_callback,
540                                          state,
541                                          CTDB_EVENT_TAKE_IP,
542                                          "%s %s %u",
543                                          ctdb_vnn_iface_string(vnn),
544                                          ctdb_addr_to_str(&vnn->public_address),
545                                          vnn->public_netmask_bits);
546
547         if (ret != 0) {
548                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549                         ctdb_addr_to_str(&vnn->public_address),
550                         ctdb_vnn_iface_string(vnn)));
551                 talloc_free(state);
552                 return -1;
553         }
554
555         return 0;
556 }
557
/* Private data for an updateip event in flight; its talloc destructor
 * clears vnn->update_in_flight. */
struct ctdb_do_updateip_state {
        /* Control request that is replied to when the event finishes */
        struct ctdb_req_control_old *c;
        /* Interface the address was on before the move; put back on the
         * vnn if the event fails */
        struct ctdb_interface *old;
        /* vnn whose address is being moved */
        struct ctdb_vnn *vnn;
};
563
564 /*
565   called when updateip event finishes
566  */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
568                                       void *private_data)
569 {
570         struct ctdb_do_updateip_state *state =
571                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
572         int32_t ret;
573
574         if (status != 0) {
575                 if (status == -ETIME) {
576                         ctdb_ban_self(ctdb);
577                 }
578                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579                         ctdb_addr_to_str(&state->vnn->public_address),
580                         state->old->name,
581                         ctdb_vnn_iface_string(state->vnn)));
582
583                 /*
584                  * All we can do is reset the old interface
585                  * and let the next run fix it
586                  */
587                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588                 state->vnn->iface = state->old;
589                 state->vnn->iface->references++;
590
591                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
592                 talloc_free(state);
593                 return;
594         }
595
596         if (ctdb->do_checkpublicip) {
597
598         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
599         if (ret != 0) {
600                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
601                 talloc_free(state);
602                 return;
603         }
604
605         }
606
607         /* the control succeeded */
608         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
609         talloc_free(state);
610         return;
611 }
612
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
614 {
615         state->vnn->update_in_flight = false;
616         return 0;
617 }
618
619 /*
620   update (move) an ip address
621  */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623                                 struct ctdb_req_control_old *c,
624                                 struct ctdb_vnn *vnn)
625 {
626         int ret;
627         struct ctdb_do_updateip_state *state;
628         struct ctdb_interface *old = vnn->iface;
629         const char *new_name;
630
631         if (vnn->update_in_flight) {
632                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633                                     "update for this IP already in flight\n",
634                                     ctdb_addr_to_str(&vnn->public_address),
635                                     vnn->public_netmask_bits));
636                 return -1;
637         }
638
639         ctdb_vnn_unassign_iface(ctdb, vnn);
640         ret = ctdb_vnn_assign_iface(ctdb, vnn);
641         if (ret != 0) {
642                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643                                  "assin a usable interface (old iface '%s')\n",
644                                  ctdb_addr_to_str(&vnn->public_address),
645                                  vnn->public_netmask_bits,
646                                  old->name));
647                 return -1;
648         }
649
650         new_name = ctdb_vnn_iface_string(vnn);
651         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652                 /* A benign update from one interface onto itself.
653                  * no need to run the eventscripts in this case, just return
654                  * success.
655                  */
656                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
657                 return 0;
658         }
659
660         state = talloc(vnn, struct ctdb_do_updateip_state);
661         CTDB_NO_MEMORY(ctdb, state);
662
663         state->c = talloc_steal(ctdb, c);
664         state->old = old;
665         state->vnn = vnn;
666
667         vnn->update_in_flight = true;
668         talloc_set_destructor(state, ctdb_updateip_destructor);
669
670         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671                             "interface %s to %s\n",
672                             ctdb_addr_to_str(&vnn->public_address),
673                             vnn->public_netmask_bits,
674                             old->name,
675                             new_name));
676
677         ret = ctdb_event_script_callback(ctdb,
678                                          state,
679                                          ctdb_do_updateip_callback,
680                                          state,
681                                          CTDB_EVENT_UPDATE_IP,
682                                          "%s %s %s %u",
683                                          state->old->name,
684                                          new_name,
685                                          ctdb_addr_to_str(&vnn->public_address),
686                                          vnn->public_netmask_bits);
687         if (ret != 0) {
688                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689                                  ctdb_addr_to_str(&vnn->public_address),
690                                  old->name, new_name));
691                 talloc_free(state);
692                 return -1;
693         }
694
695         return 0;
696 }
697
698 /*
699   Find the vnn of the node that has a public ip address
700   returns -1 if the address is not known as a public address
701  */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
703 {
704         struct ctdb_vnn *vnn;
705
706         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707                 if (ctdb_same_ip(&vnn->public_address, addr)) {
708                         return vnn;
709                 }
710         }
711
712         return NULL;
713 }
714
715 /*
716   take over an ip address
717  */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719                                  struct ctdb_req_control_old *c,
720                                  TDB_DATA indata,
721                                  bool *async_reply)
722 {
723         int ret;
724         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725         struct ctdb_vnn *vnn;
726         bool have_ip = false;
727         bool do_updateip = false;
728         bool do_takeip = false;
729         struct ctdb_interface *best_iface = NULL;
730
731         if (pip->pnn != ctdb->pnn) {
732                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733                                  "with pnn %d, but we're node %d\n",
734                                  ctdb_addr_to_str(&pip->addr),
735                                  pip->pnn, ctdb->pnn));
736                 return -1;
737         }
738
739         /* update out vnn list */
740         vnn = find_public_ip_vnn(ctdb, &pip->addr);
741         if (vnn == NULL) {
742                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743                         ctdb_addr_to_str(&pip->addr)));
744                 return 0;
745         }
746
747         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748                 have_ip = ctdb_sys_have_ip(&pip->addr);
749         }
750         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751         if (best_iface == NULL) {
752                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753                                  "a usable interface (old %s, have_ip %d)\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  vnn->public_netmask_bits,
756                                  ctdb_vnn_iface_string(vnn),
757                                  have_ip));
758                 return -1;
759         }
760
761         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
763                 have_ip = false;
764         }
765
766
767         if (vnn->iface == NULL && have_ip) {
768                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770                                  ctdb_addr_to_str(&vnn->public_address)));
771                 return 0;
772         }
773
774         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776                                   "and we have it on iface[%s], but it was assigned to node %d"
777                                   "and we are node %d, banning ourself\n",
778                                  ctdb_addr_to_str(&vnn->public_address),
779                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
780                 ctdb_ban_self(ctdb);
781                 return -1;
782         }
783
784         if (vnn->pnn == -1 && have_ip) {
785                 vnn->pnn = ctdb->pnn;
786                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787                                   "and we already have it on iface[%s], update local daemon\n",
788                                  ctdb_addr_to_str(&vnn->public_address),
789                                   ctdb_vnn_iface_string(vnn)));
790                 return 0;
791         }
792
793         if (vnn->iface) {
794                 if (vnn->iface != best_iface) {
795                         if (!vnn->iface->link_up) {
796                                 do_updateip = true;
797                         } else if (vnn->iface->references > (best_iface->references + 1)) {
798                                 /* only move when the rebalance gains something */
799                                         do_updateip = true;
800                         }
801                 }
802         }
803
804         if (!have_ip) {
805                 if (do_updateip) {
806                         ctdb_vnn_unassign_iface(ctdb, vnn);
807                         do_updateip = false;
808                 }
809                 do_takeip = true;
810         }
811
812         if (do_takeip) {
813                 ret = ctdb_do_takeip(ctdb, c, vnn);
814                 if (ret != 0) {
815                         return -1;
816                 }
817         } else if (do_updateip) {
818                 ret = ctdb_do_updateip(ctdb, c, vnn);
819                 if (ret != 0) {
820                         return -1;
821                 }
822         } else {
823                 /*
824                  * The interface is up and the kernel known the ip
825                  * => do nothing
826                  */
827                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828                         ctdb_addr_to_str(&pip->addr),
829                         vnn->public_netmask_bits,
830                         ctdb_vnn_iface_string(vnn)));
831                 return 0;
832         }
833
834         /* tell ctdb_control.c that we will be replying asynchronously */
835         *async_reply = true;
836
837         return 0;
838 }
839
840 /*
841   kill any clients that are registered with a IP that is being released
842  */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
844 {
845         struct ctdb_client_ip *ip;
846
847         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848                 ctdb_addr_to_str(addr)));
849
850         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851                 ctdb_sock_addr tmp_addr;
852
853                 tmp_addr = ip->addr;
854                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
855                         ip->client_id,
856                         ctdb_addr_to_str(&ip->addr)));
857
858                 if (ctdb_same_ip(&tmp_addr, addr)) {
859                         struct ctdb_client *client = reqid_find(ctdb->idr,
860                                                                 ip->client_id,
861                                                                 struct ctdb_client);
862                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
863                                 ip->client_id,
864                                 ctdb_addr_to_str(&ip->addr),
865                                 client->pid));
866
867                         if (client->pid != 0) {
868                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869                                         (unsigned)client->pid,
870                                         ctdb_addr_to_str(addr),
871                                         ip->client_id));
872                                 kill(client->pid, SIGKILL);
873                         }
874                 }
875         }
876 }
877
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
879 {
880         DLIST_REMOVE(ctdb->vnn, vnn);
881         ctdb_vnn_unassign_iface(ctdb, vnn);
882         ctdb_remove_orphaned_ifaces(ctdb, vnn);
883         talloc_free(vnn);
884 }
885
886 /*
887   called when releaseip event finishes
888  */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
890                                 void *private_data)
891 {
892         struct takeover_callback_state *state = 
893                 talloc_get_type(private_data, struct takeover_callback_state);
894         TDB_DATA data;
895
896         if (status == -ETIME) {
897                 ctdb_ban_self(ctdb);
898         }
899
900         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901                 if  (ctdb_sys_have_ip(state->addr)) {
902                         DEBUG(DEBUG_ERR,
903                               ("IP %s still hosted during release IP callback, failing\n",
904                                ctdb_addr_to_str(state->addr)));
905                         ctdb_request_control_reply(ctdb, state->c,
906                                                    NULL, -1, NULL);
907                         talloc_free(state);
908                         return;
909                 }
910         }
911
912         /* send a message to all clients of this node telling them
913            that the cluster has been reconfigured and they should
914            release any sockets on this IP */
915         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917         data.dsize = strlen((char *)data.dptr)+1;
918
919         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
920
921         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
922
923         /* kill clients that have registered with this IP */
924         release_kill_clients(ctdb, state->addr);
925
926         ctdb_vnn_unassign_iface(ctdb, state->vnn);
927
928         /* Process the IP if it has been marked for deletion */
929         if (state->vnn->delete_pending) {
930                 do_delete_ip(ctdb, state->vnn);
931                 state->vnn = NULL;
932         }
933
934         /* the control succeeded */
935         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
936         talloc_free(state);
937 }
938
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
940 {
941         if (state->vnn != NULL) {
942                 state->vnn->update_in_flight = false;
943         }
944         return 0;
945 }
946
947 /*
948   release an ip address
949  */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
951                                 struct ctdb_req_control_old *c,
952                                 TDB_DATA indata, 
953                                 bool *async_reply)
954 {
955         int ret;
956         struct takeover_callback_state *state;
957         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958         struct ctdb_vnn *vnn;
959         char *iface;
960
961         /* update our vnn list */
962         vnn = find_public_ip_vnn(ctdb, &pip->addr);
963         if (vnn == NULL) {
964                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965                         ctdb_addr_to_str(&pip->addr)));
966                 return 0;
967         }
968         vnn->pnn = pip->pnn;
969
970         /* stop any previous arps */
971         talloc_free(vnn->takeover_ctx);
972         vnn->takeover_ctx = NULL;
973
974         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975          * lazy multicast to drop an IP from any node that isn't the
976          * intended new node.  The following causes makes ctdbd ignore
977          * a release for any address it doesn't host.
978          */
979         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980                 if (!ctdb_sys_have_ip(&pip->addr)) {
981                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982                                 ctdb_addr_to_str(&pip->addr),
983                                 vnn->public_netmask_bits,
984                                 ctdb_vnn_iface_string(vnn)));
985                         ctdb_vnn_unassign_iface(ctdb, vnn);
986                         return 0;
987                 }
988         } else {
989                 if (vnn->iface == NULL) {
990                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991                                            ctdb_addr_to_str(&pip->addr),
992                                            vnn->public_netmask_bits));
993                         return 0;
994                 }
995         }
996
997         /* There is a potential race between take_ip and us because we
998          * update the VNN via a callback that run when the
999          * eventscripts have been run.  Avoid the race by allowing one
1000          * update to be in flight at a time.
1001          */
1002         if (vnn->update_in_flight) {
1003                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004                                     "update for this IP already in flight\n",
1005                                     ctdb_addr_to_str(&vnn->public_address),
1006                                     vnn->public_netmask_bits));
1007                 return -1;
1008         }
1009
1010         iface = strdup(ctdb_vnn_iface_string(vnn));
1011
1012         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1013                 ctdb_addr_to_str(&pip->addr),
1014                 vnn->public_netmask_bits,
1015                 iface,
1016                 pip->pnn));
1017
1018         state = talloc(ctdb, struct takeover_callback_state);
1019         if (state == NULL) {
1020                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021                                __FILE__, __LINE__);
1022                 free(iface);
1023                 return -1;
1024         }
1025
1026         state->c = talloc_steal(state, c);
1027         state->addr = talloc(state, ctdb_sock_addr);       
1028         if (state->addr == NULL) {
1029                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030                                __FILE__, __LINE__);
1031                 free(iface);
1032                 talloc_free(state);
1033                 return -1;
1034         }
1035         *state->addr = pip->addr;
1036         state->vnn   = vnn;
1037
1038         vnn->update_in_flight = true;
1039         talloc_set_destructor(state, ctdb_releaseip_destructor);
1040
1041         ret = ctdb_event_script_callback(ctdb, 
1042                                          state, release_ip_callback, state,
1043                                          CTDB_EVENT_RELEASE_IP,
1044                                          "%s %s %u",
1045                                          iface,
1046                                          ctdb_addr_to_str(&pip->addr),
1047                                          vnn->public_netmask_bits);
1048         free(iface);
1049         if (ret != 0) {
1050                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051                         ctdb_addr_to_str(&pip->addr),
1052                         ctdb_vnn_iface_string(vnn)));
1053                 talloc_free(state);
1054                 return -1;
1055         }
1056
1057         /* tell the control that we will be reply asynchronously */
1058         *async_reply = true;
1059         return 0;
1060 }
1061
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063                                    ctdb_sock_addr *addr,
1064                                    unsigned mask, const char *ifaces,
1065                                    bool check_address)
1066 {
1067         struct ctdb_vnn      *vnn;
1068         uint32_t num = 0;
1069         char *tmp;
1070         const char *iface;
1071         int i;
1072         int ret;
1073
1074         tmp = strdup(ifaces);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 if (!ctdb_sys_check_iface_exists(iface)) {
1077                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1078                         free(tmp);
1079                         return -1;
1080                 }
1081         }
1082         free(tmp);
1083
1084         /* Verify that we don't have an entry for this ip yet */
1085         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1088                                 ctdb_addr_to_str(addr)));
1089                         return -1;
1090                 }               
1091         }
1092
1093         /* create a new vnn structure for this ip address */
1094         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097         tmp = talloc_strdup(vnn, ifaces);
1098         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1104                 num++;
1105         }
1106         talloc_free(tmp);
1107         vnn->ifaces[num] = NULL;
1108         vnn->public_address      = *addr;
1109         vnn->public_netmask_bits = mask;
1110         vnn->pnn                 = -1;
1111         if (check_address) {
1112                 if (ctdb_sys_have_ip(addr)) {
1113                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114                         vnn->pnn = ctdb->pnn;
1115                 }
1116         }
1117
1118         for (i=0; vnn->ifaces[i]; i++) {
1119                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1120                 if (ret != 0) {
1121                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122                                            "for public_address[%s]\n",
1123                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1124                         talloc_free(vnn);
1125                         return -1;
1126                 }
1127         }
1128
1129         DLIST_ADD(ctdb->vnn, vnn);
1130
1131         return 0;
1132 }
1133
1134 /*
1135   setup the public address lists from a file
1136 */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1138 {
1139         char **lines;
1140         int nlines;
1141         int i;
1142
1143         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144         if (lines == NULL) {
1145                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1146                 return -1;
1147         }
1148         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1149                 nlines--;
1150         }
1151
1152         for (i=0;i<nlines;i++) {
1153                 unsigned mask;
1154                 ctdb_sock_addr addr;
1155                 const char *addrstr;
1156                 const char *ifaces;
1157                 char *tok, *line;
1158
1159                 line = lines[i];
1160                 while ((*line == ' ') || (*line == '\t')) {
1161                         line++;
1162                 }
1163                 if (*line == '#') {
1164                         continue;
1165                 }
1166                 if (strcmp(line, "") == 0) {
1167                         continue;
1168                 }
1169                 tok = strtok(line, " \t");
1170                 addrstr = tok;
1171                 tok = strtok(NULL, " \t");
1172                 if (tok == NULL) {
1173                         if (NULL == ctdb->default_public_interface) {
1174                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1175                                          i+1));
1176                                 talloc_free(lines);
1177                                 return -1;
1178                         }
1179                         ifaces = ctdb->default_public_interface;
1180                 } else {
1181                         ifaces = tok;
1182                 }
1183
1184                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1186                         talloc_free(lines);
1187                         return -1;
1188                 }
1189                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1191                         talloc_free(lines);
1192                         return -1;
1193                 }
1194         }
1195
1196
1197         talloc_free(lines);
1198         return 0;
1199 }
1200
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1202                               const char *iface,
1203                               const char *ip)
1204 {
1205         struct ctdb_vnn *svnn;
1206         struct ctdb_interface *cur = NULL;
1207         bool ok;
1208         int ret;
1209
1210         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211         CTDB_NO_MEMORY(ctdb, svnn);
1212
1213         svnn->ifaces = talloc_array(svnn, const char *, 2);
1214         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217         svnn->ifaces[1] = NULL;
1218
1219         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1220         if (!ok) {
1221                 talloc_free(svnn);
1222                 return -1;
1223         }
1224
1225         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1226         if (ret != 0) {
1227                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228                                    "for single_ip[%s]\n",
1229                                    svnn->ifaces[0],
1230                                    ctdb_addr_to_str(&svnn->public_address)));
1231                 talloc_free(svnn);
1232                 return -1;
1233         }
1234
1235         /* assume the single public ip interface is initially "good" */
1236         cur = ctdb_find_iface(ctdb, iface);
1237         if (cur == NULL) {
1238                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1239                 return -1;
1240         }
1241         cur->link_up = true;
1242
1243         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1244         if (ret != 0) {
1245                 talloc_free(svnn);
1246                 return -1;
1247         }
1248
1249         ctdb->single_ip_vnn = svnn;
1250         return 0;
1251 }
1252
1253 struct public_ip_list {
1254         struct public_ip_list *next;
1255         uint32_t pnn;
1256         ctdb_sock_addr addr;
1257 };
1258
1259 /* Given a physical node, return the number of
1260    public addresses that is currently assigned to this node.
1261 */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1263 {
1264         int num=0;
1265
1266         for (;ips;ips=ips->next) {
1267                 if (ips->pnn == pnn) {
1268                         num++;
1269                 }
1270         }
1271         return num;
1272 }
1273
1274
1275 /* Can the given node host the given IP: is the public IP known to the
1276  * node and is NOIPHOST unset?
1277 */
1278 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1279                              struct ctdb_ipflags ipflags,
1280                              struct public_ip_list *ip)
1281 {
1282         struct ctdb_public_ip_list_old *public_ips;
1283         int i;
1284
1285         if (ipflags.noiphost) {
1286                 return false;
1287         }
1288
1289         public_ips = ctdb->ipalloc_state->available_public_ips[pnn];
1290
1291         if (public_ips == NULL) {
1292                 return false;
1293         }
1294
1295         for (i=0; i<public_ips->num; i++) {
1296                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1297                         /* yes, this node can serve this public ip */
1298                         return true;
1299                 }
1300         }
1301
1302         return false;
1303 }
1304
1305 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1306                                  struct ctdb_ipflags ipflags,
1307                                  struct public_ip_list *ip)
1308 {
1309         if (ipflags.noiptakeover) {
1310                 return false;
1311         }
1312
1313         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1314 }
1315
1316 /* search the node lists list for a node to takeover this ip.
1317    pick the node that currently are serving the least number of ips
1318    so that the ips get spread out evenly.
1319 */
1320 static int find_takeover_node(struct ctdb_context *ctdb,
1321                               struct ctdb_ipflags *ipflags,
1322                               struct public_ip_list *ip,
1323                               struct public_ip_list *all_ips)
1324 {
1325         int pnn, min=0, num;
1326         int i, numnodes;
1327
1328         numnodes = talloc_array_length(ipflags);
1329         pnn    = -1;
1330         for (i=0; i<numnodes; i++) {
1331                 /* verify that this node can serve this ip */
1332                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1333                         /* no it couldnt   so skip to the next node */
1334                         continue;
1335                 }
1336
1337                 num = node_ip_coverage(i, all_ips);
1338                 /* was this the first node we checked ? */
1339                 if (pnn == -1) {
1340                         pnn = i;
1341                         min  = num;
1342                 } else {
1343                         if (num < min) {
1344                                 pnn = i;
1345                                 min  = num;
1346                         }
1347                 }
1348         }       
1349         if (pnn == -1) {
1350                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1351                         ctdb_addr_to_str(&ip->addr)));
1352
1353                 return -1;
1354         }
1355
1356         ip->pnn = pnn;
1357         return 0;
1358 }
1359
1360 #define IP_KEYLEN       4
1361 static uint32_t *ip_key(ctdb_sock_addr *ip)
1362 {
1363         static uint32_t key[IP_KEYLEN];
1364
1365         bzero(key, sizeof(key));
1366
1367         switch (ip->sa.sa_family) {
1368         case AF_INET:
1369                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1370                 break;
1371         case AF_INET6: {
1372                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1373                 key[0]  = htonl(s6_a32[0]);
1374                 key[1]  = htonl(s6_a32[1]);
1375                 key[2]  = htonl(s6_a32[2]);
1376                 key[3]  = htonl(s6_a32[3]);
1377                 break;
1378         }
1379         default:
1380                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1381                 return key;
1382         }
1383
1384         return key;
1385 }
1386
1387 static void *add_ip_callback(void *parm, void *data)
1388 {
1389         struct public_ip_list *this_ip = parm;
1390         struct public_ip_list *prev_ip = data;
1391
1392         if (prev_ip == NULL) {
1393                 return parm;
1394         }
1395         if (this_ip->pnn == -1) {
1396                 this_ip->pnn = prev_ip->pnn;
1397         }
1398
1399         return parm;
1400 }
1401
1402 static int getips_count_callback(void *param, void *data)
1403 {
1404         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1405         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1406
1407         new_ip->next = *ip_list;
1408         *ip_list     = new_ip;
1409         return 0;
1410 }
1411
1412 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1413                                        struct ctdb_public_ip_list_old *ips,
1414                                        uint32_t pnn);
1415
1416 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1417                                          struct ipalloc_state *ipalloc_state,
1418                                          struct ctdb_node_map_old *nodemap)
1419 {
1420         int j;
1421         int ret;
1422
1423         if (ipalloc_state->num != nodemap->num) {
1424                 DEBUG(DEBUG_ERR,
1425                       (__location__
1426                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1427                        ipalloc_state->num, nodemap->num));
1428                 return -1;
1429         }
1430
1431         for (j=0; j<nodemap->num; j++) {
1432                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1433                         continue;
1434                 }
1435
1436                 /* Retrieve the list of known public IPs from the node */
1437                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1438                                         TAKEOVER_TIMEOUT(),
1439                                         j,
1440                                         ctdb->nodes,
1441                                         0,
1442                                         &ipalloc_state->known_public_ips[j]);
1443                 if (ret != 0) {
1444                         DEBUG(DEBUG_ERR,
1445                               ("Failed to read known public IPs from node: %u\n",
1446                                j));
1447                         return -1;
1448                 }
1449
1450                 if (ctdb->do_checkpublicip) {
1451                         verify_remote_ip_allocation(ctdb,
1452                                                     ipalloc_state->known_public_ips[j],
1453                                                     j);
1454                 }
1455
1456                 /* Retrieve the list of available public IPs from the node */
1457                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1458                                         TAKEOVER_TIMEOUT(),
1459                                         j,
1460                                         ctdb->nodes,
1461                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1462                                         &ipalloc_state->available_public_ips[j]);
1463                 if (ret != 0) {
1464                         DEBUG(DEBUG_ERR,
1465                               ("Failed to read available public IPs from node: %u\n",
1466                                j));
1467                         return -1;
1468                 }
1469         }
1470
1471         return 0;
1472 }
1473
1474 static struct public_ip_list *
1475 create_merged_ip_list(struct ctdb_context *ctdb)
1476 {
1477         int i, j;
1478         struct public_ip_list *ip_list;
1479         struct ctdb_public_ip_list_old *public_ips;
1480
1481         if (ctdb->ip_tree != NULL) {
1482                 talloc_free(ctdb->ip_tree);
1483                 ctdb->ip_tree = NULL;
1484         }
1485         ctdb->ip_tree = trbt_create(ctdb, 0);
1486
1487         for (i=0;i<ctdb->num_nodes;i++) {
1488                 public_ips = ctdb->ipalloc_state->known_public_ips[i];
1489
1490                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1491                         continue;
1492                 }
1493
1494                 /* there were no public ips for this node */
1495                 if (public_ips == NULL) {
1496                         continue;
1497                 }               
1498
1499                 for (j=0;j<public_ips->num;j++) {
1500                         struct public_ip_list *tmp_ip;
1501
1502                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1503                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1504                         /* Do not use information about IP addresses hosted
1505                          * on other nodes, it may not be accurate */
1506                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1507                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1508                         } else {
1509                                 tmp_ip->pnn = -1;
1510                         }
1511                         tmp_ip->addr = public_ips->ips[j].addr;
1512                         tmp_ip->next = NULL;
1513
1514                         trbt_insertarray32_callback(ctdb->ip_tree,
1515                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1516                                 add_ip_callback,
1517                                 tmp_ip);
1518                 }
1519         }
1520
1521         ip_list = NULL;
1522         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1523
1524         return ip_list;
1525 }
1526
1527 /* 
1528  * This is the length of the longtest common prefix between the IPs.
1529  * It is calculated by XOR-ing the 2 IPs together and counting the
1530  * number of leading zeroes.  The implementation means that all
1531  * addresses end up being 128 bits long.
1532  *
1533  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1534  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1535  * lots of nodes and IP addresses?
1536  */
1537 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1538 {
1539         uint32_t ip1_k[IP_KEYLEN];
1540         uint32_t *t;
1541         int i;
1542         uint32_t x;
1543
1544         uint32_t distance = 0;
1545
1546         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1547         t = ip_key(ip2);
1548         for (i=0; i<IP_KEYLEN; i++) {
1549                 x = ip1_k[i] ^ t[i];
1550                 if (x == 0) {
1551                         distance += 32;
1552                 } else {
1553                         /* Count number of leading zeroes. 
1554                          * FIXME? This could be optimised...
1555                          */
1556                         while ((x & (1 << 31)) == 0) {
1557                                 x <<= 1;
1558                                 distance += 1;
1559                         }
1560                 }
1561         }
1562
1563         return distance;
1564 }
1565
1566 /* Calculate the IP distance for the given IP relative to IPs on the
1567    given node.  The ips argument is generally the all_ips variable
1568    used in the main part of the algorithm.
1569  */
1570 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1571                                   struct public_ip_list *ips,
1572                                   int pnn)
1573 {
1574         struct public_ip_list *t;
1575         uint32_t d;
1576
1577         uint32_t sum = 0;
1578
1579         for (t=ips; t != NULL; t=t->next) {
1580                 if (t->pnn != pnn) {
1581                         continue;
1582                 }
1583
1584                 /* Optimisation: We never calculate the distance
1585                  * between an address and itself.  This allows us to
1586                  * calculate the effect of removing an address from a
1587                  * node by simply calculating the distance between
1588                  * that address and all of the exitsing addresses.
1589                  * Moreover, we assume that we're only ever dealing
1590                  * with addresses from all_ips so we can identify an
1591                  * address via a pointer rather than doing a more
1592                  * expensive address comparison. */
1593                 if (&(t->addr) == ip) {
1594                         continue;
1595                 }
1596
1597                 d = ip_distance(ip, &(t->addr));
1598                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1599         }
1600
1601         return sum;
1602 }
1603
1604 /* Return the LCP2 imbalance metric for addresses currently assigned
1605    to the given node.
1606  */
1607 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1608 {
1609         struct public_ip_list *t;
1610
1611         uint32_t imbalance = 0;
1612
1613         for (t=all_ips; t!=NULL; t=t->next) {
1614                 if (t->pnn != pnn) {
1615                         continue;
1616                 }
1617                 /* Pass the rest of the IPs rather than the whole
1618                    all_ips input list.
1619                 */
1620                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1621         }
1622
1623         return imbalance;
1624 }
1625
1626 /* Allocate any unassigned IPs just by looping through the IPs and
1627  * finding the best node for each.
1628  */
1629 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1630                                       struct ctdb_ipflags *ipflags,
1631                                       struct public_ip_list *all_ips)
1632 {
1633         struct public_ip_list *tmp_ip;
1634
1635         /* loop over all ip's and find a physical node to cover for 
1636            each unassigned ip.
1637         */
1638         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1639                 if (tmp_ip->pnn == -1) {
1640                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1641                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1642                                         ctdb_addr_to_str(&tmp_ip->addr)));
1643                         }
1644                 }
1645         }
1646 }
1647
1648 /* Basic non-deterministic rebalancing algorithm.
1649  */
1650 static void basic_failback(struct ctdb_context *ctdb,
1651                            struct ctdb_ipflags *ipflags,
1652                            struct public_ip_list *all_ips,
1653                            int num_ips)
1654 {
1655         int i, numnodes;
1656         int maxnode, maxnum, minnode, minnum, num, retries;
1657         struct public_ip_list *tmp_ip;
1658
1659         numnodes = talloc_array_length(ipflags);
1660         retries = 0;
1661
1662 try_again:
1663         maxnum=0;
1664         minnum=0;
1665
1666         /* for each ip address, loop over all nodes that can serve
1667            this ip and make sure that the difference between the node
1668            serving the most and the node serving the least ip's are
1669            not greater than 1.
1670         */
1671         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1672                 if (tmp_ip->pnn == -1) {
1673                         continue;
1674                 }
1675
1676                 /* Get the highest and lowest number of ips's served by any 
1677                    valid node which can serve this ip.
1678                 */
1679                 maxnode = -1;
1680                 minnode = -1;
1681                 for (i=0; i<numnodes; i++) {
1682                         /* only check nodes that can actually serve this ip */
1683                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1684                                 /* no it couldnt   so skip to the next node */
1685                                 continue;
1686                         }
1687
1688                         num = node_ip_coverage(i, all_ips);
1689                         if (maxnode == -1) {
1690                                 maxnode = i;
1691                                 maxnum  = num;
1692                         } else {
1693                                 if (num > maxnum) {
1694                                         maxnode = i;
1695                                         maxnum  = num;
1696                                 }
1697                         }
1698                         if (minnode == -1) {
1699                                 minnode = i;
1700                                 minnum  = num;
1701                         } else {
1702                                 if (num < minnum) {
1703                                         minnode = i;
1704                                         minnum  = num;
1705                                 }
1706                         }
1707                 }
1708                 if (maxnode == -1) {
1709                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1710                                 ctdb_addr_to_str(&tmp_ip->addr)));
1711
1712                         continue;
1713                 }
1714
1715                 /* if the spread between the smallest and largest coverage by
1716                    a node is >=2 we steal one of the ips from the node with
1717                    most coverage to even things out a bit.
1718                    try to do this a limited number of times since we dont
1719                    want to spend too much time balancing the ip coverage.
1720                 */
1721                 if ( (maxnum > minnum+1)
1722                      && (retries < (num_ips + 5)) ){
1723                         struct public_ip_list *tmp;
1724
1725                         /* Reassign one of maxnode's VNNs */
1726                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1727                                 if (tmp->pnn == maxnode) {
1728                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1729                                         retries++;
1730                                         goto try_again;;
1731                                 }
1732                         }
1733                 }
1734         }
1735 }
1736
1737 static void lcp2_init(struct ctdb_context *tmp_ctx,
1738                       struct ctdb_ipflags *ipflags,
1739                       struct public_ip_list *all_ips,
1740                       uint32_t *force_rebalance_nodes,
1741                       uint32_t **lcp2_imbalances,
1742                       bool **rebalance_candidates)
1743 {
1744         int i, numnodes;
1745         struct public_ip_list *tmp_ip;
1746
1747         numnodes = talloc_array_length(ipflags);
1748
1749         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1750         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1751         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1752         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1753
1754         for (i=0; i<numnodes; i++) {
1755                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1756                 /* First step: assume all nodes are candidates */
1757                 (*rebalance_candidates)[i] = true;
1758         }
1759
1760         /* 2nd step: if a node has IPs assigned then it must have been
1761          * healthy before, so we remove it from consideration.  This
1762          * is overkill but is all we have because we don't maintain
1763          * state between takeover runs.  An alternative would be to
1764          * keep state and invalidate it every time the recovery master
1765          * changes.
1766          */
1767         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1768                 if (tmp_ip->pnn != -1) {
1769                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1770                 }
1771         }
1772
1773         /* 3rd step: if a node is forced to re-balance then
1774            we allow failback onto the node */
1775         if (force_rebalance_nodes == NULL) {
1776                 return;
1777         }
1778         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1779                 uint32_t pnn = force_rebalance_nodes[i];
1780                 if (pnn >= numnodes) {
1781                         DEBUG(DEBUG_ERR,
1782                               (__location__ "unknown node %u\n", pnn));
1783                         continue;
1784                 }
1785
1786                 DEBUG(DEBUG_NOTICE,
1787                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1788                 (*rebalance_candidates)[pnn] = true;
1789         }
1790 }
1791
1792 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1793  * the IP/node combination that will cost the least.
1794  */
1795 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1796                                      struct ctdb_ipflags *ipflags,
1797                                      struct public_ip_list *all_ips,
1798                                      uint32_t *lcp2_imbalances)
1799 {
1800         struct public_ip_list *tmp_ip;
1801         int dstnode, numnodes;
1802
1803         int minnode;
1804         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1805         struct public_ip_list *minip;
1806
1807         bool should_loop = true;
1808         bool have_unassigned = true;
1809
1810         numnodes = talloc_array_length(ipflags);
1811
1812         while (have_unassigned && should_loop) {
1813                 should_loop = false;
1814
1815                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1816                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1817
1818                 minnode = -1;
1819                 mindsum = 0;
1820                 minip = NULL;
1821
1822                 /* loop over each unassigned ip. */
1823                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1824                         if (tmp_ip->pnn != -1) {
1825                                 continue;
1826                         }
1827
1828                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1829                                 /* only check nodes that can actually takeover this ip */
1830                                 if (!can_node_takeover_ip(ctdb, dstnode,
1831                                                           ipflags[dstnode],
1832                                                           tmp_ip)) {
1833                                         /* no it couldnt   so skip to the next node */
1834                                         continue;
1835                                 }
1836
1837                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1838                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1839                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1840                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1841                                                    dstnode,
1842                                                    dstimbl - lcp2_imbalances[dstnode]));
1843
1844
1845                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1846                                         minnode = dstnode;
1847                                         minimbl = dstimbl;
1848                                         mindsum = dstdsum;
1849                                         minip = tmp_ip;
1850                                         should_loop = true;
1851                                 }
1852                         }
1853                 }
1854
1855                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1856
1857                 /* If we found one then assign it to the given node. */
1858                 if (minnode != -1) {
1859                         minip->pnn = minnode;
1860                         lcp2_imbalances[minnode] = minimbl;
1861                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1862                                           ctdb_addr_to_str(&(minip->addr)),
1863                                           minnode,
1864                                           mindsum));
1865                 }
1866
1867                 /* There might be a better way but at least this is clear. */
1868                 have_unassigned = false;
1869                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1870                         if (tmp_ip->pnn == -1) {
1871                                 have_unassigned = true;
1872                         }
1873                 }
1874         }
1875
1876         /* We know if we have an unassigned addresses so we might as
1877          * well optimise.
1878          */
1879         if (have_unassigned) {
1880                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1881                         if (tmp_ip->pnn == -1) {
1882                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1883                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1884                         }
1885                 }
1886         }
1887 }
1888
1889 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1890  * to move IPs from, determines the best IP/destination node
1891  * combination to move from the source node.
1892  */
1893 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1894                                     struct ctdb_ipflags *ipflags,
1895                                     struct public_ip_list *all_ips,
1896                                     int srcnode,
1897                                     uint32_t *lcp2_imbalances,
1898                                     bool *rebalance_candidates)
1899 {
1900         int dstnode, mindstnode, numnodes;
1901         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1902         uint32_t minsrcimbl, mindstimbl;
1903         struct public_ip_list *minip;
1904         struct public_ip_list *tmp_ip;
1905
1906         /* Find an IP and destination node that best reduces imbalance. */
1907         srcimbl = 0;
1908         minip = NULL;
1909         minsrcimbl = 0;
1910         mindstnode = -1;
1911         mindstimbl = 0;
1912
1913         numnodes = talloc_array_length(ipflags);
1914
1915         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1916         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1917                            srcnode, lcp2_imbalances[srcnode]));
1918
1919         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1920                 /* Only consider addresses on srcnode. */
1921                 if (tmp_ip->pnn != srcnode) {
1922                         continue;
1923                 }
1924
1925                 /* What is this IP address costing the source node? */
1926                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1927                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1928
1929                 /* Consider this IP address would cost each potential
1930                  * destination node.  Destination nodes are limited to
1931                  * those that are newly healthy, since we don't want
1932                  * to do gratuitous failover of IPs just to make minor
1933                  * balance improvements.
1934                  */
1935                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1936                         if (!rebalance_candidates[dstnode]) {
1937                                 continue;
1938                         }
1939
1940                         /* only check nodes that can actually takeover this ip */
1941                         if (!can_node_takeover_ip(ctdb, dstnode,
1942                                                   ipflags[dstnode], tmp_ip)) {
1943                                 /* no it couldnt   so skip to the next node */
1944                                 continue;
1945                         }
1946
1947                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1948                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1949                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1950                                            srcnode, -srcdsum,
1951                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1952                                            dstnode, dstdsum));
1953
1954                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1955                             (dstdsum < srcdsum) &&                      \
1956                             ((mindstnode == -1) ||                              \
1957                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1958
1959                                 minip = tmp_ip;
1960                                 minsrcimbl = srcimbl;
1961                                 mindstnode = dstnode;
1962                                 mindstimbl = dstimbl;
1963                         }
1964                 }
1965         }
1966         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1967
1968         if (mindstnode != -1) {
1969                 /* We found a move that makes things better... */
1970                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1971                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1972                                   ctdb_addr_to_str(&(minip->addr)),
1973                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1974
1975
1976                 lcp2_imbalances[srcnode] = minsrcimbl;
1977                 lcp2_imbalances[mindstnode] = mindstimbl;
1978                 minip->pnn = mindstnode;
1979
1980                 return true;
1981         }
1982
1983         return false;
1984         
1985 }
1986
/* A node number together with its LCP2 imbalance, so that nodes can
 * be sorted by imbalance without losing track of which is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison callback for struct lcp2_imbalance_pnn.
 *
 * Orders by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.  The pnn field
 * is not compared.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *left = a;
        const struct lcp2_imbalance_pnn *right = b;

        if (left->imbalance == right->imbalance) {
                return 0;
        }

        return (left->imbalance > right->imbalance) ? -1 : 1;
}
2005
2006 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2007  * node with the highest LCP2 imbalance, and then determines the best
2008  * IP/destination node combination to move from the source node.
2009  */
2010 static void lcp2_failback(struct ctdb_context *ctdb,
2011                           struct ctdb_ipflags *ipflags,
2012                           struct public_ip_list *all_ips,
2013                           uint32_t *lcp2_imbalances,
2014                           bool *rebalance_candidates)
2015 {
2016         int i, numnodes;
2017         struct lcp2_imbalance_pnn * lips;
2018         bool again;
2019
2020         numnodes = talloc_array_length(ipflags);
2021
2022 try_again:
2023         /* Put the imbalances and nodes into an array, sort them and
2024          * iterate through candidates.  Usually the 1st one will be
2025          * used, so this doesn't cost much...
2026          */
2027         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2028         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2029         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2030         for (i=0; i<numnodes; i++) {
2031                 lips[i].imbalance = lcp2_imbalances[i];
2032                 lips[i].pnn = i;
2033                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2034         }
2035         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2036               lcp2_cmp_imbalance_pnn);
2037
2038         again = false;
2039         for (i=0; i<numnodes; i++) {
2040                 /* This means that all nodes had 0 or 1 addresses, so
2041                  * can't be imbalanced.
2042                  */
2043                 if (lips[i].imbalance == 0) {
2044                         break;
2045                 }
2046
2047                 if (lcp2_failback_candidate(ctdb,
2048                                             ipflags,
2049                                             all_ips,
2050                                             lips[i].pnn,
2051                                             lcp2_imbalances,
2052                                             rebalance_candidates)) {
2053                         again = true;
2054                         break;
2055                 }
2056         }
2057
2058         talloc_free(lips);
2059         if (again) {
2060                 goto try_again;
2061         }
2062 }
2063
2064 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2065                                     struct ctdb_ipflags *ipflags,
2066                                     struct public_ip_list *all_ips)
2067 {
2068         struct public_ip_list *tmp_ip;
2069
2070         /* verify that the assigned nodes can serve that public ip
2071            and set it to -1 if not
2072         */
2073         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2074                 if (tmp_ip->pnn == -1) {
2075                         continue;
2076                 }
2077                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2078                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2079                         /* this node can not serve this ip. */
2080                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2081                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2082                                            tmp_ip->pnn));
2083                         tmp_ip->pnn = -1;
2084                 }
2085         }
2086 }
2087
2088 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2089                                        struct ctdb_ipflags *ipflags,
2090                                        struct public_ip_list *all_ips)
2091 {
2092         struct public_ip_list *tmp_ip;
2093         int i, numnodes;
2094
2095         numnodes = talloc_array_length(ipflags);
2096
2097         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2098        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2099         *  always be allocated the same way for a specific set of
2100         *  available/unavailable nodes.
2101         */
2102
2103         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2104                 tmp_ip->pnn = i % numnodes;
2105         }
2106
2107         /* IP failback doesn't make sense with deterministic
2108          * IPs, since the modulo step above implicitly fails
2109          * back IPs to their "home" node.
2110          */
2111         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2112                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2113         }
2114
2115         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2116
2117         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2118
2119         /* No failback here! */
2120 }
2121
2122 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2123                                           struct ctdb_ipflags *ipflags,
2124                                           struct public_ip_list *all_ips)
2125 {
2126         /* This should be pushed down into basic_failback. */
2127         struct public_ip_list *tmp_ip;
2128         int num_ips = 0;
2129         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2130                 num_ips++;
2131         }
2132
2133         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2134
2135         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2136
2137         /* If we don't want IPs to fail back then don't rebalance IPs. */
2138         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2139                 return;
2140         }
2141
2142         /* Now, try to make sure the ip adresses are evenly distributed
2143            across the nodes.
2144         */
2145         basic_failback(ctdb, ipflags, all_ips, num_ips);
2146 }
2147
2148 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2149                           struct ctdb_ipflags *ipflags,
2150                           struct public_ip_list *all_ips,
2151                           uint32_t *force_rebalance_nodes)
2152 {
2153         uint32_t *lcp2_imbalances;
2154         bool *rebalance_candidates;
2155         int numnodes, num_rebalance_candidates, i;
2156
2157         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2158
2159         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2160
2161         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2162                   &lcp2_imbalances, &rebalance_candidates);
2163
2164         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2165
2166         /* If we don't want IPs to fail back then don't rebalance IPs. */
2167         if (1 == ctdb->ipalloc_state->no_ip_failback) {
2168                 goto finished;
2169         }
2170
2171         /* It is only worth continuing if we have suitable target
2172          * nodes to transfer IPs to.  This check is much cheaper than
2173          * continuing on...
2174          */
2175         numnodes = talloc_array_length(ipflags);
2176         num_rebalance_candidates = 0;
2177         for (i=0; i<numnodes; i++) {
2178                 if (rebalance_candidates[i]) {
2179                         num_rebalance_candidates++;
2180                 }
2181         }
2182         if (num_rebalance_candidates == 0) {
2183                 goto finished;
2184         }
2185
2186         /* Now, try to make sure the ip adresses are evenly distributed
2187            across the nodes.
2188         */
2189         lcp2_failback(ctdb, ipflags, all_ips,
2190                       lcp2_imbalances, rebalance_candidates);
2191
2192 finished:
2193         talloc_free(tmp_ctx);
2194 }
2195
2196 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2197 {
2198         int i;
2199
2200         for (i=0;i<nodemap->num;i++) {
2201                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2202                         /* Found one completely healthy node */
2203                         return false;
2204                 }
2205         }
2206
2207         return true;
2208 }
2209
2210 /* The calculation part of the IP allocation algorithm. */
2211 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2212                                    struct ctdb_ipflags *ipflags,
2213                                    struct public_ip_list **all_ips_p,
2214                                    uint32_t *force_rebalance_nodes)
2215 {
2216         /* since nodes only know about those public addresses that
2217            can be served by that particular node, no single node has
2218            a full list of all public addresses that exist in the cluster.
2219            Walk over all node structures and create a merged list of
2220            all public addresses that exist in the cluster.
2221
2222            keep the tree of ips around as ctdb->ip_tree
2223         */
2224         *all_ips_p = create_merged_ip_list(ctdb);
2225
2226         switch (ctdb->ipalloc_state->algorithm) {
2227         case IPALLOC_LCP2:
2228                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2229                 break;
2230         case IPALLOC_DETERMINISTIC:
2231                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2232                 break;
2233         case IPALLOC_NONDETERMINISTIC:
2234                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2235                break;
2236         }
2237
2238         /* at this point ->pnn is the node which will own each IP
2239            or -1 if there is no node that can cover this ip
2240         */
2241
2242         return;
2243 }
2244
/* State shared between get_tunable_from_nodes() and the callbacks it
 * registers for the per-node tunable query.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being fetched; used in log messages */
        const char *tunable;
        /* talloc array of results; a node's value is stored at the
         * index given by its pnn
         */
        uint32_t *out;
        /* Set by the callbacks on a wrong-sized reply, a timeout or
         * an unexpected error
         */
        bool fatal;
};
2250
2251 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2252                                  int32_t res, TDB_DATA outdata,
2253                                  void *callback)
2254 {
2255         struct get_tunable_callback_data *cd =
2256                 (struct get_tunable_callback_data *)callback;
2257         int size;
2258
2259         if (res != 0) {
2260                 /* Already handled in fail callback */
2261                 return;
2262         }
2263
2264         if (outdata.dsize != sizeof(uint32_t)) {
2265                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2266                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2267                                  (int)outdata.dsize));
2268                 cd->fatal = true;
2269                 return;
2270         }
2271
2272         size = talloc_array_length(cd->out);
2273         if (pnn >= size) {
2274                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2275                                  cd->tunable, pnn, size));
2276                 return;
2277         }
2278
2279                 
2280         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2281 }
2282
2283 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2284                                        int32_t res, TDB_DATA outdata,
2285                                        void *callback)
2286 {
2287         struct get_tunable_callback_data *cd =
2288                 (struct get_tunable_callback_data *)callback;
2289
2290         switch (res) {
2291         case -ETIME:
2292                 DEBUG(DEBUG_ERR,
2293                       ("Timed out getting tunable \"%s\" from node %d\n",
2294                        cd->tunable, pnn));
2295                 cd->fatal = true;
2296                 break;
2297         case -EINVAL:
2298         case -1:
2299                 DEBUG(DEBUG_WARNING,
2300                       ("Tunable \"%s\" not implemented on node %d\n",
2301                        cd->tunable, pnn));
2302                 break;
2303         default:
2304                 DEBUG(DEBUG_ERR,
2305                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2306                        cd->tunable, pnn));
2307                 cd->fatal = true;
2308         }
2309 }
2310
2311 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2312                                         TALLOC_CTX *tmp_ctx,
2313                                         struct ctdb_node_map_old *nodemap,
2314                                         const char *tunable,
2315                                         uint32_t default_value)
2316 {
2317         TDB_DATA data;
2318         struct ctdb_control_get_tunable *t;
2319         uint32_t *nodes;
2320         uint32_t *tvals;
2321         struct get_tunable_callback_data callback_data;
2322         int i;
2323
2324         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2325         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2326         for (i=0; i<nodemap->num; i++) {
2327                 tvals[i] = default_value;
2328         }
2329                 
2330         callback_data.out = tvals;
2331         callback_data.tunable = tunable;
2332         callback_data.fatal = false;
2333
2334         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2335         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2336         t = (struct ctdb_control_get_tunable *)data.dptr;
2337         t->length = strlen(tunable)+1;
2338         memcpy(t->name, tunable, t->length);
2339         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2340         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2341                                       nodes, 0, TAKEOVER_TIMEOUT(),
2342                                       false, data,
2343                                       get_tunable_callback,
2344                                       get_tunable_fail_callback,
2345                                       &callback_data) != 0) {
2346                 if (callback_data.fatal) {
2347                         talloc_free(tvals);
2348                         tvals = NULL;
2349                 }
2350         }
2351         talloc_free(nodes);
2352         talloc_free(data.dptr);
2353
2354         return tvals;
2355 }
2356
2357 /* Set internal flags for IP allocation:
2358  *   Clear ip flags
2359  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2360  *   Set NOIPHOST ip flag for each INACTIVE node
2361  *   if all nodes are disabled:
2362  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2363  *   else
2364  *     Set NOIPHOST ip flags for disabled nodes
2365  */
2366 static struct ctdb_ipflags *
2367 set_ipflags_internal(struct ctdb_context *ctdb,
2368                      TALLOC_CTX *tmp_ctx,
2369                      struct ctdb_node_map_old *nodemap,
2370                      uint32_t *tval_noiptakeover,
2371                      uint32_t *tval_noiphostonalldisabled)
2372 {
2373         int i;
2374         struct ctdb_ipflags *ipflags;
2375
2376         /* Clear IP flags - implicit due to talloc_zero */
2377         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2378         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2379
2380         for (i=0;i<nodemap->num;i++) {
2381                 /* Can not take IPs on node with NoIPTakeover set */
2382                 if (tval_noiptakeover[i] != 0) {
2383                         ipflags[i].noiptakeover = true;
2384                 }
2385
2386                 /* Can not host IPs on INACTIVE node */
2387                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2388                         ipflags[i].noiphost = true;
2389                 }
2390         }
2391
2392         if (all_nodes_are_disabled(nodemap)) {
2393                 /* If all nodes are disabled, can not host IPs on node
2394                  * with NoIPHostOnAllDisabled set
2395                  */
2396                 for (i=0;i<nodemap->num;i++) {
2397                         if (tval_noiphostonalldisabled[i] != 0) {
2398                                 ipflags[i].noiphost = true;
2399                         }
2400                 }
2401         } else {
2402                 /* If some nodes are not disabled, then can not host
2403                  * IPs on DISABLED node
2404                  */
2405                 for (i=0;i<nodemap->num;i++) {
2406                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2407                                 ipflags[i].noiphost = true;
2408                         }
2409                 }
2410         }
2411
2412         return ipflags;
2413 }
2414
2415 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2416                                         TALLOC_CTX *tmp_ctx,
2417                                         struct ctdb_node_map_old *nodemap)
2418 {
2419         uint32_t *tval_noiptakeover;
2420         uint32_t *tval_noiphostonalldisabled;
2421         struct ctdb_ipflags *ipflags;
2422
2423
2424         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2425                                                    "NoIPTakeover", 0);
2426         if (tval_noiptakeover == NULL) {
2427                 return NULL;
2428         }
2429
2430         tval_noiphostonalldisabled =
2431                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2432                                        "NoIPHostOnAllDisabled", 0);
2433         if (tval_noiphostonalldisabled == NULL) {
2434                 /* Caller frees tmp_ctx */
2435                 return NULL;
2436         }
2437
2438         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2439                                        tval_noiptakeover,
2440                                        tval_noiphostonalldisabled);
2441
2442         talloc_free(tval_noiptakeover);
2443         talloc_free(tval_noiphostonalldisabled);
2444
2445         return ipflags;
2446 }
2447
2448 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2449                                                  TALLOC_CTX *mem_ctx)
2450 {
2451         struct ipalloc_state *ipalloc_state =
2452                 talloc_zero(mem_ctx, struct ipalloc_state);
2453         if (ipalloc_state == NULL) {
2454                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2455                 return NULL;
2456         }
2457
2458         ipalloc_state->num = ctdb->num_nodes;
2459         ipalloc_state->known_public_ips =
2460                 talloc_zero_array(ipalloc_state,
2461                                   struct ctdb_public_ip_list_old *,
2462                                   ipalloc_state->num);
2463         if (ipalloc_state->known_public_ips == NULL) {
2464                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2465                 talloc_free(ipalloc_state);
2466                 return NULL;
2467         }
2468         ipalloc_state->available_public_ips =
2469                 talloc_zero_array(ipalloc_state,
2470                                   struct ctdb_public_ip_list_old *,
2471                                   ipalloc_state->num);
2472         if (ipalloc_state->available_public_ips == NULL) {
2473                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2474                 talloc_free(ipalloc_state);
2475                 return NULL;
2476         }
2477
2478         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2479                 ipalloc_state->algorithm = IPALLOC_LCP2;
2480         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2481                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2482         } else {
2483                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2484         }
2485
2486         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2487
2488         return ipalloc_state;
2489 }
2490
2491 struct iprealloc_callback_data {
2492         bool *retry_nodes;
2493         int retry_count;
2494         client_async_callback fail_callback;
2495         void *fail_callback_data;
2496         struct ctdb_node_map_old *nodemap;
2497 };
2498
2499 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2500                                         int32_t res, TDB_DATA outdata,
2501                                         void *callback)
2502 {
2503         int numnodes;
2504         struct iprealloc_callback_data *cd =
2505                 (struct iprealloc_callback_data *)callback;
2506
2507         numnodes = talloc_array_length(cd->retry_nodes);
2508         if (pnn > numnodes) {
2509                 DEBUG(DEBUG_ERR,
2510                       ("ipreallocated failure from node %d, "
2511                        "but only %d nodes in nodemap\n",
2512                        pnn, numnodes));
2513                 return;
2514         }
2515
2516         /* Can't run the "ipreallocated" event on a INACTIVE node */
2517         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2518                 DEBUG(DEBUG_WARNING,
2519                       ("ipreallocated failed on inactive node %d, ignoring\n",
2520                        pnn));
2521                 return;
2522         }
2523
2524         switch (res) {
2525         case -ETIME:
2526                 /* If the control timed out then that's a real error,
2527                  * so call the real fail callback
2528                  */
2529                 if (cd->fail_callback) {
2530                         cd->fail_callback(ctdb, pnn, res, outdata,
2531                                           cd->fail_callback_data);
2532                 } else {
2533                         DEBUG(DEBUG_WARNING,
2534                               ("iprealloc timed out but no callback registered\n"));
2535                 }
2536                 break;
2537         default:
2538                 /* If not a timeout then either the ipreallocated
2539                  * eventscript (or some setup) failed.  This might
2540                  * have failed because the IPREALLOCATED control isn't
2541                  * implemented - right now there is no way of knowing
2542                  * because the error codes are all folded down to -1.
2543                  * Consider retrying using EVENTSCRIPT control...
2544                  */
2545                 DEBUG(DEBUG_WARNING,
2546                       ("ipreallocated failure from node %d, flagging retry\n",
2547                        pnn));
2548                 cd->retry_nodes[pnn] = true;
2549                 cd->retry_count++;
2550         }
2551 }
2552
2553 struct takeover_callback_data {
2554         bool *node_failed;
2555         client_async_callback fail_callback;
2556         void *fail_callback_data;
2557         struct ctdb_node_map_old *nodemap;
2558 };
2559
2560 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2561                                        uint32_t node_pnn, int32_t res,
2562                                        TDB_DATA outdata, void *callback_data)
2563 {
2564         struct takeover_callback_data *cd =
2565                 talloc_get_type_abort(callback_data,
2566                                       struct takeover_callback_data);
2567         int i;
2568
2569         for (i = 0; i < cd->nodemap->num; i++) {
2570                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2571                         break;
2572                 }
2573         }
2574
2575         if (i == cd->nodemap->num) {
2576                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2577                 return;
2578         }
2579
2580         if (!cd->node_failed[i]) {
2581                 cd->node_failed[i] = true;
2582                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2583                                   cd->fail_callback_data);
2584         }
2585 }
2586
2587 /*
2588   make any IP alias changes for public addresses that are necessary 
2589  */
2590 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2591                       uint32_t *force_rebalance_nodes,
2592                       client_async_callback fail_callback, void *callback_data)
2593 {
2594         int i, j, ret;
2595         struct ctdb_public_ip ip;
2596         uint32_t *nodes;
2597         struct public_ip_list *all_ips, *tmp_ip;
2598         TDB_DATA data;
2599         struct timeval timeout;
2600         struct client_async_data *async_data;
2601         struct ctdb_client_control_state *state;
2602         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2603         struct ctdb_ipflags *ipflags;
2604         struct ipalloc_state *ipalloc_state;
2605         struct takeover_callback_data *takeover_data;
2606         struct iprealloc_callback_data iprealloc_data;
2607         bool *retry_data;
2608         bool can_host_ips;
2609
2610         /*
2611          * ip failover is completely disabled, just send out the 
2612          * ipreallocated event.
2613          */
2614         if (ctdb->tunable.disable_ip_failover != 0) {
2615                 goto ipreallocated;
2616         }
2617
2618         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2619         if (ipalloc_state == NULL) {
2620                 talloc_free(tmp_ctx);
2621                 return -1;
2622         }
2623         ctdb->ipalloc_state = ipalloc_state;
2624
2625         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2626         if (ipflags == NULL) {
2627                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2628                 talloc_free(tmp_ctx);
2629                 return -1;
2630         }
2631
2632         /* Fetch known/available public IPs from each active node */
2633         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2634         if (ret != 0) {
2635                 talloc_free(tmp_ctx);
2636                 return -1;
2637         }
2638
2639         /* Short-circuit IP allocation if no node has available IPs */
2640         can_host_ips = false;
2641         for (i=0; i < ipalloc_state->num; i++) {
2642                 if (ipalloc_state->available_public_ips[i] != NULL) {
2643                         can_host_ips = true;
2644                 }
2645         }
2646         if (!can_host_ips) {
2647                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2648                 return 0;
2649         }
2650
2651         /* Do the IP reassignment calculations */
2652         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2653
2654         /* Now tell all nodes to release any public IPs should not
2655          * host.  This will be a NOOP on nodes that don't currently
2656          * hold the given IP.
2657          */
2658         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2659         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2660
2661         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2662                                                        bool, nodemap->num);
2663         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2664         takeover_data->fail_callback = fail_callback;
2665         takeover_data->fail_callback_data = callback_data;
2666         takeover_data->nodemap = nodemap;
2667
2668         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2669         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2670
2671         async_data->fail_callback = takeover_run_fail_callback;
2672         async_data->callback_data = takeover_data;
2673
2674         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2675
2676         /* Send a RELEASE_IP to all nodes that should not be hosting
2677          * each IP.  For each IP, all but one of these will be
2678          * redundant.  However, the redundant ones are used to tell
2679          * nodes which node should be hosting the IP so that commands
2680          * like "ctdb ip" can display a particular nodes idea of who
2681          * is hosting what. */
2682         for (i=0;i<nodemap->num;i++) {
2683                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2684                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2685                         continue;
2686                 }
2687
2688                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2689                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2690                                 /* This node should be serving this
2691                                    vnn so don't tell it to release the ip
2692                                 */
2693                                 continue;
2694                         }
2695                         ip.pnn  = tmp_ip->pnn;
2696                         ip.addr = tmp_ip->addr;
2697
2698                         timeout = TAKEOVER_TIMEOUT();
2699                         data.dsize = sizeof(ip);
2700                         data.dptr  = (uint8_t *)&ip;
2701                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2702                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
2703                                                   data, async_data,
2704                                                   &timeout, NULL);
2705                         if (state == NULL) {
2706                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2707                                 talloc_free(tmp_ctx);
2708                                 return -1;
2709                         }
2710
2711                         ctdb_client_async_add(async_data, state);
2712                 }
2713         }
2714         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2715                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2716                 talloc_free(tmp_ctx);
2717                 return -1;
2718         }
2719         talloc_free(async_data);
2720
2721
2722         /* For each IP, send a TAKOVER_IP to the node that should be
2723          * hosting it.  Many of these will often be redundant (since
2724          * the allocation won't have changed) but they can be useful
2725          * to recover from inconsistencies. */
2726         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2727         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2728
2729         async_data->fail_callback = fail_callback;
2730         async_data->callback_data = callback_data;
2731
2732         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2733                 if (tmp_ip->pnn == -1) {
2734                         /* this IP won't be taken over */
2735                         continue;
2736                 }
2737
2738                 ip.pnn  = tmp_ip->pnn;
2739                 ip.addr = tmp_ip->addr;
2740
2741                 timeout = TAKEOVER_TIMEOUT();
2742                 data.dsize = sizeof(ip);
2743                 data.dptr  = (uint8_t *)&ip;
2744                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2745                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
2746                                           data, async_data, &timeout, NULL);
2747                 if (state == NULL) {
2748                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2749                         talloc_free(tmp_ctx);
2750                         return -1;
2751                 }
2752
2753                 ctdb_client_async_add(async_data, state);
2754         }
2755         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2756                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2757                 talloc_free(tmp_ctx);
2758                 return -1;
2759         }
2760
2761 ipreallocated:
2762         /*
2763          * Tell all nodes to run eventscripts to process the
2764          * "ipreallocated" event.  This can do a lot of things,
2765          * including restarting services to reconfigure them if public
2766          * IPs have moved.  Once upon a time this event only used to
2767          * update natgw.
2768          */
2769         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2770         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2771         iprealloc_data.retry_nodes = retry_data;
2772         iprealloc_data.retry_count = 0;
2773         iprealloc_data.fail_callback = fail_callback;
2774         iprealloc_data.fail_callback_data = callback_data;
2775         iprealloc_data.nodemap = nodemap;
2776
2777         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2778         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2779                                         nodes, 0, TAKEOVER_TIMEOUT(),
2780                                         false, tdb_null,
2781                                         NULL, iprealloc_fail_callback,
2782                                         &iprealloc_data);
2783         if (ret != 0) {
2784                 /* If the control failed then we should retry to any
2785                  * nodes flagged by iprealloc_fail_callback using the
2786                  * EVENTSCRIPT control.  This is a best-effort at
2787                  * backward compatiblity when running a mixed cluster
2788                  * where some nodes have not yet been upgraded to
2789                  * support the IPREALLOCATED control.
2790                  */
2791                 DEBUG(DEBUG_WARNING,
2792                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2793
2794                 nodes = talloc_array(tmp_ctx, uint32_t,
2795                                      iprealloc_data.retry_count);
2796                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2797
2798                 j = 0;
2799                 for (i=0; i<nodemap->num; i++) {
2800                         if (iprealloc_data.retry_nodes[i]) {
2801                                 nodes[j] = i;
2802                                 j++;
2803                         }
2804                 }
2805
2806                 data.dptr  = discard_const("ipreallocated");
2807                 data.dsize = strlen((char *)data.dptr) + 1; 
2808                 ret = ctdb_client_async_control(ctdb,
2809                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2810                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2811                                                 false, data,
2812                                                 NULL, fail_callback,
2813                                                 callback_data);
2814                 if (ret != 0) {
2815                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2816                 }
2817         }
2818
2819         talloc_free(tmp_ctx);
2820         return ret;
2821 }
2822
2823
2824 /*
2825   destroy a ctdb_client_ip structure
2826  */
2827 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2828 {
2829         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2830                 ctdb_addr_to_str(&ip->addr),
2831                 ntohs(ip->addr.ip.sin_port),
2832                 ip->client_id));
2833
2834         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2835         return 0;
2836 }
2837
2838 /*
2839   called by a client to inform us of a TCP connection that it is managing
2840   that should tickled with an ACK when IP takeover is done
2841  */
2842 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2843                                 TDB_DATA indata)
2844 {
2845         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2846         struct ctdb_connection *tcp_sock = NULL;
2847         struct ctdb_tcp_list *tcp;
2848         struct ctdb_connection t;
2849         int ret;
2850         TDB_DATA data;
2851         struct ctdb_client_ip *ip;
2852         struct ctdb_vnn *vnn;
2853         ctdb_sock_addr addr;
2854
2855         /* If we don't have public IPs, tickles are useless */
2856         if (ctdb->vnn == NULL) {
2857                 return 0;
2858         }
2859
2860         tcp_sock = (struct ctdb_connection *)indata.dptr;
2861
2862         addr = tcp_sock->src;
2863         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2864         addr = tcp_sock->dst;
2865         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2866
2867         ZERO_STRUCT(addr);
2868         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2869         vnn = find_public_ip_vnn(ctdb, &addr);
2870         if (vnn == NULL) {
2871                 switch (addr.sa.sa_family) {
2872                 case AF_INET:
2873                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2874                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2875                                         ctdb_addr_to_str(&addr)));
2876                         }
2877                         break;
2878                 case AF_INET6:
2879                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2880                                 ctdb_addr_to_str(&addr)));
2881                         break;
2882                 default:
2883                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2884                 }
2885
2886                 return 0;
2887         }
2888
2889         if (vnn->pnn != ctdb->pnn) {
2890                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2891                         ctdb_addr_to_str(&addr),
2892                         client_id, client->pid));
2893                 /* failing this call will tell smbd to die */
2894                 return -1;
2895         }
2896
2897         ip = talloc(client, struct ctdb_client_ip);
2898         CTDB_NO_MEMORY(ctdb, ip);
2899
2900         ip->ctdb      = ctdb;
2901         ip->addr      = addr;
2902         ip->client_id = client_id;
2903         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2904         DLIST_ADD(ctdb->client_ip_list, ip);
2905
2906         tcp = talloc(client, struct ctdb_tcp_list);
2907         CTDB_NO_MEMORY(ctdb, tcp);
2908
2909         tcp->connection.src = tcp_sock->src;
2910         tcp->connection.dst = tcp_sock->dst;
2911
2912         DLIST_ADD(client->tcp_list, tcp);
2913
2914         t.src = tcp_sock->src;
2915         t.dst = tcp_sock->dst;
2916
2917         data.dptr = (uint8_t *)&t;
2918         data.dsize = sizeof(t);
2919
2920         switch (addr.sa.sa_family) {
2921         case AF_INET:
2922                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2923                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2924                         ctdb_addr_to_str(&tcp_sock->src),
2925                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2926                 break;
2927         case AF_INET6:
2928                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2929                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2930                         ctdb_addr_to_str(&tcp_sock->src),
2931                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2932                 break;
2933         default:
2934                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2935         }
2936
2937
2938         /* tell all nodes about this tcp connection */
2939         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2940                                        CTDB_CONTROL_TCP_ADD,
2941                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2942         if (ret != 0) {
2943                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2944                 return -1;
2945         }
2946
2947         return 0;
2948 }
2949
2950 /*
2951   find a tcp address on a list
2952  */
2953 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2954                                            struct ctdb_connection *tcp)
2955 {
2956         int i;
2957
2958         if (array == NULL) {
2959                 return NULL;
2960         }
2961
2962         for (i=0;i<array->num;i++) {
2963                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2964                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2965                         return &array->connections[i];
2966                 }
2967         }
2968         return NULL;
2969 }
2970
2971
2972
2973 /*
2974   called by a daemon to inform us of a TCP connection that one of its
2975   clients managing that should tickled with an ACK when IP takeover is
2976   done
2977  */
2978 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2979 {
2980         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2981         struct ctdb_tcp_array *tcparray;
2982         struct ctdb_connection tcp;
2983         struct ctdb_vnn *vnn;
2984
2985         /* If we don't have public IPs, tickles are useless */
2986         if (ctdb->vnn == NULL) {
2987                 return 0;
2988         }
2989
2990         vnn = find_public_ip_vnn(ctdb, &p->dst);
2991         if (vnn == NULL) {
2992                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2993                         ctdb_addr_to_str(&p->dst)));
2994
2995                 return -1;
2996         }
2997
2998
2999         tcparray = vnn->tcp_array;
3000
3001         /* If this is the first tickle */
3002         if (tcparray == NULL) {
3003                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3004                 CTDB_NO_MEMORY(ctdb, tcparray);
3005                 vnn->tcp_array = tcparray;
3006
3007                 tcparray->num = 0;
3008                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3009                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3010
3011                 tcparray->connections[tcparray->num].src = p->src;
3012                 tcparray->connections[tcparray->num].dst = p->dst;
3013                 tcparray->num++;
3014
3015                 if (tcp_update_needed) {
3016                         vnn->tcp_update_needed = true;
3017                 }
3018                 return 0;
3019         }
3020
3021
3022         /* Do we already have this tickle ?*/
3023         tcp.src = p->src;
3024         tcp.dst = p->dst;
3025         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3026                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3027                         ctdb_addr_to_str(&tcp.dst),
3028                         ntohs(tcp.dst.ip.sin_port),
3029                         vnn->pnn));
3030                 return 0;
3031         }
3032
3033         /* A new tickle, we must add it to the array */
3034         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3035                                         struct ctdb_connection,
3036                                         tcparray->num+1);
3037         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3038
3039         tcparray->connections[tcparray->num].src = p->src;
3040         tcparray->connections[tcparray->num].dst = p->dst;
3041         tcparray->num++;
3042
3043         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3044                 ctdb_addr_to_str(&tcp.dst),
3045                 ntohs(tcp.dst.ip.sin_port),
3046                 vnn->pnn));
3047
3048         if (tcp_update_needed) {
3049                 vnn->tcp_update_needed = true;
3050         }
3051
3052         return 0;
3053 }
3054
3055
3056 /*
3057   called by a daemon to inform us of a TCP connection that one of its
3058   clients managing that should tickled with an ACK when IP takeover is
3059   done
3060  */
3061 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3062 {
3063         struct ctdb_connection *tcpp;
3064         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3065
3066         if (vnn == NULL) {
3067                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3068                         ctdb_addr_to_str(&conn->dst)));
3069                 return;
3070         }
3071
3072         /* if the array is empty we cant remove it
3073            and we don't need to do anything
3074          */
3075         if (vnn->tcp_array == NULL) {
3076                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3077                         ctdb_addr_to_str(&conn->dst),
3078                         ntohs(conn->dst.ip.sin_port)));
3079                 return;
3080         }
3081
3082
3083         /* See if we know this connection
3084            if we don't know this connection  then we dont need to do anything
3085          */
3086         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3087         if (tcpp == NULL) {
3088                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3089                         ctdb_addr_to_str(&conn->dst),
3090                         ntohs(conn->dst.ip.sin_port)));
3091                 return;
3092         }
3093
3094
3095         /* We need to remove this entry from the array.
3096            Instead of allocating a new array and copying data to it
3097            we cheat and just copy the last entry in the existing array
3098            to the entry that is to be removed and just shring the 
3099            ->num field
3100          */
3101         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3102         vnn->tcp_array->num--;
3103
3104         /* If we deleted the last entry we also need to remove the entire array
3105          */
3106         if (vnn->tcp_array->num == 0) {
3107                 talloc_free(vnn->tcp_array);
3108                 vnn->tcp_array = NULL;
3109         }               
3110
3111         vnn->tcp_update_needed = true;
3112
3113         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3114                 ctdb_addr_to_str(&conn->src),
3115                 ntohs(conn->src.ip.sin_port)));
3116 }
3117
3118
3119 /*
3120   called by a daemon to inform us of a TCP connection that one of its
3121   clients used are no longer needed in the tickle database
3122  */
3123 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3124 {
3125         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3126
3127         /* If we don't have public IPs, tickles are useless */
3128         if (ctdb->vnn == NULL) {
3129                 return 0;
3130         }
3131
3132         ctdb_remove_connection(ctdb, conn);
3133
3134         return 0;
3135 }
3136
3137
3138 /*
3139   Called when another daemon starts - causes all tickles for all
3140   public addresses we are serving to be sent to the new node on the
3141   next check.  This actually causes the next scheduled call to
3142   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3143   doesn't require careful error handling.
3144  */
3145 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3146 {
3147         struct ctdb_vnn *vnn;
3148
3149         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3150                            (unsigned long) pnn));
3151
3152         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3153                 vnn->tcp_update_needed = true;
3154         }
3155
3156         return 0;
3157 }
3158
3159
3160 /*
3161   called when a client structure goes away - hook to remove
3162   elements from the tcp_list in all daemons
3163  */
3164 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3165 {
3166         while (client->tcp_list) {
3167                 struct ctdb_tcp_list *tcp = client->tcp_list;
3168                 DLIST_REMOVE(client->tcp_list, tcp);
3169                 ctdb_remove_connection(client->ctdb, &tcp->connection);
3170         }
3171 }
3172
3173
3174 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3175 {
3176         struct ctdb_vnn *vnn;
3177         int count = 0;
3178
3179         if (ctdb->tunable.disable_ip_failover == 1) {
3180                 return;
3181         }
3182
3183         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3184                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3185                         ctdb_vnn_unassign_iface(ctdb, vnn);
3186                         continue;
3187                 }
3188                 if (!vnn->iface) {
3189                         continue;
3190                 }
3191
3192                 /* Don't allow multiple releases at once.  Some code,
3193                  * particularly ctdb_tickle_sentenced_connections() is
3194                  * not re-entrant */
3195                 if (vnn->update_in_flight) {
3196                         DEBUG(DEBUG_WARNING,
3197                               (__location__
3198                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3199                                     ctdb_addr_to_str(&vnn->public_address),
3200                                     vnn->public_netmask_bits,
3201                                     ctdb_vnn_iface_string(vnn)));
3202                         continue;
3203                 }
3204                 vnn->update_in_flight = true;
3205
3206                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3207                                     ctdb_addr_to_str(&vnn->public_address),
3208                                     vnn->public_netmask_bits,
3209                                     ctdb_vnn_iface_string(vnn)));
3210
3211                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3212                                   ctdb_vnn_iface_string(vnn),
3213                                   ctdb_addr_to_str(&vnn->public_address),
3214                                   vnn->public_netmask_bits);
3215                 release_kill_clients(ctdb, &vnn->public_address);
3216                 ctdb_vnn_unassign_iface(ctdb, vnn);
3217                 vnn->update_in_flight = false;
3218                 count++;
3219         }
3220
3221         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3222 }
3223
3224
3225 /*
3226   get list of public IPs
3227  */
3228 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3229                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
3230 {
3231         int i, num, len;
3232         struct ctdb_public_ip_list_old *ips;
3233         struct ctdb_vnn *vnn;
3234         bool only_available = false;
3235
3236         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3237                 only_available = true;
3238         }
3239
3240         /* count how many public ip structures we have */
3241         num = 0;
3242         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3243                 num++;
3244         }
3245
3246         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3247                 num*sizeof(struct ctdb_public_ip);
3248         ips = talloc_zero_size(outdata, len);
3249         CTDB_NO_MEMORY(ctdb, ips);
3250
3251         i = 0;
3252         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3253                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3254                         continue;
3255                 }
3256                 ips->ips[i].pnn  = vnn->pnn;
3257                 ips->ips[i].addr = vnn->public_address;
3258                 i++;
3259         }
3260         ips->num = i;
3261         len = offsetof(struct ctdb_public_ip_list_old, ips) +
3262                 i*sizeof(struct ctdb_public_ip);
3263
3264         outdata->dsize = len;
3265         outdata->dptr  = (uint8_t *)ips;
3266
3267         return 0;
3268 }
3269
3270
3271 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3272                                         struct ctdb_req_control_old *c,
3273                                         TDB_DATA indata,
3274                                         TDB_DATA *outdata)
3275 {
3276         int i, num, len;
3277         ctdb_sock_addr *addr;
3278         struct ctdb_public_ip_info_old *info;
3279         struct ctdb_vnn *vnn;
3280
3281         addr = (ctdb_sock_addr *)indata.dptr;
3282
3283         vnn = find_public_ip_vnn(ctdb, addr);
3284         if (vnn == NULL) {
3285                 /* if it is not a public ip   it could be our 'single ip' */
3286                 if (ctdb->single_ip_vnn) {
3287                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3288                                 vnn = ctdb->single_ip_vnn;
3289                         }
3290                 }
3291         }
3292         if (vnn == NULL) {
3293                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3294                                  "'%s'not a public address\n",
3295                                  ctdb_addr_to_str(addr)));
3296                 return -1;
3297         }
3298
3299         /* count how many public ip structures we have */
3300         num = 0;
3301         for (;vnn->ifaces[num];) {
3302                 num++;
3303         }
3304
3305         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3306                 num*sizeof(struct ctdb_iface);
3307         info = talloc_zero_size(outdata, len);
3308         CTDB_NO_MEMORY(ctdb, info);
3309
3310         info->ip.addr = vnn->public_address;
3311         info->ip.pnn = vnn->pnn;
3312         info->active_idx = 0xFFFFFFFF;
3313
3314         for (i=0; vnn->ifaces[i]; i++) {
3315                 struct ctdb_interface *cur;
3316
3317                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3318                 if (cur == NULL) {
3319                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3320                                            vnn->ifaces[i]));
3321                         return -1;
3322                 }
3323                 if (vnn->iface == cur) {
3324                         info->active_idx = i;
3325                 }
3326                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3327                 info->ifaces[i].link_state = cur->link_up;
3328                 info->ifaces[i].references = cur->references;
3329         }
3330         info->num = i;
3331         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3332                 i*sizeof(struct ctdb_iface);
3333
3334         outdata->dsize = len;
3335         outdata->dptr  = (uint8_t *)info;
3336
3337         return 0;
3338 }
3339
3340 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3341                                 struct ctdb_req_control_old *c,
3342                                 TDB_DATA *outdata)
3343 {
3344         int i, num, len;
3345         struct ctdb_iface_list_old *ifaces;
3346         struct ctdb_interface *cur;
3347
3348         /* count how many public ip structures we have */
3349         num = 0;
3350         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3351                 num++;
3352         }
3353
3354         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3355                 num*sizeof(struct ctdb_iface);
3356         ifaces = talloc_zero_size(outdata, len);
3357         CTDB_NO_MEMORY(ctdb, ifaces);
3358
3359         i = 0;
3360         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3361                 strcpy(ifaces->ifaces[i].name, cur->name);
3362                 ifaces->ifaces[i].link_state = cur->link_up;
3363                 ifaces->ifaces[i].references = cur->references;
3364                 i++;
3365         }
3366         ifaces->num = i;
3367         len = offsetof(struct ctdb_iface_list_old, ifaces) +
3368                 i*sizeof(struct ctdb_iface);
3369
3370         outdata->dsize = len;
3371         outdata->dptr  = (uint8_t *)ifaces;
3372
3373         return 0;
3374 }
3375
3376 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3377                                     struct ctdb_req_control_old *c,
3378                                     TDB_DATA indata)
3379 {
3380         struct ctdb_iface *info;
3381         struct ctdb_interface *iface;
3382         bool link_up = false;
3383
3384         info = (struct ctdb_iface *)indata.dptr;
3385
3386         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3387                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3388                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3389                                   len, len, info->name));
3390                 return -1;
3391         }
3392
3393         switch (info->link_state) {
3394         case 0:
3395                 link_up = false;
3396                 break;
3397         case 1:
3398                 link_up = true;
3399                 break;
3400         default:
3401                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3402                                   (unsigned int)info->link_state));
3403                 return -1;
3404         }
3405
3406         if (info->references != 0) {
3407                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3408                                   (unsigned int)info->references));
3409                 return -1;
3410         }
3411
3412         iface = ctdb_find_iface(ctdb, info->name);
3413         if (iface == NULL) {
3414                 return -1;
3415         }
3416
3417         if (link_up == iface->link_up) {
3418                 return 0;
3419         }
3420
3421         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3422               ("iface[%s] has changed it's link status %s => %s\n",
3423                iface->name,
3424                iface->link_up?"up":"down",
3425                link_up?"up":"down"));
3426
3427         iface->link_up = link_up;
3428         return 0;
3429 }
3430
3431
3432 /* 
3433    structure containing the listening socket and the list of tcp connections
3434    that the ctdb daemon is to kill
3435 */
3436 struct ctdb_kill_tcp {
3437         struct ctdb_vnn *vnn;
3438         struct ctdb_context *ctdb;
3439         int capture_fd;
3440         struct tevent_fd *fde;
3441         trbt_tree_t *connections;
3442         void *private_data;
3443 };
3444
3445 /*
3446   a tcp connection that is to be killed
3447  */
3448 struct ctdb_killtcp_con {
3449         ctdb_sock_addr src_addr;
3450         ctdb_sock_addr dst_addr;
3451         int count;
3452         struct ctdb_kill_tcp *killtcp;
3453 };
3454
3455 /* this function is used to create a key to represent this socketpair
3456    in the killtcp tree.
3457    this key is used to insert and lookup matching socketpairs that are
3458    to be tickled and RST
3459 */
3460 #define KILLTCP_KEYLEN  10
3461 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3462 {
3463         static uint32_t key[KILLTCP_KEYLEN];
3464
3465         bzero(key, sizeof(key));
3466
3467         if (src->sa.sa_family != dst->sa.sa_family) {
3468                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3469                 return key;
3470         }
3471         
3472         switch (src->sa.sa_family) {
3473         case AF_INET:
3474                 key[0]  = dst->ip.sin_addr.s_addr;
3475                 key[1]  = src->ip.sin_addr.s_addr;
3476                 key[2]  = dst->ip.sin_port;
3477                 key[3]  = src->ip.sin_port;
3478                 break;
3479         case AF_INET6: {
3480                 uint32_t *dst6_addr32 =
3481                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3482                 uint32_t *src6_addr32 =
3483                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3484                 key[0]  = dst6_addr32[3];
3485                 key[1]  = src6_addr32[3];
3486                 key[2]  = dst6_addr32[2];
3487                 key[3]  = src6_addr32[2];
3488                 key[4]  = dst6_addr32[1];
3489                 key[5]  = src6_addr32[1];
3490                 key[6]  = dst6_addr32[0];
3491                 key[7]  = src6_addr32[0];
3492                 key[8]  = dst->ip6.sin6_port;
3493                 key[9]  = src->ip6.sin6_port;
3494                 break;
3495         }
3496         default:
3497                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3498                 return key;
3499         }
3500
3501         return key;
3502 }
3503
3504 /*
3505   called when we get a read event on the raw socket
3506  */
3507 static void capture_tcp_handler(struct tevent_context *ev,
3508                                 struct tevent_fd *fde,
3509                                 uint16_t flags, void *private_data)
3510 {
3511         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3512         struct ctdb_killtcp_con *con;
3513         ctdb_sock_addr src, dst;
3514         uint32_t ack_seq, seq;
3515
3516         if (!(flags & TEVENT_FD_READ)) {
3517                 return;
3518         }
3519
3520         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3521                                 killtcp->private_data,
3522                                 &src, &dst,
3523                                 &ack_seq, &seq) != 0) {
3524                 /* probably a non-tcp ACK packet */
3525                 return;
3526         }
3527
3528         /* check if we have this guy in our list of connections
3529            to kill
3530         */
3531         con = trbt_lookuparray32(killtcp->connections, 
3532                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3533         if (con == NULL) {
3534                 /* no this was some other packet we can just ignore */
3535                 return;
3536         }
3537
3538         /* This one has been tickled !
3539            now reset him and remove him from the list.
3540          */
3541         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3542                 ntohs(con->dst_addr.ip.sin_port),
3543                 ctdb_addr_to_str(&con->src_addr),
3544                 ntohs(con->src_addr.ip.sin_port)));
3545
3546         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3547         talloc_free(con);
3548 }
3549
3550
3551 /* when traversing the list of all tcp connections to send tickle acks to
3552    (so that we can capture the ack coming back and kill the connection
3553     by a RST)
3554    this callback is called for each connection we are currently trying to kill
3555 */
3556 static int tickle_connection_traverse(void *param, void *data)
3557 {
3558         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3559
3560         /* have tried too many times, just give up */
3561         if (con->count >= 5) {
3562                 /* can't delete in traverse: reparent to delete_cons */
3563                 talloc_steal(param, con);
3564                 return 0;
3565         }
3566
3567         /* othervise, try tickling it again */
3568         con->count++;
3569         ctdb_sys_send_tcp(
3570                 (ctdb_sock_addr *)&con->dst_addr,
3571                 (ctdb_sock_addr *)&con->src_addr,
3572                 0, 0, 0);
3573         return 0;
3574 }
3575
3576
3577 /* 
3578    called every second until all sentenced connections have been reset
3579  */
3580 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3581                                               struct tevent_timer *te,
3582                                               struct timeval t, void *private_data)
3583 {
3584         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3585         void *delete_cons = talloc_new(NULL);
3586
3587         /* loop over all connections sending tickle ACKs */
3588         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3589
3590         /* now we've finished traverse, it's safe to do deletion. */
3591         talloc_free(delete_cons);
3592
3593         /* If there are no more connections to kill we can remove the
3594            entire killtcp structure
3595          */
3596         if ( (killtcp->connections == NULL) || 
3597              (killtcp->connections->root == NULL) ) {
3598                 talloc_free(killtcp);
3599                 return;
3600         }
3601
3602         /* try tickling them again in a seconds time
3603          */
3604         tevent_add_timer(killtcp->ctdb->ev, killtcp,
3605                          timeval_current_ofs(1, 0),
3606                          ctdb_tickle_sentenced_connections, killtcp);
3607 }
3608
3609 /*
3610   destroy the killtcp structure
3611  */
3612 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3613 {
3614         struct ctdb_vnn *tmpvnn;
3615
3616         /* verify that this vnn is still active */
3617         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3618                 if (tmpvnn == killtcp->vnn) {
3619                         break;
3620                 }
3621         }
3622
3623         if (tmpvnn == NULL) {
3624                 return 0;
3625         }
3626
3627         if (killtcp->vnn->killtcp != killtcp) {
3628                 return 0;
3629         }
3630
3631         killtcp->vnn->killtcp = NULL;
3632
3633         return 0;
3634 }
3635
3636
/* Insert callback for the killtcp tree: unconditionally use the new
   connection structure in place of any existing one.

   The old structure is deliberately not freed here; per the original
   author's note it is talloc_stolen by the same tree node and goes
   away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
3648
3649 /*
3650   add a tcp socket to the list of connections we want to RST
3651  */
3652 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3653                                        ctdb_sock_addr *s,
3654                                        ctdb_sock_addr *d)
3655 {
3656         ctdb_sock_addr src, dst;
3657         struct ctdb_kill_tcp *killtcp;
3658         struct ctdb_killtcp_con *con;
3659         struct ctdb_vnn *vnn;
3660
3661         ctdb_canonicalize_ip(s, &src);
3662         ctdb_canonicalize_ip(d, &dst);
3663
3664         vnn = find_public_ip_vnn(ctdb, &dst);
3665         if (vnn == NULL) {
3666                 vnn = find_public_ip_vnn(ctdb, &src);
3667         }
3668         if (vnn == NULL) {
3669                 /* if it is not a public ip   it could be our 'single ip' */
3670                 if (ctdb->single_ip_vnn) {
3671                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3672                                 vnn = ctdb->single_ip_vnn;
3673                         }
3674                 }
3675         }
3676         if (vnn == NULL) {
3677                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3678                 return -1;
3679         }
3680
3681         killtcp = vnn->killtcp;
3682         
3683         /* If this is the first connection to kill we must allocate
3684            a new structure
3685          */
3686         if (killtcp == NULL) {
3687                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3688                 CTDB_NO_MEMORY(ctdb, killtcp);
3689
3690                 killtcp->vnn         = vnn;
3691                 killtcp->ctdb        = ctdb;
3692                 killtcp->capture_fd  = -1;
3693                 killtcp->connections = trbt_create(killtcp, 0);
3694
3695                 vnn->killtcp         = killtcp;
3696                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3697         }
3698
3699
3700
3701         /* create a structure that describes this connection we want to
3702            RST and store it in killtcp->connections
3703         */
3704         con = talloc(killtcp, struct ctdb_killtcp_con);
3705         CTDB_NO_MEMORY(ctdb, con);
3706         con->src_addr = src;
3707         con->dst_addr = dst;
3708         con->count    = 0;
3709         con->killtcp  = killtcp;
3710
3711
3712         trbt_insertarray32_callback(killtcp->connections,
3713                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3714                         add_killtcp_callback, con);
3715
3716         /* 
3717            If we don't have a socket to listen on yet we must create it
3718          */
3719         if (killtcp->capture_fd == -1) {
3720                 const char *iface = ctdb_vnn_iface_string(vnn);
3721                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3722                 if (killtcp->capture_fd == -1) {
3723                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3724                                           "socket on iface '%s' for killtcp (%s)\n",
3725                                           iface, strerror(errno)));
3726                         goto failed;
3727                 }
3728         }
3729
3730
3731         if (killtcp->fde == NULL) {
3732                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3733                                              killtcp->capture_fd,
3734                                              TEVENT_FD_READ,
3735                                              capture_tcp_handler, killtcp);
3736                 tevent_fd_set_auto_close(killtcp->fde);
3737
3738                 /* We also need to set up some events to tickle all these connections
3739                    until they are all reset
3740                 */
3741                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3742                                  ctdb_tickle_sentenced_connections, killtcp);
3743         }
3744
3745         /* tickle him once now */
3746         ctdb_sys_send_tcp(
3747                 &con->dst_addr,
3748                 &con->src_addr,
3749                 0, 0, 0);
3750
3751         return 0;
3752
3753 failed:
3754         talloc_free(vnn->killtcp);
3755         vnn->killtcp = NULL;
3756         return -1;
3757 }
3758
3759 /*
3760   kill a TCP connection.
3761  */
3762 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3763 {
3764         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3765
3766         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3767 }
3768
3769 /*
3770   called by a daemon to inform us of the entire list of TCP tickles for
3771   a particular public address.
3772   this control should only be sent by the node that is currently serving
3773   that public address.
3774  */
3775 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3776 {
3777         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3778         struct ctdb_tcp_array *tcparray;
3779         struct ctdb_vnn *vnn;
3780
3781         /* We must at least have tickles.num or else we cant verify the size
3782            of the received data blob
3783          */
3784         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3785                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3786                 return -1;
3787         }
3788
3789         /* verify that the size of data matches what we expect */
3790         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3791                          + sizeof(struct ctdb_connection) * list->num) {
3792                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3793                 return -1;
3794         }
3795
3796         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3797                            ctdb_addr_to_str(&list->addr)));
3798
3799         vnn = find_public_ip_vnn(ctdb, &list->addr);
3800         if (vnn == NULL) {
3801                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3802                         ctdb_addr_to_str(&list->addr)));
3803
3804                 return 1;
3805         }
3806
3807         /* remove any old ticklelist we might have */
3808         talloc_free(vnn->tcp_array);
3809         vnn->tcp_array = NULL;
3810
3811         tcparray = talloc(vnn, struct ctdb_tcp_array);
3812         CTDB_NO_MEMORY(ctdb, tcparray);
3813
3814         tcparray->num = list->num;
3815
3816         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3817         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3818
3819         memcpy(tcparray->connections, &list->connections[0],
3820                sizeof(struct ctdb_connection)*tcparray->num);
3821
3822         /* We now have a new fresh tickle list array for this vnn */
3823         vnn->tcp_array = tcparray;
3824
3825         return 0;
3826 }
3827
3828 /*
3829   called to return the full list of tickles for the puclic address associated 
3830   with the provided vnn
3831  */
3832 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3833 {
3834         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3835         struct ctdb_tickle_list_old *list;
3836         struct ctdb_tcp_array *tcparray;
3837         int num;
3838         struct ctdb_vnn *vnn;
3839
3840         vnn = find_public_ip_vnn(ctdb, addr);
3841         if (vnn == NULL) {
3842                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3843                         ctdb_addr_to_str(addr)));
3844
3845                 return 1;
3846         }
3847
3848         tcparray = vnn->tcp_array;
3849         if (tcparray) {
3850                 num = tcparray->num;
3851         } else {
3852                 num = 0;
3853         }
3854
3855         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3856                         + sizeof(struct ctdb_connection) * num;
3857
3858         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3859         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3860         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3861
3862         list->addr = *addr;
3863         list->num = num;
3864         if (num) {
3865                 memcpy(&list->connections[0], tcparray->connections,
3866                         sizeof(struct ctdb_connection) * num);
3867         }
3868
3869         return 0;
3870 }
3871
3872
3873 /*
3874   set the list of all tcp tickles for a public address
3875  */
3876 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3877                                             ctdb_sock_addr *addr,
3878                                             struct ctdb_tcp_array *tcparray)
3879 {
3880         int ret, num;
3881         TDB_DATA data;
3882         struct ctdb_tickle_list_old *list;
3883
3884         if (tcparray) {
3885                 num = tcparray->num;
3886         } else {
3887                 num = 0;
3888         }
3889
3890         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3891                         sizeof(struct ctdb_connection) * num;
3892         data.dptr = talloc_size(ctdb, data.dsize);
3893         CTDB_NO_MEMORY(ctdb, data.dptr);
3894
3895         list = (struct ctdb_tickle_list_old *)data.dptr;
3896         list->addr = *addr;
3897         list->num = num;
3898         if (tcparray) {
3899                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3900         }
3901
3902         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3903                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3904                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3905         if (ret != 0) {
3906                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3907                 return -1;
3908         }
3909
3910         talloc_free(data.dptr);
3911
3912         return ret;
3913 }
3914
3915
3916 /*
3917   perform tickle updates if required
3918  */
3919 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3920                                     struct tevent_timer *te,
3921                                     struct timeval t, void *private_data)
3922 {
3923         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3924         int ret;
3925         struct ctdb_vnn *vnn;
3926
3927         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3928                 /* we only send out updates for public addresses that 
3929                    we have taken over
3930                  */
3931                 if (ctdb->pnn != vnn->pnn) {
3932                         continue;
3933                 }
3934                 /* We only send out the updates if we need to */
3935                 if (!vnn->tcp_update_needed) {
3936                         continue;
3937                 }
3938                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3939                                                        &vnn->public_address,
3940                                                        vnn->tcp_array);
3941                 if (ret != 0) {
3942                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3943                                 ctdb_addr_to_str(&vnn->public_address)));
3944                 } else {
3945                         DEBUG(DEBUG_INFO,
3946                               ("Sent tickle update for public address %s\n",
3947                                ctdb_addr_to_str(&vnn->public_address)));
3948                         vnn->tcp_update_needed = false;
3949                 }
3950         }
3951
3952         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3953                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3954                          ctdb_update_tcp_tickles, ctdb);
3955 }
3956
3957 /*
3958   start periodic update of tcp tickles
3959  */
3960 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3961 {
3962         ctdb->tickle_update_context = talloc_new(ctdb);
3963
3964         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3965                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3966                          ctdb_update_tcp_tickles, ctdb);
3967 }
3968
3969
3970
3971
3972 struct control_gratious_arp {
3973         struct ctdb_context *ctdb;
3974         ctdb_sock_addr addr;
3975         const char *iface;
3976         int count;
3977 };
3978
3979 /*
3980   send a control_gratuitous arp
3981  */
3982 static void send_gratious_arp(struct tevent_context *ev,
3983                               struct tevent_timer *te,
3984                               struct timeval t, void *private_data)
3985 {
3986         int ret;
3987         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3988                                                         struct control_gratious_arp);
3989
3990         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3991         if (ret != 0) {
3992                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3993                                  arp->iface, strerror(errno)));
3994         }
3995
3996
3997         arp->count++;
3998         if (arp->count == CTDB_ARP_REPEAT) {
3999                 talloc_free(arp);
4000                 return;
4001         }
4002
4003         tevent_add_timer(arp->ctdb->ev, arp,
4004                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4005                          send_gratious_arp, arp);
4006 }
4007
4008
4009 /*
4010   send a gratious arp 
4011  */
4012 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4013 {
4014         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4015         struct control_gratious_arp *arp;
4016
4017         /* verify the size of indata */
4018         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4019                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4020                                  (unsigned)indata.dsize, 
4021                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4022                 return -1;
4023         }
4024         if (indata.dsize != 
4025                 ( offsetof(struct ctdb_addr_info_old, iface)
4026                 + gratious_arp->len ) ){
4027
4028                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4029                         "but should be %u bytes\n", 
4030                          (unsigned)indata.dsize, 
4031                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4032                 return -1;
4033         }
4034
4035
4036         arp = talloc(ctdb, struct control_gratious_arp);
4037         CTDB_NO_MEMORY(ctdb, arp);
4038
4039         arp->ctdb  = ctdb;
4040         arp->addr   = gratious_arp->addr;
4041         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4042         CTDB_NO_MEMORY(ctdb, arp->iface);
4043         arp->count = 0;
4044
4045         tevent_add_timer(arp->ctdb->ev, arp,
4046                          timeval_zero(), send_gratious_arp, arp);
4047
4048         return 0;
4049 }
4050
4051 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4052 {
4053         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4054         int ret;
4055
4056         /* verify the size of indata */
4057         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4058                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4059                 return -1;
4060         }
4061         if (indata.dsize != 
4062                 ( offsetof(struct ctdb_addr_info_old, iface)
4063                 + pub->len ) ){
4064
4065                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4066                         "but should be %u bytes\n", 
4067                          (unsigned)indata.dsize, 
4068                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4069                 return -1;
4070         }
4071
4072         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4073
4074         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4075
4076         if (ret != 0) {
4077                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4078                 return -1;
4079         }
4080
4081         return 0;
4082 }
4083
/* Private data handed to delete_ip_callback() */
struct delete_ip_callback_state {
        /* the del_public_address control request that is answered
         * once the release control completes */
        struct ctdb_req_control_old *c;
};
4087
4088 /*
4089   called when releaseip event finishes for del_public_address
4090  */
4091 static void delete_ip_callback(struct ctdb_context *ctdb,
4092                                int32_t status, TDB_DATA data,
4093                                const char *errormsg,
4094                                void *private_data)
4095 {
4096         struct delete_ip_callback_state *state =
4097                 talloc_get_type(private_data, struct delete_ip_callback_state);
4098
4099         /* If release failed then fail. */
4100         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4101         talloc_free(private_data);
4102 }
4103
4104 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4105                                         struct ctdb_req_control_old *c,
4106                                         TDB_DATA indata, bool *async_reply)
4107 {
4108         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4109         struct ctdb_vnn *vnn;
4110
4111         /* verify the size of indata */
4112         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4113                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4114                 return -1;
4115         }
4116         if (indata.dsize != 
4117                 ( offsetof(struct ctdb_addr_info_old, iface)
4118                 + pub->len ) ){
4119
4120                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4121                         "but should be %u bytes\n", 
4122                          (unsigned)indata.dsize, 
4123                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4124                 return -1;
4125         }
4126
4127         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4128
4129         /* walk over all public addresses until we find a match */
4130         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4131                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4132                         if (vnn->pnn == ctdb->pnn) {
4133                                 struct delete_ip_callback_state *state;
4134                                 struct ctdb_public_ip *ip;
4135                                 TDB_DATA data;
4136                                 int ret;
4137
4138                                 vnn->delete_pending = true;
4139
4140                                 state = talloc(ctdb,
4141                                                struct delete_ip_callback_state);
4142                                 CTDB_NO_MEMORY(ctdb, state);
4143                                 state->c = c;
4144
4145                                 ip = talloc(state, struct ctdb_public_ip);
4146                                 if (ip == NULL) {
4147                                         DEBUG(DEBUG_ERR,
4148                                               (__location__ " Out of memory\n"));
4149                                         talloc_free(state);
4150                                         return -1;
4151                                 }
4152                                 ip->pnn = -1;
4153                                 ip->addr = pub->addr;
4154
4155                                 data.dsize = sizeof(struct ctdb_public_ip);
4156                                 data.dptr = (unsigned char *)ip;
4157
4158                                 ret = ctdb_daemon_send_control(ctdb,
4159                                                                ctdb_get_pnn(ctdb),
4160                                                                0,
4161                                                                CTDB_CONTROL_RELEASE_IP,
4162                                                                0, 0,
4163                                                                data,
4164                                                                delete_ip_callback,
4165                                                                state);
4166                                 if (ret == -1) {
4167                                         DEBUG(DEBUG_ERR,
4168                                               (__location__ "Unable to send "
4169                                                "CTDB_CONTROL_RELEASE_IP\n"));
4170                                         talloc_free(state);
4171                                         return -1;
4172                                 }
4173
4174                                 state->c = talloc_steal(state, c);
4175                                 *async_reply = true;
4176                         } else {
4177                                 /* This IP is not hosted on the
4178                                  * current node so just delete it
4179                                  * now. */
4180                                 do_delete_ip(ctdb, vnn);
4181                         }
4182
4183                         return 0;
4184                 }
4185         }
4186
4187         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4188                          ctdb_addr_to_str(&pub->addr)));
4189         return -1;
4190 }
4191
4192
/* Private data handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
        /* the ipreallocated control request answered when the event
         * script has finished */
        struct ctdb_req_control_old *c;
};
4196
4197 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4198                                         int status, void *p)
4199 {
4200         struct ipreallocated_callback_state *state =
4201                 talloc_get_type(p, struct ipreallocated_callback_state);
4202
4203         if (status != 0) {
4204                 DEBUG(DEBUG_ERR,
4205                       (" \"ipreallocated\" event script failed (status %d)\n",
4206                        status));
4207                 if (status == -ETIME) {
4208                         ctdb_ban_self(ctdb);
4209                 }
4210         }
4211
4212         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4213         talloc_free(state);
4214 }
4215
4216 /* A control to run the ipreallocated event */
4217 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4218                                    struct ctdb_req_control_old *c,
4219                                    bool *async_reply)
4220 {
4221         int ret;
4222         struct ipreallocated_callback_state *state;
4223
4224         state = talloc(ctdb, struct ipreallocated_callback_state);
4225         CTDB_NO_MEMORY(ctdb, state);
4226
4227         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4228
4229         ret = ctdb_event_script_callback(ctdb, state,
4230                                          ctdb_ipreallocated_callback, state,
4231                                          CTDB_EVENT_IPREALLOCATED,
4232                                          "%s", "");
4233
4234         if (ret != 0) {
4235                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4236                 talloc_free(state);
4237                 return -1;
4238         }
4239
4240         /* tell the control that we will be reply asynchronously */
4241         state->c    = talloc_steal(state, c);
4242         *async_reply = true;
4243
4244         return 0;
4245 }
4246
4247
4248 /* This function is called from the recovery daemon to verify that a remote
4249    node has the expected ip allocation.
4250    This is verified against ctdb->ip_tree
4251 */
4252 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4253                                        struct ctdb_public_ip_list_old *ips,
4254                                        uint32_t pnn)
4255 {
4256         struct public_ip_list *tmp_ip;
4257         int i;
4258
4259         if (ctdb->ip_tree == NULL) {
4260                 /* don't know the expected allocation yet, assume remote node
4261                    is correct. */
4262                 return 0;
4263         }
4264
4265         if (ips == NULL) {
4266                 return 0;
4267         }
4268
4269         for (i=0; i<ips->num; i++) {
4270                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4271                 if (tmp_ip == NULL) {
4272                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4273                         return -1;
4274                 }
4275
4276                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4277                         continue;
4278                 }
4279
4280                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4281                         DEBUG(DEBUG_ERR,
4282                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4283                                pnn,
4284                                ctdb_addr_to_str(&ips->ips[i].addr),
4285                                ips->ips[i].pnn, tmp_ip->pnn));
4286                         return -1;
4287                 }
4288         }
4289
4290         return 0;
4291 }
4292
4293 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4294 {
4295         struct public_ip_list *tmp_ip;
4296
4297         /* IP tree is never built if DisableIPFailover is set */
4298         if (ctdb->tunable.disable_ip_failover != 0) {
4299                 return 0;
4300         }
4301
4302         if (ctdb->ip_tree == NULL) {
4303                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4304                 return -1;
4305         }
4306
4307         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4308         if (tmp_ip == NULL) {
4309                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4310                 return -1;
4311         }
4312
4313         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4314         tmp_ip->pnn = ip->pnn;
4315
4316         return 0;
4317 }
4318
4319 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4320 {
4321         TALLOC_FREE(ctdb->ip_tree);
4322 }
4323
/* State of one reloadips run started by ctdb_control_reload_public_ips() */
struct ctdb_reloadips_handle {
        /* daemon context */
        struct ctdb_context *ctdb;
        /* control request answered by the destructor; NULL if none */
        struct ctdb_req_control_old *c;
        /* status sent in the reply; starts at -1 and is overwritten
         * with the result byte read from the child */
        int status;
        /* pipe: fd[0] is read by the parent, fd[1] written by the child */
        int fd[2];
        /* pid of the forked child; killed by the destructor */
        pid_t child;
        /* read event registered on fd[0] */
        struct tevent_fd *fde;
};
4332
4333 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4334 {
4335         if (h == h->ctdb->reload_ips) {
4336                 h->ctdb->reload_ips = NULL;
4337         }
4338         if (h->c != NULL) {
4339                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4340                 h->c = NULL;
4341         }
4342         ctdb_kill(h->ctdb, h->child, SIGKILL);
4343         return 0;
4344 }
4345
4346 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4347                                          struct tevent_timer *te,
4348                                          struct timeval t, void *private_data)
4349 {
4350         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4351
4352         talloc_free(h);
4353 }
4354
4355 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4356                                          struct tevent_fd *fde,
4357                                          uint16_t flags, void *private_data)
4358 {
4359         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4360
4361         char res;
4362         int ret;
4363
4364         ret = sys_read(h->fd[0], &res, 1);
4365         if (ret < 1 || res != 0) {
4366                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4367                 res = 1;
4368         }
4369         h->status = res;
4370
4371         talloc_free(h);
4372 }
4373
4374 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4375 {
4376         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4377         struct ctdb_public_ip_list_old *ips;
4378         struct ctdb_vnn *vnn;
4379         struct client_async_data *async_data;
4380         struct timeval timeout;
4381         TDB_DATA data;
4382         struct ctdb_client_control_state *state;
4383         bool first_add;
4384         int i, ret;
4385
4386         CTDB_NO_MEMORY(ctdb, mem_ctx);
4387
4388         /* Read IPs from local node */
4389         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4390                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4391         if (ret != 0) {
4392                 DEBUG(DEBUG_ERR,
4393                       ("Unable to fetch public IPs from local node\n"));
4394                 talloc_free(mem_ctx);
4395                 return -1;
4396         }
4397
4398         /* Read IPs file - this is safe since this is a child process */
4399         ctdb->vnn = NULL;
4400         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4401                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4402                 talloc_free(mem_ctx);
4403                 return -1;
4404         }
4405
4406         async_data = talloc_zero(mem_ctx, struct client_async_data);
4407         CTDB_NO_MEMORY(ctdb, async_data);
4408
4409         /* Compare IPs between node and file for IPs to be deleted */
4410         for (i = 0; i < ips->num; i++) {
4411                 /* */
4412                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4413                         if (ctdb_same_ip(&vnn->public_address,
4414                                          &ips->ips[i].addr)) {
4415                                 /* IP is still in file */
4416                                 break;
4417                         }
4418                 }
4419
4420                 if (vnn == NULL) {
4421                         /* Delete IP ips->ips[i] */
4422                         struct ctdb_addr_info_old *pub;
4423
4424                         DEBUG(DEBUG_NOTICE,
4425                               ("IP %s no longer configured, deleting it\n",
4426                                ctdb_addr_to_str(&ips->ips[i].addr)));
4427
4428                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4429                         CTDB_NO_MEMORY(ctdb, pub);
4430
4431                         pub->addr  = ips->ips[i].addr;
4432                         pub->mask  = 0;
4433                         pub->len   = 0;
4434
4435                         timeout = TAKEOVER_TIMEOUT();
4436
4437                         data.dsize = offsetof(struct ctdb_addr_info_old,
4438                                               iface) + pub->len;
4439                         data.dptr = (uint8_t *)pub;
4440
4441                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4442                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4443                                                   0, data, async_data,
4444                                                   &timeout, NULL);
4445                         if (state == NULL) {
4446                                 DEBUG(DEBUG_ERR,
4447                                       (__location__
4448                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4449                                 goto failed;
4450                         }
4451
4452                         ctdb_client_async_add(async_data, state);
4453                 }
4454         }
4455
4456         /* Compare IPs between node and file for IPs to be added */
4457         first_add = true;
4458         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4459                 for (i = 0; i < ips->num; i++) {
4460                         if (ctdb_same_ip(&vnn->public_address,
4461                                          &ips->ips[i].addr)) {
4462                                 /* IP already on node */
4463                                 break;
4464                         }
4465                 }
4466                 if (i == ips->num) {
4467                         /* Add IP ips->ips[i] */
4468                         struct ctdb_addr_info_old *pub;
4469                         const char *ifaces = NULL;
4470                         uint32_t len;
4471                         int iface = 0;
4472
4473                         DEBUG(DEBUG_NOTICE,
4474                               ("New IP %s configured, adding it\n",
4475                                ctdb_addr_to_str(&vnn->public_address)));
4476                         if (first_add) {
4477                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4478
4479                                 data.dsize = sizeof(pnn);
4480                                 data.dptr  = (uint8_t *)&pnn;
4481
4482                                 ret = ctdb_client_send_message(
4483                                         ctdb,
4484                                         CTDB_BROADCAST_CONNECTED,
4485                                         CTDB_SRVID_REBALANCE_NODE,
4486                                         data);
4487                                 if (ret != 0) {
4488                                         DEBUG(DEBUG_WARNING,
4489                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4490                                 }
4491
4492                                 first_add = false;
4493                         }
4494
4495                         ifaces = vnn->ifaces[0];
4496                         iface = 1;
4497                         while (vnn->ifaces[iface] != NULL) {
4498                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4499                                                          vnn->ifaces[iface]);
4500                                 iface++;
4501                         }
4502
4503                         len   = strlen(ifaces) + 1;
4504                         pub = talloc_zero_size(mem_ctx,
4505                                                offsetof(struct ctdb_addr_info_old, iface) + len);
4506                         CTDB_NO_MEMORY(ctdb, pub);
4507
4508                         pub->addr  = vnn->public_address;
4509                         pub->mask  = vnn->public_netmask_bits;
4510                         pub->len   = len;
4511                         memcpy(&pub->iface[0], ifaces, pub->len);
4512
4513                         timeout = TAKEOVER_TIMEOUT();
4514
4515                         data.dsize = offsetof(struct ctdb_addr_info_old,
4516                                               iface) + pub->len;
4517                         data.dptr = (uint8_t *)pub;
4518
4519                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4520                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4521                                                   0, data, async_data,
4522                                                   &timeout, NULL);
4523                         if (state == NULL) {
4524                                 DEBUG(DEBUG_ERR,
4525                                       (__location__
4526                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4527                                 goto failed;
4528                         }
4529
4530                         ctdb_client_async_add(async_data, state);
4531                 }
4532         }
4533
4534         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4535                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4536                 goto failed;
4537         }
4538
4539         talloc_free(mem_ctx);
4540         return 0;
4541
4542 failed:
4543         talloc_free(mem_ctx);
4544         return -1;
4545 }
4546
4547 /* This control is sent to force the node to re-read the public addresses file
4548    and drop any addresses we should nnot longer host, and add new addresses
4549    that we are now able to host
4550 */
4551 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4552 {
4553         struct ctdb_reloadips_handle *h;
4554         pid_t parent = getpid();
4555
4556         if (ctdb->reload_ips != NULL) {
4557                 talloc_free(ctdb->reload_ips);
4558                 ctdb->reload_ips = NULL;
4559         }
4560
4561         h = talloc(ctdb, struct ctdb_reloadips_handle);
4562         CTDB_NO_MEMORY(ctdb, h);
4563         h->ctdb     = ctdb;
4564         h->c        = NULL;
4565         h->status   = -1;
4566         
4567         if (pipe(h->fd) == -1) {
4568                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4569                 talloc_free(h);
4570                 return -1;
4571         }
4572
4573         h->child = ctdb_fork(ctdb);
4574         if (h->child == (pid_t)-1) {
4575                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4576                 close(h->fd[0]);
4577                 close(h->fd[1]);
4578                 talloc_free(h);
4579                 return -1;
4580         }
4581
4582         /* child process */
4583         if (h->child == 0) {
4584                 signed char res = 0;
4585
4586                 close(h->fd[0]);
4587                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4588
4589                 prctl_set_comment("ctdb_reloadips");
4590                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4591                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4592                         res = -1;
4593                 } else {
4594                         res = ctdb_reloadips_child(ctdb);
4595                         if (res != 0) {
4596                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4597                         }
4598                 }
4599
4600                 sys_write(h->fd[1], &res, 1);
4601                 /* make sure we die when our parent dies */
4602                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4603                         sleep(5);
4604                 }
4605                 _exit(0);
4606         }
4607
4608         h->c             = talloc_steal(h, c);
4609
4610         close(h->fd[1]);
4611         set_close_on_exec(h->fd[0]);
4612
4613         talloc_set_destructor(h, ctdb_reloadips_destructor);
4614
4615
4616         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4617                                ctdb_reloadips_child_handler, (void *)h);
4618         tevent_fd_set_auto_close(h->fde);
4619
4620         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4621                          ctdb_reloadips_timeout_event, h);
4622
4623         /* we reply later */
4624         *async_reply = true;
4625         return 0;
4626 }