ctdb-recoverd: Fix typo in comment
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value built from the takeover_timeout tunable.  The macro
 * expands against a variable named "ctdb", which must be in scope
 * wherever it is used.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* First argument to timeval_current_ofs() when the ARP timer is re-armed. */
#define CTDB_ARP_INTERVAL       1
/* Number of times the ARP handler runs before it frees its state. */
#define CTDB_ARP_REPEAT         3
35
/*
 * Per-node flags consumed by the IP allocation algorithms.
 *
 * NOTE(review): neither field is referenced in this part of the file;
 * the meanings below are presumed from the names - confirm against
 * the allocation code.
 */
struct ctdb_ipflags {
        bool noiptakeover;      /* presumably: node must not take over IPs */
        bool noiphost;          /* presumably: node must not host IPs */
};
41
/*
 * One network interface known to this daemon.  Entries are kept on
 * the doubly linked list headed by ctdb->ifaces.
 */
struct ctdb_iface {
        struct ctdb_iface *prev;        /* list linkage */
        struct ctdb_iface *next;        /* list linkage */
        const char *name;               /* interface name, owned by this entry */
        bool link_up;                   /* only link-up interfaces get IPs */
        uint32_t references;            /* number of VNNs assigned here */
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn)
122 {
123         struct ctdb_iface *i, *next;
124
125         /* For each interface, check if there's an IP using it. */
126         for (i = ctdb->ifaces; i != NULL; i = next) {
127                 struct ctdb_vnn *tv;
128                 bool found;
129                 next = i->next;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         talloc_free(i);
156                 }
157         }
158 }
159
160
161 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
162                                           const char *iface)
163 {
164         struct ctdb_iface *i;
165
166         for (i=ctdb->ifaces;i;i=i->next) {
167                 if (strcmp(i->name, iface) == 0) {
168                         return i;
169                 }
170         }
171
172         return NULL;
173 }
174
175 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
176                                               struct ctdb_vnn *vnn)
177 {
178         int i;
179         struct ctdb_iface *cur = NULL;
180         struct ctdb_iface *best = NULL;
181
182         for (i=0; vnn->ifaces[i]; i++) {
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (!cur->link_up) {
190                         continue;
191                 }
192
193                 if (best == NULL) {
194                         best = cur;
195                         continue;
196                 }
197
198                 if (cur->references < best->references) {
199                         best = cur;
200                         continue;
201                 }
202         }
203
204         return best;
205 }
206
207 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
208                                      struct ctdb_vnn *vnn)
209 {
210         struct ctdb_iface *best = NULL;
211
212         if (vnn->iface) {
213                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
214                                    "still assigned to iface '%s'\n",
215                                    ctdb_addr_to_str(&vnn->public_address),
216                                    ctdb_vnn_iface_string(vnn)));
217                 return 0;
218         }
219
220         best = ctdb_vnn_best_iface(ctdb, vnn);
221         if (best == NULL) {
222                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
223                                   "cannot assign to iface any iface\n",
224                                   ctdb_addr_to_str(&vnn->public_address)));
225                 return -1;
226         }
227
228         vnn->iface = best;
229         best->references++;
230         vnn->pnn = ctdb->pnn;
231
232         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                            "now assigned to iface '%s' refs[%d]\n",
234                            ctdb_addr_to_str(&vnn->public_address),
235                            ctdb_vnn_iface_string(vnn),
236                            best->references));
237         return 0;
238 }
239
240 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
241                                     struct ctdb_vnn *vnn)
242 {
243         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
244                            "now unassigned (old iface '%s' refs[%d])\n",
245                            ctdb_addr_to_str(&vnn->public_address),
246                            ctdb_vnn_iface_string(vnn),
247                            vnn->iface?vnn->iface->references:0));
248         if (vnn->iface) {
249                 vnn->iface->references--;
250         }
251         vnn->iface = NULL;
252         if (vnn->pnn == ctdb->pnn) {
253                 vnn->pnn = -1;
254         }
255 }
256
257 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
258                                struct ctdb_vnn *vnn)
259 {
260         int i;
261
262         if (vnn->delete_pending) {
263                 return false;
264         }
265
266         if (vnn->iface && vnn->iface->link_up) {
267                 return true;
268         }
269
270         for (i=0; vnn->ifaces[i]; i++) {
271                 struct ctdb_iface *cur;
272
273                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
274                 if (cur == NULL) {
275                         continue;
276                 }
277
278                 if (cur->link_up) {
279                         return true;
280                 }
281         }
282
283         return false;
284 }
285
286 struct ctdb_takeover_arp {
287         struct ctdb_context *ctdb;
288         uint32_t count;
289         ctdb_sock_addr addr;
290         struct ctdb_tcp_array *tcparray;
291         struct ctdb_vnn *vnn;
292 };
293
294
295 /*
296   lists of tcp endpoints
297  */
298 struct ctdb_tcp_list {
299         struct ctdb_tcp_list *prev, *next;
300         struct ctdb_tcp_connection connection;
301 };
302
303 /*
304   list of clients to kill on IP release
305  */
306 struct ctdb_client_ip {
307         struct ctdb_client_ip *prev, *next;
308         struct ctdb_context *ctdb;
309         ctdb_sock_addr addr;
310         uint32_t client_id;
311 };
312
313
314 /*
315   send a gratuitous arp
316  */
317 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
318                                   struct timeval t, void *private_data)
319 {
320         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
321                                                         struct ctdb_takeover_arp);
322         int i, ret;
323         struct ctdb_tcp_array *tcparray;
324         const char *iface = ctdb_vnn_iface_string(arp->vnn);
325
326         ret = ctdb_sys_send_arp(&arp->addr, iface);
327         if (ret != 0) {
328                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
329                                   iface, strerror(errno)));
330         }
331
332         tcparray = arp->tcparray;
333         if (tcparray) {
334                 for (i=0;i<tcparray->num;i++) {
335                         struct ctdb_tcp_connection *tcon;
336
337                         tcon = &tcparray->connections[i];
338                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
339                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
340                                 ctdb_addr_to_str(&tcon->src_addr),
341                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
342                         ret = ctdb_sys_send_tcp(
343                                 &tcon->src_addr, 
344                                 &tcon->dst_addr,
345                                 0, 0, 0);
346                         if (ret != 0) {
347                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
348                                         ctdb_addr_to_str(&tcon->src_addr)));
349                         }
350                 }
351         }
352
353         arp->count++;
354
355         if (arp->count == CTDB_ARP_REPEAT) {
356                 talloc_free(arp);
357                 return;
358         }
359
360         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
361                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
362                         ctdb_control_send_arp, arp);
363 }
364
365 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
366                                        struct ctdb_vnn *vnn)
367 {
368         struct ctdb_takeover_arp *arp;
369         struct ctdb_tcp_array *tcparray;
370
371         if (!vnn->takeover_ctx) {
372                 vnn->takeover_ctx = talloc_new(vnn);
373                 if (!vnn->takeover_ctx) {
374                         return -1;
375                 }
376         }
377
378         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
379         if (!arp) {
380                 return -1;
381         }
382
383         arp->ctdb = ctdb;
384         arp->addr = vnn->public_address;
385         arp->vnn  = vnn;
386
387         tcparray = vnn->tcp_array;
388         if (tcparray) {
389                 /* add all of the known tcp connections for this IP to the
390                    list of tcp connections to send tickle acks for */
391                 arp->tcparray = talloc_steal(arp, tcparray);
392
393                 vnn->tcp_array = NULL;
394                 vnn->tcp_update_needed = true;
395         }
396
397         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
398                         timeval_zero(), ctdb_control_send_arp, arp);
399
400         return 0;
401 }
402
403 struct takeover_callback_state {
404         struct ctdb_req_control *c;
405         ctdb_sock_addr *addr;
406         struct ctdb_vnn *vnn;
407 };
408
/*
 * State handed from ctdb_do_takeip() to ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
        struct ctdb_req_control *c;     /* control request to reply to */
        struct ctdb_vnn *vnn;           /* VNN being taken over */
};
413
414 /*
415   called when takeip event finishes
416  */
417 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
418                                     void *private_data)
419 {
420         struct ctdb_do_takeip_state *state =
421                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
422         int32_t ret;
423         TDB_DATA data;
424
425         if (status != 0) {
426                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
427         
428                 if (status == -ETIME) {
429                         ctdb_ban_self(ctdb);
430                 }
431                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
432                                  ctdb_addr_to_str(&state->vnn->public_address),
433                                  ctdb_vnn_iface_string(state->vnn)));
434                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
435
436                 node->flags |= NODE_FLAGS_UNHEALTHY;
437                 talloc_free(state);
438                 return;
439         }
440
441         if (ctdb->do_checkpublicip) {
442
443         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
444         if (ret != 0) {
445                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
446                 talloc_free(state);
447                 return;
448         }
449
450         }
451
452         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
453         data.dsize = strlen((char *)data.dptr) + 1;
454         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
455
456         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
457
458
459         /* the control succeeded */
460         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
461         talloc_free(state);
462         return;
463 }
464
465 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
466 {
467         state->vnn->update_in_flight = false;
468         return 0;
469 }
470
471 /*
472   take over an ip address
473  */
474 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
475                               struct ctdb_req_control *c,
476                               struct ctdb_vnn *vnn)
477 {
478         int ret;
479         struct ctdb_do_takeip_state *state;
480
481         if (vnn->update_in_flight) {
482                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
483                                     "update for this IP already in flight\n",
484                                     ctdb_addr_to_str(&vnn->public_address),
485                                     vnn->public_netmask_bits));
486                 return -1;
487         }
488
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
492                                  "assign a usable interface\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits));
495                 return -1;
496         }
497
498         state = talloc(vnn, struct ctdb_do_takeip_state);
499         CTDB_NO_MEMORY(ctdb, state);
500
501         state->c = talloc_steal(ctdb, c);
502         state->vnn   = vnn;
503
504         vnn->update_in_flight = true;
505         talloc_set_destructor(state, ctdb_takeip_destructor);
506
507         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
508                             ctdb_addr_to_str(&vnn->public_address),
509                             vnn->public_netmask_bits,
510                             ctdb_vnn_iface_string(vnn)));
511
512         ret = ctdb_event_script_callback(ctdb,
513                                          state,
514                                          ctdb_do_takeip_callback,
515                                          state,
516                                          CTDB_EVENT_TAKE_IP,
517                                          "%s %s %u",
518                                          ctdb_vnn_iface_string(vnn),
519                                          ctdb_addr_to_str(&vnn->public_address),
520                                          vnn->public_netmask_bits);
521
522         if (ret != 0) {
523                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
524                         ctdb_addr_to_str(&vnn->public_address),
525                         ctdb_vnn_iface_string(vnn)));
526                 talloc_free(state);
527                 return -1;
528         }
529
530         return 0;
531 }
532
/*
 * State handed from ctdb_do_updateip() to ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
        struct ctdb_req_control *c;     /* control request to reply to */
        struct ctdb_iface *old;         /* interface the IP is moving away from */
        struct ctdb_vnn *vnn;           /* VNN being moved */
};
538
539 /*
540   called when updateip event finishes
541  */
542 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
543                                       void *private_data)
544 {
545         struct ctdb_do_updateip_state *state =
546                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
547         int32_t ret;
548
549         if (status != 0) {
550                 if (status == -ETIME) {
551                         ctdb_ban_self(ctdb);
552                 }
553                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
554                         ctdb_addr_to_str(&state->vnn->public_address),
555                         state->old->name,
556                         ctdb_vnn_iface_string(state->vnn)));
557
558                 /*
559                  * All we can do is reset the old interface
560                  * and let the next run fix it
561                  */
562                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
563                 state->vnn->iface = state->old;
564                 state->vnn->iface->references++;
565
566                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
567                 talloc_free(state);
568                 return;
569         }
570
571         if (ctdb->do_checkpublicip) {
572
573         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
574         if (ret != 0) {
575                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
576                 talloc_free(state);
577                 return;
578         }
579
580         }
581
582         /* the control succeeded */
583         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
584         talloc_free(state);
585         return;
586 }
587
588 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
589 {
590         state->vnn->update_in_flight = false;
591         return 0;
592 }
593
594 /*
595   update (move) an ip address
596  */
597 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
598                                 struct ctdb_req_control *c,
599                                 struct ctdb_vnn *vnn)
600 {
601         int ret;
602         struct ctdb_do_updateip_state *state;
603         struct ctdb_iface *old = vnn->iface;
604         const char *new_name;
605
606         if (vnn->update_in_flight) {
607                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
608                                     "update for this IP already in flight\n",
609                                     ctdb_addr_to_str(&vnn->public_address),
610                                     vnn->public_netmask_bits));
611                 return -1;
612         }
613
614         ctdb_vnn_unassign_iface(ctdb, vnn);
615         ret = ctdb_vnn_assign_iface(ctdb, vnn);
616         if (ret != 0) {
617                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
618                                  "assin a usable interface (old iface '%s')\n",
619                                  ctdb_addr_to_str(&vnn->public_address),
620                                  vnn->public_netmask_bits,
621                                  old->name));
622                 return -1;
623         }
624
625         new_name = ctdb_vnn_iface_string(vnn);
626         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
627                 /* A benign update from one interface onto itself.
628                  * no need to run the eventscripts in this case, just return
629                  * success.
630                  */
631                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
632                 return 0;
633         }
634
635         state = talloc(vnn, struct ctdb_do_updateip_state);
636         CTDB_NO_MEMORY(ctdb, state);
637
638         state->c = talloc_steal(ctdb, c);
639         state->old = old;
640         state->vnn = vnn;
641
642         vnn->update_in_flight = true;
643         talloc_set_destructor(state, ctdb_updateip_destructor);
644
645         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
646                             "interface %s to %s\n",
647                             ctdb_addr_to_str(&vnn->public_address),
648                             vnn->public_netmask_bits,
649                             old->name,
650                             new_name));
651
652         ret = ctdb_event_script_callback(ctdb,
653                                          state,
654                                          ctdb_do_updateip_callback,
655                                          state,
656                                          CTDB_EVENT_UPDATE_IP,
657                                          "%s %s %s %u",
658                                          state->old->name,
659                                          new_name,
660                                          ctdb_addr_to_str(&vnn->public_address),
661                                          vnn->public_netmask_bits);
662         if (ret != 0) {
663                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
664                                  ctdb_addr_to_str(&vnn->public_address),
665                                  old->name, new_name));
666                 talloc_free(state);
667                 return -1;
668         }
669
670         return 0;
671 }
672
673 /*
674   Find the vnn of the node that has a public ip address
675   returns -1 if the address is not known as a public address
676  */
677 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
678 {
679         struct ctdb_vnn *vnn;
680
681         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
682                 if (ctdb_same_ip(&vnn->public_address, addr)) {
683                         return vnn;
684                 }
685         }
686
687         return NULL;
688 }
689
690 /*
691   take over an ip address
692  */
693 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
694                                  struct ctdb_req_control *c,
695                                  TDB_DATA indata,
696                                  bool *async_reply)
697 {
698         int ret;
699         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
700         struct ctdb_vnn *vnn;
701         bool have_ip = false;
702         bool do_updateip = false;
703         bool do_takeip = false;
704         struct ctdb_iface *best_iface = NULL;
705
706         if (pip->pnn != ctdb->pnn) {
707                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
708                                  "with pnn %d, but we're node %d\n",
709                                  ctdb_addr_to_str(&pip->addr),
710                                  pip->pnn, ctdb->pnn));
711                 return -1;
712         }
713
714         /* update out vnn list */
715         vnn = find_public_ip_vnn(ctdb, &pip->addr);
716         if (vnn == NULL) {
717                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
718                         ctdb_addr_to_str(&pip->addr)));
719                 return 0;
720         }
721
722         if (ctdb->do_checkpublicip) {
723                 have_ip = ctdb_sys_have_ip(&pip->addr);
724         }
725         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
726         if (best_iface == NULL) {
727                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
728                                  "a usable interface (old %s, have_ip %d)\n",
729                                  ctdb_addr_to_str(&vnn->public_address),
730                                  vnn->public_netmask_bits,
731                                  ctdb_vnn_iface_string(vnn),
732                                  have_ip));
733                 return -1;
734         }
735
736         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
737                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
738                 have_ip = false;
739         }
740
741
742         if (vnn->iface == NULL && have_ip) {
743                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
744                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
745                                  ctdb_addr_to_str(&vnn->public_address)));
746                 return 0;
747         }
748
749         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
750                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
751                                   "and we have it on iface[%s], but it was assigned to node %d"
752                                   "and we are node %d, banning ourself\n",
753                                  ctdb_addr_to_str(&vnn->public_address),
754                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
755                 ctdb_ban_self(ctdb);
756                 return -1;
757         }
758
759         if (vnn->pnn == -1 && have_ip) {
760                 vnn->pnn = ctdb->pnn;
761                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
762                                   "and we already have it on iface[%s], update local daemon\n",
763                                  ctdb_addr_to_str(&vnn->public_address),
764                                   ctdb_vnn_iface_string(vnn)));
765                 return 0;
766         }
767
768         if (vnn->iface) {
769                 if (vnn->iface != best_iface) {
770                         if (!vnn->iface->link_up) {
771                                 do_updateip = true;
772                         } else if (vnn->iface->references > (best_iface->references + 1)) {
773                                 /* only move when the rebalance gains something */
774                                         do_updateip = true;
775                         }
776                 }
777         }
778
779         if (!have_ip) {
780                 if (do_updateip) {
781                         ctdb_vnn_unassign_iface(ctdb, vnn);
782                         do_updateip = false;
783                 }
784                 do_takeip = true;
785         }
786
787         if (do_takeip) {
788                 ret = ctdb_do_takeip(ctdb, c, vnn);
789                 if (ret != 0) {
790                         return -1;
791                 }
792         } else if (do_updateip) {
793                 ret = ctdb_do_updateip(ctdb, c, vnn);
794                 if (ret != 0) {
795                         return -1;
796                 }
797         } else {
798                 /*
799                  * The interface is up and the kernel known the ip
800                  * => do nothing
801                  */
802                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits,
805                         ctdb_vnn_iface_string(vnn)));
806                 return 0;
807         }
808
809         /* tell ctdb_control.c that we will be replying asynchronously */
810         *async_reply = true;
811
812         return 0;
813 }
814
815 /*
816   takeover an ip address old v4 style
817  */
818 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
819                                 struct ctdb_req_control *c,
820                                 TDB_DATA indata, 
821                                 bool *async_reply)
822 {
823         TDB_DATA data;
824         
825         data.dsize = sizeof(struct ctdb_public_ip);
826         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
827         CTDB_NO_MEMORY(ctdb, data.dptr);
828         
829         memcpy(data.dptr, indata.dptr, indata.dsize);
830         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
831 }
832
833 /*
834   kill any clients that are registered with a IP that is being released
835  */
836 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
837 {
838         struct ctdb_client_ip *ip;
839
840         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
841                 ctdb_addr_to_str(addr)));
842
843         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
844                 ctdb_sock_addr tmp_addr;
845
846                 tmp_addr = ip->addr;
847                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
848                         ip->client_id,
849                         ctdb_addr_to_str(&ip->addr)));
850
851                 if (ctdb_same_ip(&tmp_addr, addr)) {
852                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
853                                                                      ip->client_id, 
854                                                                      struct ctdb_client);
855                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
856                                 ip->client_id,
857                                 ctdb_addr_to_str(&ip->addr),
858                                 client->pid));
859
860                         if (client->pid != 0) {
861                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
862                                         (unsigned)client->pid,
863                                         ctdb_addr_to_str(addr),
864                                         ip->client_id));
865                                 kill(client->pid, SIGKILL);
866                         }
867                 }
868         }
869 }
870
/*
  Remove a public address from this node: unlink 'vnn' from the
  ctdb->vnn list, drop its interface assignment, let
  ctdb_remove_orphaned_ifaces() clean up afterwards and finally free
  the vnn.  'vnn' must not be used by the caller after this returns.
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        /* frees the vnn itself; the pointer is dangling from here on */
        talloc_free(vnn);
}
878
879 /*
880   called when releaseip event finishes
881  */
882 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
883                                 void *private_data)
884 {
885         struct takeover_callback_state *state = 
886                 talloc_get_type(private_data, struct takeover_callback_state);
887         TDB_DATA data;
888
889         if (status == -ETIME) {
890                 ctdb_ban_self(ctdb);
891         }
892
893         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
894                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
895                                   ctdb_addr_to_str(state->addr)));
896                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
897                 talloc_free(state);
898                 return;
899         }
900
901         /* send a message to all clients of this node telling them
902            that the cluster has been reconfigured and they should
903            release any sockets on this IP */
904         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
905         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
906         data.dsize = strlen((char *)data.dptr)+1;
907
908         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
909
910         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
911
912         /* kill clients that have registered with this IP */
913         release_kill_clients(ctdb, state->addr);
914
915         ctdb_vnn_unassign_iface(ctdb, state->vnn);
916
917         /* Process the IP if it has been marked for deletion */
918         if (state->vnn->delete_pending) {
919                 do_delete_ip(ctdb, state->vnn);
920                 state->vnn = NULL;
921         }
922
923         /* the control succeeded */
924         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
925         talloc_free(state);
926 }
927
928 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
929 {
930         if (state->vnn != NULL) {
931                 state->vnn->update_in_flight = false;
932         }
933         return 0;
934 }
935
936 /*
937   release an ip address
938  */
939 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
940                                 struct ctdb_req_control *c,
941                                 TDB_DATA indata, 
942                                 bool *async_reply)
943 {
944         int ret;
945         struct takeover_callback_state *state;
946         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
947         struct ctdb_vnn *vnn;
948         char *iface;
949
950         /* update our vnn list */
951         vnn = find_public_ip_vnn(ctdb, &pip->addr);
952         if (vnn == NULL) {
953                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
954                         ctdb_addr_to_str(&pip->addr)));
955                 return 0;
956         }
957         vnn->pnn = pip->pnn;
958
959         /* stop any previous arps */
960         talloc_free(vnn->takeover_ctx);
961         vnn->takeover_ctx = NULL;
962
963         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
964          * lazy multicast to drop an IP from any node that isn't the
965          * intended new node.  The following causes makes ctdbd ignore
966          * a release for any address it doesn't host.
967          */
968         if (ctdb->do_checkpublicip) {
969                 if (!ctdb_sys_have_ip(&pip->addr)) {
970                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
971                                 ctdb_addr_to_str(&pip->addr),
972                                 vnn->public_netmask_bits,
973                                 ctdb_vnn_iface_string(vnn)));
974                         ctdb_vnn_unassign_iface(ctdb, vnn);
975                         return 0;
976                 }
977         } else {
978                 if (vnn->iface == NULL) {
979                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
980                                            ctdb_addr_to_str(&pip->addr),
981                                            vnn->public_netmask_bits));
982                         return 0;
983                 }
984         }
985
986         /* There is a potential race between take_ip and us because we
987          * update the VNN via a callback that run when the
988          * eventscripts have been run.  Avoid the race by allowing one
989          * update to be in flight at a time.
990          */
991         if (vnn->update_in_flight) {
992                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
993                                     "update for this IP already in flight\n",
994                                     ctdb_addr_to_str(&vnn->public_address),
995                                     vnn->public_netmask_bits));
996                 return -1;
997         }
998
999         iface = strdup(ctdb_vnn_iface_string(vnn));
1000
1001         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1002                 ctdb_addr_to_str(&pip->addr),
1003                 vnn->public_netmask_bits,
1004                 iface,
1005                 pip->pnn));
1006
1007         state = talloc(ctdb, struct takeover_callback_state);
1008         CTDB_NO_MEMORY(ctdb, state);
1009
1010         state->c = talloc_steal(state, c);
1011         state->addr = talloc(state, ctdb_sock_addr);       
1012         CTDB_NO_MEMORY(ctdb, state->addr);
1013         *state->addr = pip->addr;
1014         state->vnn   = vnn;
1015
1016         vnn->update_in_flight = true;
1017         talloc_set_destructor(state, ctdb_releaseip_destructor);
1018
1019         ret = ctdb_event_script_callback(ctdb, 
1020                                          state, release_ip_callback, state,
1021                                          CTDB_EVENT_RELEASE_IP,
1022                                          "%s %s %u",
1023                                          iface,
1024                                          ctdb_addr_to_str(&pip->addr),
1025                                          vnn->public_netmask_bits);
1026         free(iface);
1027         if (ret != 0) {
1028                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1029                         ctdb_addr_to_str(&pip->addr),
1030                         ctdb_vnn_iface_string(vnn)));
1031                 talloc_free(state);
1032                 return -1;
1033         }
1034
1035         /* tell the control that we will be reply asynchronously */
1036         *async_reply = true;
1037         return 0;
1038 }
1039
1040 /*
1041   release an ip address old v4 style
1042  */
1043 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1044                                 struct ctdb_req_control *c,
1045                                 TDB_DATA indata, 
1046                                 bool *async_reply)
1047 {
1048         TDB_DATA data;
1049         
1050         data.dsize = sizeof(struct ctdb_public_ip);
1051         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1052         CTDB_NO_MEMORY(ctdb, data.dptr);
1053         
1054         memcpy(data.dptr, indata.dptr, indata.dsize);
1055         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1056 }
1057
1058
1059 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1060                                    ctdb_sock_addr *addr,
1061                                    unsigned mask, const char *ifaces,
1062                                    bool check_address)
1063 {
1064         struct ctdb_vnn      *vnn;
1065         uint32_t num = 0;
1066         char *tmp;
1067         const char *iface;
1068         int i;
1069         int ret;
1070
1071         tmp = strdup(ifaces);
1072         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1073                 if (!ctdb_sys_check_iface_exists(iface)) {
1074                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1075                         free(tmp);
1076                         return -1;
1077                 }
1078         }
1079         free(tmp);
1080
1081         /* Verify that we dont have an entry for this ip yet */
1082         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1083                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1084                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1085                                 ctdb_addr_to_str(addr)));
1086                         return -1;
1087                 }               
1088         }
1089
1090         /* create a new vnn structure for this ip address */
1091         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1092         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1093         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1094         tmp = talloc_strdup(vnn, ifaces);
1095         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1096         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1097                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1098                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1099                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1100                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1101                 num++;
1102         }
1103         talloc_free(tmp);
1104         vnn->ifaces[num] = NULL;
1105         vnn->public_address      = *addr;
1106         vnn->public_netmask_bits = mask;
1107         vnn->pnn                 = -1;
1108         if (check_address) {
1109                 if (ctdb_sys_have_ip(addr)) {
1110                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1111                         vnn->pnn = ctdb->pnn;
1112                 }
1113         }
1114
1115         for (i=0; vnn->ifaces[i]; i++) {
1116                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1117                 if (ret != 0) {
1118                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1119                                            "for public_address[%s]\n",
1120                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1121                         talloc_free(vnn);
1122                         return -1;
1123                 }
1124         }
1125
1126         DLIST_ADD(ctdb->vnn, vnn);
1127
1128         return 0;
1129 }
1130
1131 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1132                                   struct timeval t, void *private_data)
1133 {
1134         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1135                                                         struct ctdb_context);
1136         struct ctdb_vnn *vnn;
1137
1138         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1139                 int i;
1140
1141                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1142                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1143                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1144                                         vnn->ifaces[i],
1145                                         ctdb_addr_to_str(&vnn->public_address)));
1146                         }
1147                 }
1148         }
1149
1150         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1151                 timeval_current_ofs(30, 0), 
1152                 ctdb_check_interfaces_event, ctdb);
1153 }
1154
1155
1156 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1157 {
1158         if (ctdb->check_public_ifaces_ctx != NULL) {
1159                 talloc_free(ctdb->check_public_ifaces_ctx);
1160                 ctdb->check_public_ifaces_ctx = NULL;
1161         }
1162
1163         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1164         if (ctdb->check_public_ifaces_ctx == NULL) {
1165                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1166         }
1167
1168         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1169                 timeval_current_ofs(30, 0), 
1170                 ctdb_check_interfaces_event, ctdb);
1171
1172         return 0;
1173 }
1174
1175
1176 /*
1177   setup the public address lists from a file
1178 */
1179 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1180 {
1181         char **lines;
1182         int nlines;
1183         int i;
1184
1185         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1186         if (lines == NULL) {
1187                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1188                 return -1;
1189         }
1190         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1191                 nlines--;
1192         }
1193
1194         for (i=0;i<nlines;i++) {
1195                 unsigned mask;
1196                 ctdb_sock_addr addr;
1197                 const char *addrstr;
1198                 const char *ifaces;
1199                 char *tok, *line;
1200
1201                 line = lines[i];
1202                 while ((*line == ' ') || (*line == '\t')) {
1203                         line++;
1204                 }
1205                 if (*line == '#') {
1206                         continue;
1207                 }
1208                 if (strcmp(line, "") == 0) {
1209                         continue;
1210                 }
1211                 tok = strtok(line, " \t");
1212                 addrstr = tok;
1213                 tok = strtok(NULL, " \t");
1214                 if (tok == NULL) {
1215                         if (NULL == ctdb->default_public_interface) {
1216                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1217                                          i+1));
1218                                 talloc_free(lines);
1219                                 return -1;
1220                         }
1221                         ifaces = ctdb->default_public_interface;
1222                 } else {
1223                         ifaces = tok;
1224                 }
1225
1226                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1227                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1228                         talloc_free(lines);
1229                         return -1;
1230                 }
1231                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1232                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1233                         talloc_free(lines);
1234                         return -1;
1235                 }
1236         }
1237
1238
1239         talloc_free(lines);
1240         return 0;
1241 }
1242
1243 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1244                               const char *iface,
1245                               const char *ip)
1246 {
1247         struct ctdb_vnn *svnn;
1248         struct ctdb_iface *cur = NULL;
1249         bool ok;
1250         int ret;
1251
1252         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1253         CTDB_NO_MEMORY(ctdb, svnn);
1254
1255         svnn->ifaces = talloc_array(svnn, const char *, 2);
1256         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1257         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1258         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1259         svnn->ifaces[1] = NULL;
1260
1261         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1262         if (!ok) {
1263                 talloc_free(svnn);
1264                 return -1;
1265         }
1266
1267         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1268         if (ret != 0) {
1269                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1270                                    "for single_ip[%s]\n",
1271                                    svnn->ifaces[0],
1272                                    ctdb_addr_to_str(&svnn->public_address)));
1273                 talloc_free(svnn);
1274                 return -1;
1275         }
1276
1277         /* assume the single public ip interface is initially "good" */
1278         cur = ctdb_find_iface(ctdb, iface);
1279         if (cur == NULL) {
1280                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1281                 return -1;
1282         }
1283         cur->link_up = true;
1284
1285         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1286         if (ret != 0) {
1287                 talloc_free(svnn);
1288                 return -1;
1289         }
1290
1291         ctdb->single_ip_vnn = svnn;
1292         return 0;
1293 }
1294
/* One public address in the singly linked list used by the IP
 * allocation code in this file. */
struct ctdb_public_ip_list {
        /* next entry, NULL at the end of the list */
        struct ctdb_public_ip_list *next;
        /* node the address is assigned to; the allocation code in
         * this file compares against -1 for "not assigned" */
        uint32_t pnn;
        /* the public address itself */
        ctdb_sock_addr addr;
};
1300
1301 /* Given a physical node, return the number of
1302    public addresses that is currently assigned to this node.
1303 */
1304 static int node_ip_coverage(struct ctdb_context *ctdb, 
1305         int32_t pnn,
1306         struct ctdb_public_ip_list *ips)
1307 {
1308         int num=0;
1309
1310         for (;ips;ips=ips->next) {
1311                 if (ips->pnn == pnn) {
1312                         num++;
1313                 }
1314         }
1315         return num;
1316 }
1317
1318
1319 /* Can the given node host the given IP: is the public IP known to the
1320  * node and is NOIPHOST unset?
1321 */
1322 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1323                              struct ctdb_ipflags ipflags,
1324                              struct ctdb_public_ip_list *ip)
1325 {
1326         struct ctdb_all_public_ips *public_ips;
1327         int i;
1328
1329         if (ipflags.noiphost) {
1330                 return false;
1331         }
1332
1333         public_ips = ctdb->nodes[pnn]->available_public_ips;
1334
1335         if (public_ips == NULL) {
1336                 return false;
1337         }
1338
1339         for (i=0; i<public_ips->num; i++) {
1340                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1341                         /* yes, this node can serve this public ip */
1342                         return true;
1343                 }
1344         }
1345
1346         return false;
1347 }
1348
1349 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1350                                  struct ctdb_ipflags ipflags,
1351                                  struct ctdb_public_ip_list *ip)
1352 {
1353         if (ipflags.noiptakeover) {
1354                 return false;
1355         }
1356
1357         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1358 }
1359
1360 /* search the node lists list for a node to takeover this ip.
1361    pick the node that currently are serving the least number of ips
1362    so that the ips get spread out evenly.
1363 */
1364 static int find_takeover_node(struct ctdb_context *ctdb, 
1365                 struct ctdb_ipflags *ipflags,
1366                 struct ctdb_public_ip_list *ip,
1367                 struct ctdb_public_ip_list *all_ips)
1368 {
1369         int pnn, min=0, num;
1370         int i, numnodes;
1371
1372         numnodes = talloc_array_length(ipflags);
1373         pnn    = -1;
1374         for (i=0; i<numnodes; i++) {
1375                 /* verify that this node can serve this ip */
1376                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1377                         /* no it couldnt   so skip to the next node */
1378                         continue;
1379                 }
1380
1381                 num = node_ip_coverage(ctdb, i, all_ips);
1382                 /* was this the first node we checked ? */
1383                 if (pnn == -1) {
1384                         pnn = i;
1385                         min  = num;
1386                 } else {
1387                         if (num < min) {
1388                                 pnn = i;
1389                                 min  = num;
1390                         }
1391                 }
1392         }       
1393         if (pnn == -1) {
1394                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1395                         ctdb_addr_to_str(&ip->addr)));
1396
1397                 return -1;
1398         }
1399
1400         ip->pnn = pnn;
1401         return 0;
1402 }
1403
1404 #define IP_KEYLEN       4
1405 static uint32_t *ip_key(ctdb_sock_addr *ip)
1406 {
1407         static uint32_t key[IP_KEYLEN];
1408
1409         bzero(key, sizeof(key));
1410
1411         switch (ip->sa.sa_family) {
1412         case AF_INET:
1413                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1414                 break;
1415         case AF_INET6: {
1416                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1417                 key[0]  = htonl(s6_a32[0]);
1418                 key[1]  = htonl(s6_a32[1]);
1419                 key[2]  = htonl(s6_a32[2]);
1420                 key[3]  = htonl(s6_a32[3]);
1421                 break;
1422         }
1423         default:
1424                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1425                 return key;
1426         }
1427
1428         return key;
1429 }
1430
1431 static void *add_ip_callback(void *parm, void *data)
1432 {
1433         struct ctdb_public_ip_list *this_ip = parm; 
1434         struct ctdb_public_ip_list *prev_ip = data; 
1435
1436         if (prev_ip == NULL) {
1437                 return parm;
1438         }
1439         if (this_ip->pnn == -1) {
1440                 this_ip->pnn = prev_ip->pnn;
1441         }
1442
1443         return parm;
1444 }
1445
1446 static int getips_count_callback(void *param, void *data)
1447 {
1448         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1449         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1450
1451         new_ip->next = *ip_list;
1452         *ip_list     = new_ip;
1453         return 0;
1454 }
1455
1456 static struct ctdb_public_ip_list *
1457 create_merged_ip_list(struct ctdb_context *ctdb)
1458 {
1459         int i, j;
1460         struct ctdb_public_ip_list *ip_list;
1461         struct ctdb_all_public_ips *public_ips;
1462
1463         if (ctdb->ip_tree != NULL) {
1464                 talloc_free(ctdb->ip_tree);
1465                 ctdb->ip_tree = NULL;
1466         }
1467         ctdb->ip_tree = trbt_create(ctdb, 0);
1468
1469         for (i=0;i<ctdb->num_nodes;i++) {
1470                 public_ips = ctdb->nodes[i]->known_public_ips;
1471
1472                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1473                         continue;
1474                 }
1475
1476                 /* there were no public ips for this node */
1477                 if (public_ips == NULL) {
1478                         continue;
1479                 }               
1480
1481                 for (j=0;j<public_ips->num;j++) {
1482                         struct ctdb_public_ip_list *tmp_ip; 
1483
1484                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1485                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1486                         /* Do not use information about IP addresses hosted
1487                          * on other nodes, it may not be accurate */
1488                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1489                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1490                         } else {
1491                                 tmp_ip->pnn = -1;
1492                         }
1493                         tmp_ip->addr = public_ips->ips[j].addr;
1494                         tmp_ip->next = NULL;
1495
1496                         trbt_insertarray32_callback(ctdb->ip_tree,
1497                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1498                                 add_ip_callback,
1499                                 tmp_ip);
1500                 }
1501         }
1502
1503         ip_list = NULL;
1504         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1505
1506         return ip_list;
1507 }
1508
1509 /* 
1510  * This is the length of the longtest common prefix between the IPs.
1511  * It is calculated by XOR-ing the 2 IPs together and counting the
1512  * number of leading zeroes.  The implementation means that all
1513  * addresses end up being 128 bits long.
1514  *
1515  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1516  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1517  * lots of nodes and IP addresses?
1518  */
1519 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1520 {
1521         uint32_t ip1_k[IP_KEYLEN];
1522         uint32_t *t;
1523         int i;
1524         uint32_t x;
1525
1526         uint32_t distance = 0;
1527
1528         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1529         t = ip_key(ip2);
1530         for (i=0; i<IP_KEYLEN; i++) {
1531                 x = ip1_k[i] ^ t[i];
1532                 if (x == 0) {
1533                         distance += 32;
1534                 } else {
1535                         /* Count number of leading zeroes. 
1536                          * FIXME? This could be optimised...
1537                          */
1538                         while ((x & (1 << 31)) == 0) {
1539                                 x <<= 1;
1540                                 distance += 1;
1541                         }
1542                 }
1543         }
1544
1545         return distance;
1546 }
1547
1548 /* Calculate the IP distance for the given IP relative to IPs on the
1549    given node.  The ips argument is generally the all_ips variable
1550    used in the main part of the algorithm.
1551  */
1552 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1553                                   struct ctdb_public_ip_list *ips,
1554                                   int pnn)
1555 {
1556         struct ctdb_public_ip_list *t;
1557         uint32_t d;
1558
1559         uint32_t sum = 0;
1560
1561         for (t=ips; t != NULL; t=t->next) {
1562                 if (t->pnn != pnn) {
1563                         continue;
1564                 }
1565
1566                 /* Optimisation: We never calculate the distance
1567                  * between an address and itself.  This allows us to
1568                  * calculate the effect of removing an address from a
1569                  * node by simply calculating the distance between
1570                  * that address and all of the exitsing addresses.
1571                  * Moreover, we assume that we're only ever dealing
1572                  * with addresses from all_ips so we can identify an
1573                  * address via a pointer rather than doing a more
1574                  * expensive address comparison. */
1575                 if (&(t->addr) == ip) {
1576                         continue;
1577                 }
1578
1579                 d = ip_distance(ip, &(t->addr));
1580                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1581         }
1582
1583         return sum;
1584 }
1585
1586 /* Return the LCP2 imbalance metric for addresses currently assigned
1587    to the given node.
1588  */
1589 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1590 {
1591         struct ctdb_public_ip_list *t;
1592
1593         uint32_t imbalance = 0;
1594
1595         for (t=all_ips; t!=NULL; t=t->next) {
1596                 if (t->pnn != pnn) {
1597                         continue;
1598                 }
1599                 /* Pass the rest of the IPs rather than the whole
1600                    all_ips input list.
1601                 */
1602                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1603         }
1604
1605         return imbalance;
1606 }
1607
1608 /* Allocate any unassigned IPs just by looping through the IPs and
1609  * finding the best node for each.
1610  */
1611 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1612                                       struct ctdb_ipflags *ipflags,
1613                                       struct ctdb_public_ip_list *all_ips)
1614 {
1615         struct ctdb_public_ip_list *tmp_ip;
1616
1617         /* loop over all ip's and find a physical node to cover for 
1618            each unassigned ip.
1619         */
1620         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1621                 if (tmp_ip->pnn == -1) {
1622                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1623                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1624                                         ctdb_addr_to_str(&tmp_ip->addr)));
1625                         }
1626                 }
1627         }
1628 }
1629
1630 /* Basic non-deterministic rebalancing algorithm.
1631  */
1632 static void basic_failback(struct ctdb_context *ctdb,
1633                            struct ctdb_ipflags *ipflags,
1634                            struct ctdb_public_ip_list *all_ips,
1635                            int num_ips)
1636 {
1637         int i, numnodes;
1638         int maxnode, maxnum, minnode, minnum, num, retries;
1639         struct ctdb_public_ip_list *tmp_ip;
1640
1641         numnodes = talloc_array_length(ipflags);
1642         retries = 0;
1643
1644 try_again:
1645         maxnum=0;
1646         minnum=0;
1647
1648         /* for each ip address, loop over all nodes that can serve
1649            this ip and make sure that the difference between the node
1650            serving the most and the node serving the least ip's are
1651            not greater than 1.
1652         */
1653         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1654                 if (tmp_ip->pnn == -1) {
1655                         continue;
1656                 }
1657
1658                 /* Get the highest and lowest number of ips's served by any 
1659                    valid node which can serve this ip.
1660                 */
1661                 maxnode = -1;
1662                 minnode = -1;
1663                 for (i=0; i<numnodes; i++) {
1664                         /* only check nodes that can actually serve this ip */
1665                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1666                                 /* no it couldnt   so skip to the next node */
1667                                 continue;
1668                         }
1669
1670                         num = node_ip_coverage(ctdb, i, all_ips);
1671                         if (maxnode == -1) {
1672                                 maxnode = i;
1673                                 maxnum  = num;
1674                         } else {
1675                                 if (num > maxnum) {
1676                                         maxnode = i;
1677                                         maxnum  = num;
1678                                 }
1679                         }
1680                         if (minnode == -1) {
1681                                 minnode = i;
1682                                 minnum  = num;
1683                         } else {
1684                                 if (num < minnum) {
1685                                         minnode = i;
1686                                         minnum  = num;
1687                                 }
1688                         }
1689                 }
1690                 if (maxnode == -1) {
1691                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1692                                 ctdb_addr_to_str(&tmp_ip->addr)));
1693
1694                         continue;
1695                 }
1696
1697                 /* if the spread between the smallest and largest coverage by
1698                    a node is >=2 we steal one of the ips from the node with
1699                    most coverage to even things out a bit.
1700                    try to do this a limited number of times since we dont
1701                    want to spend too much time balancing the ip coverage.
1702                 */
1703                 if ( (maxnum > minnum+1)
1704                      && (retries < (num_ips + 5)) ){
1705                         struct ctdb_public_ip_list *tmp;
1706
1707                         /* Reassign one of maxnode's VNNs */
1708                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1709                                 if (tmp->pnn == maxnode) {
1710                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1711                                         retries++;
1712                                         goto try_again;;
1713                                 }
1714                         }
1715                 }
1716         }
1717 }
1718
1719 static void lcp2_init(struct ctdb_context *tmp_ctx,
1720                       struct ctdb_ipflags *ipflags,
1721                       struct ctdb_public_ip_list *all_ips,
1722                       uint32_t *force_rebalance_nodes,
1723                       uint32_t **lcp2_imbalances,
1724                       bool **rebalance_candidates)
1725 {
1726         int i, numnodes;
1727         struct ctdb_public_ip_list *tmp_ip;
1728
1729         numnodes = talloc_array_length(ipflags);
1730
1731         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1732         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1733         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1734         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1735
1736         for (i=0; i<numnodes; i++) {
1737                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1738                 /* First step: assume all nodes are candidates */
1739                 (*rebalance_candidates)[i] = true;
1740         }
1741
1742         /* 2nd step: if a node has IPs assigned then it must have been
1743          * healthy before, so we remove it from consideration.  This
1744          * is overkill but is all we have because we don't maintain
1745          * state between takeover runs.  An alternative would be to
1746          * keep state and invalidate it every time the recovery master
1747          * changes.
1748          */
1749         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1750                 if (tmp_ip->pnn != -1) {
1751                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1752                 }
1753         }
1754
1755         /* 3rd step: if a node is forced to re-balance then
1756            we allow failback onto the node */
1757         if (force_rebalance_nodes == NULL) {
1758                 return;
1759         }
1760         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1761                 uint32_t pnn = force_rebalance_nodes[i];
1762                 if (pnn >= numnodes) {
1763                         DEBUG(DEBUG_ERR,
1764                               (__location__ "unknown node %u\n", pnn));
1765                         continue;
1766                 }
1767
1768                 DEBUG(DEBUG_NOTICE,
1769                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1770                 (*rebalance_candidates)[pnn] = true;
1771         }
1772 }
1773
1774 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1775  * the IP/node combination that will cost the least.
1776  */
1777 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1778                                      struct ctdb_ipflags *ipflags,
1779                                      struct ctdb_public_ip_list *all_ips,
1780                                      uint32_t *lcp2_imbalances)
1781 {
1782         struct ctdb_public_ip_list *tmp_ip;
1783         int dstnode, numnodes;
1784
1785         int minnode;
1786         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1787         struct ctdb_public_ip_list *minip;
1788
1789         bool should_loop = true;
1790         bool have_unassigned = true;
1791
1792         numnodes = talloc_array_length(ipflags);
1793
1794         while (have_unassigned && should_loop) {
1795                 should_loop = false;
1796
1797                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1798                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1799
1800                 minnode = -1;
1801                 mindsum = 0;
1802                 minip = NULL;
1803
1804                 /* loop over each unassigned ip. */
1805                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1806                         if (tmp_ip->pnn != -1) {
1807                                 continue;
1808                         }
1809
1810                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1811                                 /* only check nodes that can actually takeover this ip */
1812                                 if (!can_node_takeover_ip(ctdb, dstnode,
1813                                                           ipflags[dstnode],
1814                                                           tmp_ip)) {
1815                                         /* no it couldnt   so skip to the next node */
1816                                         continue;
1817                                 }
1818
1819                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1820                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1821                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1822                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1823                                                    dstnode,
1824                                                    dstimbl - lcp2_imbalances[dstnode]));
1825
1826
1827                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1828                                         minnode = dstnode;
1829                                         minimbl = dstimbl;
1830                                         mindsum = dstdsum;
1831                                         minip = tmp_ip;
1832                                         should_loop = true;
1833                                 }
1834                         }
1835                 }
1836
1837                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1838
1839                 /* If we found one then assign it to the given node. */
1840                 if (minnode != -1) {
1841                         minip->pnn = minnode;
1842                         lcp2_imbalances[minnode] = minimbl;
1843                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1844                                           ctdb_addr_to_str(&(minip->addr)),
1845                                           minnode,
1846                                           mindsum));
1847                 }
1848
1849                 /* There might be a better way but at least this is clear. */
1850                 have_unassigned = false;
1851                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1852                         if (tmp_ip->pnn == -1) {
1853                                 have_unassigned = true;
1854                         }
1855                 }
1856         }
1857
1858         /* We know if we have an unassigned addresses so we might as
1859          * well optimise.
1860          */
1861         if (have_unassigned) {
1862                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1863                         if (tmp_ip->pnn == -1) {
1864                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1865                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1866                         }
1867                 }
1868         }
1869 }
1870
1871 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1872  * to move IPs from, determines the best IP/destination node
1873  * combination to move from the source node.
1874  */
1875 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1876                                     struct ctdb_ipflags *ipflags,
1877                                     struct ctdb_public_ip_list *all_ips,
1878                                     int srcnode,
1879                                     uint32_t *lcp2_imbalances,
1880                                     bool *rebalance_candidates)
1881 {
1882         int dstnode, mindstnode, numnodes;
1883         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1884         uint32_t minsrcimbl, mindstimbl;
1885         struct ctdb_public_ip_list *minip;
1886         struct ctdb_public_ip_list *tmp_ip;
1887
1888         /* Find an IP and destination node that best reduces imbalance. */
1889         srcimbl = 0;
1890         minip = NULL;
1891         minsrcimbl = 0;
1892         mindstnode = -1;
1893         mindstimbl = 0;
1894
1895         numnodes = talloc_array_length(ipflags);
1896
1897         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1898         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1899                            srcnode, lcp2_imbalances[srcnode]));
1900
1901         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1902                 /* Only consider addresses on srcnode. */
1903                 if (tmp_ip->pnn != srcnode) {
1904                         continue;
1905                 }
1906
1907                 /* What is this IP address costing the source node? */
1908                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1909                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1910
1911                 /* Consider this IP address would cost each potential
1912                  * destination node.  Destination nodes are limited to
1913                  * those that are newly healthy, since we don't want
1914                  * to do gratuitous failover of IPs just to make minor
1915                  * balance improvements.
1916                  */
1917                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1918                         if (!rebalance_candidates[dstnode]) {
1919                                 continue;
1920                         }
1921
1922                         /* only check nodes that can actually takeover this ip */
1923                         if (!can_node_takeover_ip(ctdb, dstnode,
1924                                                   ipflags[dstnode], tmp_ip)) {
1925                                 /* no it couldnt   so skip to the next node */
1926                                 continue;
1927                         }
1928
1929                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1930                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1931                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1932                                            srcnode, -srcdsum,
1933                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1934                                            dstnode, dstdsum));
1935
1936                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1937                             (dstdsum < srcdsum) &&                      \
1938                             ((mindstnode == -1) ||                              \
1939                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1940
1941                                 minip = tmp_ip;
1942                                 minsrcimbl = srcimbl;
1943                                 mindstnode = dstnode;
1944                                 mindstimbl = dstimbl;
1945                         }
1946                 }
1947         }
1948         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1949
1950         if (mindstnode != -1) {
1951                 /* We found a move that makes things better... */
1952                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1953                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1954                                   ctdb_addr_to_str(&(minip->addr)),
1955                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1956
1957
1958                 lcp2_imbalances[srcnode] = minsrcimbl;
1959                 lcp2_imbalances[mindstnode] = mindstimbl;
1960                 minip->pnn = mindstnode;
1961
1962                 return true;
1963         }
1964
1965         return false;
1966         
1967 }
1968
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison callback: orders entries by descending
 * imbalance.  Returns -1 if a has the larger imbalance, 1 if b has,
 * and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        if (lhs->imbalance == rhs->imbalance) {
                return 0;
        }

        return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1987
1988 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1989  * node with the highest LCP2 imbalance, and then determines the best
1990  * IP/destination node combination to move from the source node.
1991  */
1992 static void lcp2_failback(struct ctdb_context *ctdb,
1993                           struct ctdb_ipflags *ipflags,
1994                           struct ctdb_public_ip_list *all_ips,
1995                           uint32_t *lcp2_imbalances,
1996                           bool *rebalance_candidates)
1997 {
1998         int i, numnodes;
1999         struct lcp2_imbalance_pnn * lips;
2000         bool again;
2001
2002         numnodes = talloc_array_length(ipflags);
2003
2004 try_again:
2005         /* Put the imbalances and nodes into an array, sort them and
2006          * iterate through candidates.  Usually the 1st one will be
2007          * used, so this doesn't cost much...
2008          */
2009         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2010         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2011         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2012         for (i=0; i<numnodes; i++) {
2013                 lips[i].imbalance = lcp2_imbalances[i];
2014                 lips[i].pnn = i;
2015                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2016         }
2017         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2018               lcp2_cmp_imbalance_pnn);
2019
2020         again = false;
2021         for (i=0; i<numnodes; i++) {
2022                 /* This means that all nodes had 0 or 1 addresses, so
2023                  * can't be imbalanced.
2024                  */
2025                 if (lips[i].imbalance == 0) {
2026                         break;
2027                 }
2028
2029                 if (lcp2_failback_candidate(ctdb,
2030                                             ipflags,
2031                                             all_ips,
2032                                             lips[i].pnn,
2033                                             lcp2_imbalances,
2034                                             rebalance_candidates)) {
2035                         again = true;
2036                         break;
2037                 }
2038         }
2039
2040         talloc_free(lips);
2041         if (again) {
2042                 goto try_again;
2043         }
2044 }
2045
2046 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2047                                     struct ctdb_ipflags *ipflags,
2048                                     struct ctdb_public_ip_list *all_ips)
2049 {
2050         struct ctdb_public_ip_list *tmp_ip;
2051
2052         /* verify that the assigned nodes can serve that public ip
2053            and set it to -1 if not
2054         */
2055         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2056                 if (tmp_ip->pnn == -1) {
2057                         continue;
2058                 }
2059                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2060                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2061                         /* this node can not serve this ip. */
2062                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2063                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2064                                            tmp_ip->pnn));
2065                         tmp_ip->pnn = -1;
2066                 }
2067         }
2068 }
2069
2070 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2071                                        struct ctdb_ipflags *ipflags,
2072                                        struct ctdb_public_ip_list *all_ips)
2073 {
2074         struct ctdb_public_ip_list *tmp_ip;
2075         int i, numnodes;
2076
2077         numnodes = talloc_array_length(ipflags);
2078
2079         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2080        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2081         *  always be allocated the same way for a specific set of
2082         *  available/unavailable nodes.
2083         */
2084
2085         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2086                 tmp_ip->pnn = i % numnodes;
2087         }
2088
2089         /* IP failback doesn't make sense with deterministic
2090          * IPs, since the modulo step above implicitly fails
2091          * back IPs to their "home" node.
2092          */
2093         if (1 == ctdb->tunable.no_ip_failback) {
2094                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2095         }
2096
2097         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2098
2099         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2100
2101         /* No failback here! */
2102 }
2103
2104 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2105                                           struct ctdb_ipflags *ipflags,
2106                                           struct ctdb_public_ip_list *all_ips)
2107 {
2108         /* This should be pushed down into basic_failback. */
2109         struct ctdb_public_ip_list *tmp_ip;
2110         int num_ips = 0;
2111         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2112                 num_ips++;
2113         }
2114
2115         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2116
2117         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2118
2119         /* If we don't want IPs to fail back then don't rebalance IPs. */
2120         if (1 == ctdb->tunable.no_ip_failback) {
2121                 return;
2122         }
2123
2124         /* Now, try to make sure the ip adresses are evenly distributed
2125            across the nodes.
2126         */
2127         basic_failback(ctdb, ipflags, all_ips, num_ips);
2128 }
2129
2130 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2131                           struct ctdb_ipflags *ipflags,
2132                           struct ctdb_public_ip_list *all_ips,
2133                           uint32_t *force_rebalance_nodes)
2134 {
2135         uint32_t *lcp2_imbalances;
2136         bool *rebalance_candidates;
2137         int numnodes, num_rebalance_candidates, i;
2138
2139         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2140
2141         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2142
2143         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2144                   &lcp2_imbalances, &rebalance_candidates);
2145
2146         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2147
2148         /* If we don't want IPs to fail back then don't rebalance IPs. */
2149         if (1 == ctdb->tunable.no_ip_failback) {
2150                 goto finished;
2151         }
2152
2153         /* It is only worth continuing if we have suitable target
2154          * nodes to transfer IPs to.  This check is much cheaper than
2155          * continuing on...
2156          */
2157         numnodes = talloc_array_length(ipflags);
2158         num_rebalance_candidates = 0;
2159         for (i=0; i<numnodes; i++) {
2160                 if (rebalance_candidates[i]) {
2161                         num_rebalance_candidates++;
2162                 }
2163         }
2164         if (num_rebalance_candidates == 0) {
2165                 goto finished;
2166         }
2167
2168         /* Now, try to make sure the ip adresses are evenly distributed
2169            across the nodes.
2170         */
2171         lcp2_failback(ctdb, ipflags, all_ips,
2172                       lcp2_imbalances, rebalance_candidates);
2173
2174 finished:
2175         talloc_free(tmp_ctx);
2176 }
2177
2178 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2179 {
2180         int i;
2181
2182         for (i=0;i<nodemap->num;i++) {
2183                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2184                         /* Found one completely healthy node */
2185                         return false;
2186                 }
2187         }
2188
2189         return true;
2190 }
2191
2192 /* The calculation part of the IP allocation algorithm. */
2193 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2194                                    struct ctdb_ipflags *ipflags,
2195                                    struct ctdb_public_ip_list **all_ips_p,
2196                                    uint32_t *force_rebalance_nodes)
2197 {
2198         /* since nodes only know about those public addresses that
2199            can be served by that particular node, no single node has
2200            a full list of all public addresses that exist in the cluster.
2201            Walk over all node structures and create a merged list of
2202            all public addresses that exist in the cluster.
2203
2204            keep the tree of ips around as ctdb->ip_tree
2205         */
2206         *all_ips_p = create_merged_ip_list(ctdb);
2207
2208         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2209                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2210         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2211                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2212         } else {
2213                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2214         }
2215
2216         /* at this point ->pnn is the node which will own each IP
2217            or -1 if there is no node that can cover this ip
2218         */
2219
2220         return;
2221 }
2222
/* State shared between get_tunable_from_nodes() and its reply
 * callbacks.
 */
struct get_tunable_callback_data {
        /* name of the tunable being queried; used in log messages */
        const char *tunable;
        /* talloc array, indexed by node number, receiving the values */
        uint32_t *out;
        /* set by the callbacks when the whole query must be abandoned */
        bool fatal;
};
2228
2229 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2230                                  int32_t res, TDB_DATA outdata,
2231                                  void *callback)
2232 {
2233         struct get_tunable_callback_data *cd =
2234                 (struct get_tunable_callback_data *)callback;
2235         int size;
2236
2237         if (res != 0) {
2238                 /* Already handled in fail callback */
2239                 return;
2240         }
2241
2242         if (outdata.dsize != sizeof(uint32_t)) {
2243                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2244                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2245                                  (int)outdata.dsize));
2246                 cd->fatal = true;
2247                 return;
2248         }
2249
2250         size = talloc_array_length(cd->out);
2251         if (pnn >= size) {
2252                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2253                                  cd->tunable, pnn, size));
2254                 return;
2255         }
2256
2257                 
2258         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2259 }
2260
2261 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2262                                        int32_t res, TDB_DATA outdata,
2263                                        void *callback)
2264 {
2265         struct get_tunable_callback_data *cd =
2266                 (struct get_tunable_callback_data *)callback;
2267
2268         switch (res) {
2269         case -ETIME:
2270                 DEBUG(DEBUG_ERR,
2271                       ("Timed out getting tunable \"%s\" from node %d\n",
2272                        cd->tunable, pnn));
2273                 cd->fatal = true;
2274                 break;
2275         case -EINVAL:
2276         case -1:
2277                 DEBUG(DEBUG_WARNING,
2278                       ("Tunable \"%s\" not implemented on node %d\n",
2279                        cd->tunable, pnn));
2280                 break;
2281         default:
2282                 DEBUG(DEBUG_ERR,
2283                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2284                        cd->tunable, pnn));
2285                 cd->fatal = true;
2286         }
2287 }
2288
2289 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2290                                         TALLOC_CTX *tmp_ctx,
2291                                         struct ctdb_node_map *nodemap,
2292                                         const char *tunable,
2293                                         uint32_t default_value)
2294 {
2295         TDB_DATA data;
2296         struct ctdb_control_get_tunable *t;
2297         uint32_t *nodes;
2298         uint32_t *tvals;
2299         struct get_tunable_callback_data callback_data;
2300         int i;
2301
2302         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2303         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2304         for (i=0; i<nodemap->num; i++) {
2305                 tvals[i] = default_value;
2306         }
2307                 
2308         callback_data.out = tvals;
2309         callback_data.tunable = tunable;
2310         callback_data.fatal = false;
2311
2312         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2313         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2314         t = (struct ctdb_control_get_tunable *)data.dptr;
2315         t->length = strlen(tunable)+1;
2316         memcpy(t->name, tunable, t->length);
2317         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2318         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2319                                       nodes, 0, TAKEOVER_TIMEOUT(),
2320                                       false, data,
2321                                       get_tunable_callback,
2322                                       get_tunable_fail_callback,
2323                                       &callback_data) != 0) {
2324                 if (callback_data.fatal) {
2325                         talloc_free(tvals);
2326                         tvals = NULL;
2327                 }
2328         }
2329         talloc_free(nodes);
2330         talloc_free(data.dptr);
2331
2332         return tvals;
2333 }
2334
/* State shared between get_runstate_from_nodes() and its reply
 * callbacks.
 */
struct get_runstate_callback_data {
        /* talloc array, indexed by node number, receiving the runstates */
        enum ctdb_runstate *out;
        /* set by the callbacks when the whole query must be abandoned */
        bool fatal;
};
2339
2340 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2341                                   int32_t res, TDB_DATA outdata,
2342                                   void *callback_data)
2343 {
2344         struct get_runstate_callback_data *cd =
2345                 (struct get_runstate_callback_data *)callback_data;
2346         int size;
2347
2348         if (res != 0) {
2349                 /* Already handled in fail callback */
2350                 return;
2351         }
2352
2353         if (outdata.dsize != sizeof(uint32_t)) {
2354                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2355                                  pnn, (int)sizeof(uint32_t),
2356                                  (int)outdata.dsize));
2357                 cd->fatal = true;
2358                 return;
2359         }
2360
2361         size = talloc_array_length(cd->out);
2362         if (pnn >= size) {
2363                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2364                                  pnn, size));
2365                 return;
2366         }
2367
2368         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2369 }
2370
2371 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2372                                        int32_t res, TDB_DATA outdata,
2373                                        void *callback)
2374 {
2375         struct get_runstate_callback_data *cd =
2376                 (struct get_runstate_callback_data *)callback;
2377
2378         switch (res) {
2379         case -ETIME:
2380                 DEBUG(DEBUG_ERR,
2381                       ("Timed out getting runstate from node %d\n", pnn));
2382                 cd->fatal = true;
2383                 break;
2384         default:
2385                 DEBUG(DEBUG_WARNING,
2386                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2387                        pnn));
2388         }
2389 }
2390
2391 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2392                                                     TALLOC_CTX *tmp_ctx,
2393                                                     struct ctdb_node_map *nodemap,
2394                                                     enum ctdb_runstate default_value)
2395 {
2396         uint32_t *nodes;
2397         enum ctdb_runstate *rs;
2398         struct get_runstate_callback_data callback_data;
2399         int i;
2400
2401         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2402         CTDB_NO_MEMORY_NULL(ctdb, rs);
2403         for (i=0; i<nodemap->num; i++) {
2404                 rs[i] = default_value;
2405         }
2406
2407         callback_data.out = rs;
2408         callback_data.fatal = false;
2409
2410         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2411         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2412                                       nodes, 0, TAKEOVER_TIMEOUT(),
2413                                       true, tdb_null,
2414                                       get_runstate_callback,
2415                                       get_runstate_fail_callback,
2416                                       &callback_data) != 0) {
2417                 if (callback_data.fatal) {
2418                         free(rs);
2419                         rs = NULL;
2420                 }
2421         }
2422         talloc_free(nodes);
2423
2424         return rs;
2425 }
2426
2427 /* Set internal flags for IP allocation:
2428  *   Clear ip flags
2429  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2430  *   Set NOIPHOST ip flag for each INACTIVE node
2431  *   if all nodes are disabled:
2432  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2433  *   else
2434  *     Set NOIPHOST ip flags for disabled nodes
2435  */
2436 static struct ctdb_ipflags *
2437 set_ipflags_internal(struct ctdb_context *ctdb,
2438                      TALLOC_CTX *tmp_ctx,
2439                      struct ctdb_node_map *nodemap,
2440                      uint32_t *tval_noiptakeover,
2441                      uint32_t *tval_noiphostonalldisabled,
2442                      enum ctdb_runstate *runstate)
2443 {
2444         int i;
2445         struct ctdb_ipflags *ipflags;
2446
2447         /* Clear IP flags - implicit due to talloc_zero */
2448         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2449         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2450
2451         for (i=0;i<nodemap->num;i++) {
2452                 /* Can not take IPs on node with NoIPTakeover set */
2453                 if (tval_noiptakeover[i] != 0) {
2454                         ipflags[i].noiptakeover = true;
2455                 }
2456
2457                 /* Can not host IPs on node not in RUNNING state */
2458                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2459                         ipflags[i].noiphost = true;
2460                         continue;
2461                 }
2462                 /* Can not host IPs on INACTIVE node */
2463                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2464                         ipflags[i].noiphost = true;
2465                 }
2466         }
2467
2468         if (all_nodes_are_disabled(nodemap)) {
2469                 /* If all nodes are disabled, can not host IPs on node
2470                  * with NoIPHostOnAllDisabled set
2471                  */
2472                 for (i=0;i<nodemap->num;i++) {
2473                         if (tval_noiphostonalldisabled[i] != 0) {
2474                                 ipflags[i].noiphost = true;
2475                         }
2476                 }
2477         } else {
2478                 /* If some nodes are not disabled, then can not host
2479                  * IPs on DISABLED node
2480                  */
2481                 for (i=0;i<nodemap->num;i++) {
2482                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2483                                 ipflags[i].noiphost = true;
2484                         }
2485                 }
2486         }
2487
2488         return ipflags;
2489 }
2490
2491 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2492                                         TALLOC_CTX *tmp_ctx,
2493                                         struct ctdb_node_map *nodemap)
2494 {
2495         uint32_t *tval_noiptakeover;
2496         uint32_t *tval_noiphostonalldisabled;
2497         struct ctdb_ipflags *ipflags;
2498         enum ctdb_runstate *runstate;
2499
2500
2501         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2502                                                    "NoIPTakeover", 0);
2503         if (tval_noiptakeover == NULL) {
2504                 return NULL;
2505         }
2506
2507         tval_noiphostonalldisabled =
2508                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2509                                        "NoIPHostOnAllDisabled", 0);
2510         if (tval_noiphostonalldisabled == NULL) {
2511                 /* Caller frees tmp_ctx */
2512                 return NULL;
2513         }
2514
2515         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2516          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2517          * reasonable behaviour on a mixed cluster during upgrade.
2518          */
2519         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2520                                            CTDB_RUNSTATE_RUNNING);
2521         if (runstate == NULL) {
2522                 /* Caller frees tmp_ctx */
2523                 return NULL;
2524         }
2525
2526         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2527                                        tval_noiptakeover,
2528                                        tval_noiphostonalldisabled,
2529                                        runstate);
2530
2531         talloc_free(tval_noiptakeover);
2532         talloc_free(tval_noiphostonalldisabled);
2533         talloc_free(runstate);
2534
2535         return ipflags;
2536 }
2537
2538 struct iprealloc_callback_data {
2539         bool *retry_nodes;
2540         int retry_count;
2541         client_async_callback fail_callback;
2542         void *fail_callback_data;
2543         struct ctdb_node_map *nodemap;
2544 };
2545
2546 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2547                                         int32_t res, TDB_DATA outdata,
2548                                         void *callback)
2549 {
2550         int numnodes;
2551         struct iprealloc_callback_data *cd =
2552                 (struct iprealloc_callback_data *)callback;
2553
2554         numnodes = talloc_array_length(cd->retry_nodes);
2555         if (pnn > numnodes) {
2556                 DEBUG(DEBUG_ERR,
2557                       ("ipreallocated failure from node %d, "
2558                        "but only %d nodes in nodemap\n",
2559                        pnn, numnodes));
2560                 return;
2561         }
2562
2563         /* Can't run the "ipreallocated" event on a INACTIVE node */
2564         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2565                 DEBUG(DEBUG_WARNING,
2566                       ("ipreallocated failed on inactive node %d, ignoring\n",
2567                        pnn));
2568                 return;
2569         }
2570
2571         switch (res) {
2572         case -ETIME:
2573                 /* If the control timed out then that's a real error,
2574                  * so call the real fail callback
2575                  */
2576                 if (cd->fail_callback) {
2577                         cd->fail_callback(ctdb, pnn, res, outdata,
2578                                           cd->fail_callback_data);
2579                 } else {
2580                         DEBUG(DEBUG_WARNING,
2581                               ("iprealloc timed out but no callback registered\n"));
2582                 }
2583                 break;
2584         default:
2585                 /* If not a timeout then either the ipreallocated
2586                  * eventscript (or some setup) failed.  This might
2587                  * have failed because the IPREALLOCATED control isn't
2588                  * implemented - right now there is no way of knowing
2589                  * because the error codes are all folded down to -1.
2590                  * Consider retrying using EVENTSCRIPT control...
2591                  */
2592                 DEBUG(DEBUG_WARNING,
2593                       ("ipreallocated failure from node %d, flagging retry\n",
2594                        pnn));
2595                 cd->retry_nodes[pnn] = true;
2596                 cd->retry_count++;
2597         }
2598 }
2599
2600 struct takeover_callback_data {
2601         bool *node_failed;
2602         client_async_callback fail_callback;
2603         void *fail_callback_data;
2604         struct ctdb_node_map *nodemap;
2605 };
2606
2607 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2608                                        uint32_t node_pnn, int32_t res,
2609                                        TDB_DATA outdata, void *callback_data)
2610 {
2611         struct takeover_callback_data *cd =
2612                 talloc_get_type_abort(callback_data,
2613                                       struct takeover_callback_data);
2614         int i;
2615
2616         for (i = 0; i < cd->nodemap->num; i++) {
2617                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2618                         break;
2619                 }
2620         }
2621
2622         if (i == cd->nodemap->num) {
2623                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2624                 return;
2625         }
2626
2627         if (!cd->node_failed[i]) {
2628                 cd->node_failed[i] = true;
2629                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2630                                   cd->fail_callback_data);
2631         }
2632 }
2633
2634 /*
2635   make any IP alias changes for public addresses that are necessary 
2636  */
2637 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2638                       uint32_t *force_rebalance_nodes,
2639                       client_async_callback fail_callback, void *callback_data)
2640 {
2641         int i, j, ret;
2642         struct ctdb_public_ip ip;
2643         struct ctdb_public_ipv4 ipv4;
2644         uint32_t *nodes;
2645         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2646         TDB_DATA data;
2647         struct timeval timeout;
2648         struct client_async_data *async_data;
2649         struct ctdb_client_control_state *state;
2650         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2651         struct ctdb_ipflags *ipflags;
2652         struct takeover_callback_data *takeover_data;
2653         struct iprealloc_callback_data iprealloc_data;
2654         bool *retry_data;
2655
2656         /*
2657          * ip failover is completely disabled, just send out the 
2658          * ipreallocated event.
2659          */
2660         if (ctdb->tunable.disable_ip_failover != 0) {
2661                 goto ipreallocated;
2662         }
2663
2664         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2665         if (ipflags == NULL) {
2666                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2667                 talloc_free(tmp_ctx);
2668                 return -1;
2669         }
2670
2671         /* Do the IP reassignment calculations */
2672         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2673
2674         /* Now tell all nodes to release any public IPs should not
2675          * host.  This will be a NOOP on nodes that don't currently
2676          * hold the given IP.
2677          */
2678         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2679         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2680
2681         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2682                                                        bool, nodemap->num);
2683         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2684         takeover_data->fail_callback = fail_callback;
2685         takeover_data->fail_callback_data = callback_data;
2686         takeover_data->nodemap = nodemap;
2687
2688         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2689         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2690
2691         async_data->fail_callback = takeover_run_fail_callback;
2692         async_data->callback_data = takeover_data;
2693
2694         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2695
2696         /* Send a RELEASE_IP to all nodes that should not be hosting
2697          * each IP.  For each IP, all but one of these will be
2698          * redundant.  However, the redundant ones are used to tell
2699          * nodes which node should be hosting the IP so that commands
2700          * like "ctdb ip" can display a particular nodes idea of who
2701          * is hosting what. */
2702         for (i=0;i<nodemap->num;i++) {
2703                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2704                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2705                         continue;
2706                 }
2707
2708                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2709                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2710                                 /* This node should be serving this
2711                                    vnn so dont tell it to release the ip
2712                                 */
2713                                 continue;
2714                         }
2715                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2716                                 ipv4.pnn = tmp_ip->pnn;
2717                                 ipv4.sin = tmp_ip->addr.ip;
2718
2719                                 timeout = TAKEOVER_TIMEOUT();
2720                                 data.dsize = sizeof(ipv4);
2721                                 data.dptr  = (uint8_t *)&ipv4;
2722                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2723                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2724                                                 data, async_data,
2725                                                 &timeout, NULL);
2726                         } else {
2727                                 ip.pnn  = tmp_ip->pnn;
2728                                 ip.addr = tmp_ip->addr;
2729
2730                                 timeout = TAKEOVER_TIMEOUT();
2731                                 data.dsize = sizeof(ip);
2732                                 data.dptr  = (uint8_t *)&ip;
2733                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2734                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2735                                                 data, async_data,
2736                                                 &timeout, NULL);
2737                         }
2738
2739                         if (state == NULL) {
2740                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2741                                 talloc_free(tmp_ctx);
2742                                 return -1;
2743                         }
2744                 
2745                         ctdb_client_async_add(async_data, state);
2746                 }
2747         }
2748         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2749                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2750                 talloc_free(tmp_ctx);
2751                 return -1;
2752         }
2753         talloc_free(async_data);
2754
2755
2756         /* For each IP, send a TAKOVER_IP to the node that should be
2757          * hosting it.  Many of these will often be redundant (since
2758          * the allocation won't have changed) but they can be useful
2759          * to recover from inconsistencies. */
2760         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2761         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2762
2763         async_data->fail_callback = fail_callback;
2764         async_data->callback_data = callback_data;
2765
2766         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2767                 if (tmp_ip->pnn == -1) {
2768                         /* this IP won't be taken over */
2769                         continue;
2770                 }
2771
2772                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2773                         ipv4.pnn = tmp_ip->pnn;
2774                         ipv4.sin = tmp_ip->addr.ip;
2775
2776                         timeout = TAKEOVER_TIMEOUT();
2777                         data.dsize = sizeof(ipv4);
2778                         data.dptr  = (uint8_t *)&ipv4;
2779                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2780                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2781                                         data, async_data,
2782                                         &timeout, NULL);
2783                 } else {
2784                         ip.pnn  = tmp_ip->pnn;
2785                         ip.addr = tmp_ip->addr;
2786
2787                         timeout = TAKEOVER_TIMEOUT();
2788                         data.dsize = sizeof(ip);
2789                         data.dptr  = (uint8_t *)&ip;
2790                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2791                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2792                                         data, async_data,
2793                                         &timeout, NULL);
2794                 }
2795                 if (state == NULL) {
2796                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2797                         talloc_free(tmp_ctx);
2798                         return -1;
2799                 }
2800                 
2801                 ctdb_client_async_add(async_data, state);
2802         }
2803         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2804                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2805                 talloc_free(tmp_ctx);
2806                 return -1;
2807         }
2808
2809 ipreallocated:
2810         /*
2811          * Tell all nodes to run eventscripts to process the
2812          * "ipreallocated" event.  This can do a lot of things,
2813          * including restarting services to reconfigure them if public
2814          * IPs have moved.  Once upon a time this event only used to
2815          * update natgw.
2816          */
2817         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2818         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2819         iprealloc_data.retry_nodes = retry_data;
2820         iprealloc_data.retry_count = 0;
2821         iprealloc_data.fail_callback = fail_callback;
2822         iprealloc_data.fail_callback_data = callback_data;
2823         iprealloc_data.nodemap = nodemap;
2824
2825         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2826         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2827                                         nodes, 0, TAKEOVER_TIMEOUT(),
2828                                         false, tdb_null,
2829                                         NULL, iprealloc_fail_callback,
2830                                         &iprealloc_data);
2831         if (ret != 0) {
2832                 /* If the control failed then we should retry to any
2833                  * nodes flagged by iprealloc_fail_callback using the
2834                  * EVENTSCRIPT control.  This is a best-effort at
2835                  * backward compatiblity when running a mixed cluster
2836                  * where some nodes have not yet been upgraded to
2837                  * support the IPREALLOCATED control.
2838                  */
2839                 DEBUG(DEBUG_WARNING,
2840                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2841
2842                 nodes = talloc_array(tmp_ctx, uint32_t,
2843                                      iprealloc_data.retry_count);
2844                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2845
2846                 j = 0;
2847                 for (i=0; i<nodemap->num; i++) {
2848                         if (iprealloc_data.retry_nodes[i]) {
2849                                 nodes[j] = i;
2850                                 j++;
2851                         }
2852                 }
2853
2854                 data.dptr  = discard_const("ipreallocated");
2855                 data.dsize = strlen((char *)data.dptr) + 1; 
2856                 ret = ctdb_client_async_control(ctdb,
2857                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2858                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2859                                                 false, data,
2860                                                 NULL, fail_callback,
2861                                                 callback_data);
2862                 if (ret != 0) {
2863                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2864                 }
2865         }
2866
2867         talloc_free(tmp_ctx);
2868         return ret;
2869 }
2870
2871
2872 /*
2873   destroy a ctdb_client_ip structure
2874  */
2875 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2876 {
2877         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2878                 ctdb_addr_to_str(&ip->addr),
2879                 ntohs(ip->addr.ip.sin_port),
2880                 ip->client_id));
2881
2882         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2883         return 0;
2884 }
2885
2886 /*
2887   called by a client to inform us of a TCP connection that it is managing
2888   that should tickled with an ACK when IP takeover is done
2889   we handle both the old ipv4 style of packets as well as the new ipv4/6
2890   pdus.
2891  */
2892 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2893                                 TDB_DATA indata)
2894 {
2895         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2896         struct ctdb_control_tcp *old_addr = NULL;
2897         struct ctdb_control_tcp_addr new_addr;
2898         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2899         struct ctdb_tcp_list *tcp;
2900         struct ctdb_tcp_connection t;
2901         int ret;
2902         TDB_DATA data;
2903         struct ctdb_client_ip *ip;
2904         struct ctdb_vnn *vnn;
2905         ctdb_sock_addr addr;
2906
2907         /* If we don't have public IPs, tickles are useless */
2908         if (ctdb->vnn == NULL) {
2909                 return 0;
2910         }
2911
2912         switch (indata.dsize) {
2913         case sizeof(struct ctdb_control_tcp):
2914                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2915                 ZERO_STRUCT(new_addr);
2916                 tcp_sock = &new_addr;
2917                 tcp_sock->src.ip  = old_addr->src;
2918                 tcp_sock->dest.ip = old_addr->dest;
2919                 break;
2920         case sizeof(struct ctdb_control_tcp_addr):
2921                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2922                 break;
2923         default:
2924                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2925                                  "to ctdb_control_tcp_client. size was %d but "
2926                                  "only allowed sizes are %lu and %lu\n",
2927                                  (int)indata.dsize,
2928                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2929                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2930                 return -1;
2931         }
2932
2933         addr = tcp_sock->src;
2934         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2935         addr = tcp_sock->dest;
2936         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2937
2938         ZERO_STRUCT(addr);
2939         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2940         vnn = find_public_ip_vnn(ctdb, &addr);
2941         if (vnn == NULL) {
2942                 switch (addr.sa.sa_family) {
2943                 case AF_INET:
2944                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2945                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2946                                         ctdb_addr_to_str(&addr)));
2947                         }
2948                         break;
2949                 case AF_INET6:
2950                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2951                                 ctdb_addr_to_str(&addr)));
2952                         break;
2953                 default:
2954                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2955                 }
2956
2957                 return 0;
2958         }
2959
2960         if (vnn->pnn != ctdb->pnn) {
2961                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2962                         ctdb_addr_to_str(&addr),
2963                         client_id, client->pid));
2964                 /* failing this call will tell smbd to die */
2965                 return -1;
2966         }
2967
2968         ip = talloc(client, struct ctdb_client_ip);
2969         CTDB_NO_MEMORY(ctdb, ip);
2970
2971         ip->ctdb      = ctdb;
2972         ip->addr      = addr;
2973         ip->client_id = client_id;
2974         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2975         DLIST_ADD(ctdb->client_ip_list, ip);
2976
2977         tcp = talloc(client, struct ctdb_tcp_list);
2978         CTDB_NO_MEMORY(ctdb, tcp);
2979
2980         tcp->connection.src_addr = tcp_sock->src;
2981         tcp->connection.dst_addr = tcp_sock->dest;
2982
2983         DLIST_ADD(client->tcp_list, tcp);
2984
2985         t.src_addr = tcp_sock->src;
2986         t.dst_addr = tcp_sock->dest;
2987
2988         data.dptr = (uint8_t *)&t;
2989         data.dsize = sizeof(t);
2990
2991         switch (addr.sa.sa_family) {
2992         case AF_INET:
2993                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2994                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2995                         ctdb_addr_to_str(&tcp_sock->src),
2996                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2997                 break;
2998         case AF_INET6:
2999                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3000                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
3001                         ctdb_addr_to_str(&tcp_sock->src),
3002                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
3003                 break;
3004         default:
3005                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3006         }
3007
3008
3009         /* tell all nodes about this tcp connection */
3010         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3011                                        CTDB_CONTROL_TCP_ADD,
3012                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3013         if (ret != 0) {
3014                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3015                 return -1;
3016         }
3017
3018         return 0;
3019 }
3020
3021 /*
3022   find a tcp address on a list
3023  */
3024 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3025                                            struct ctdb_tcp_connection *tcp)
3026 {
3027         int i;
3028
3029         if (array == NULL) {
3030                 return NULL;
3031         }
3032
3033         for (i=0;i<array->num;i++) {
3034                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3035                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3036                         return &array->connections[i];
3037                 }
3038         }
3039         return NULL;
3040 }
3041
3042
3043
3044 /*
3045   called by a daemon to inform us of a TCP connection that one of its
3046   clients managing that should tickled with an ACK when IP takeover is
3047   done
3048  */
3049 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3050 {
3051         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3052         struct ctdb_tcp_array *tcparray;
3053         struct ctdb_tcp_connection tcp;
3054         struct ctdb_vnn *vnn;
3055
3056         /* If we don't have public IPs, tickles are useless */
3057         if (ctdb->vnn == NULL) {
3058                 return 0;
3059         }
3060
3061         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3062         if (vnn == NULL) {
3063                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3064                         ctdb_addr_to_str(&p->dst_addr)));
3065
3066                 return -1;
3067         }
3068
3069
3070         tcparray = vnn->tcp_array;
3071
3072         /* If this is the first tickle */
3073         if (tcparray == NULL) {
3074                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3075                 CTDB_NO_MEMORY(ctdb, tcparray);
3076                 vnn->tcp_array = tcparray;
3077
3078                 tcparray->num = 0;
3079                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3080                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3081
3082                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3083                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3084                 tcparray->num++;
3085
3086                 if (tcp_update_needed) {
3087                         vnn->tcp_update_needed = true;
3088                 }
3089                 return 0;
3090         }
3091
3092
3093         /* Do we already have this tickle ?*/
3094         tcp.src_addr = p->src_addr;
3095         tcp.dst_addr = p->dst_addr;
3096         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3097                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3098                         ctdb_addr_to_str(&tcp.dst_addr),
3099                         ntohs(tcp.dst_addr.ip.sin_port),
3100                         vnn->pnn));
3101                 return 0;
3102         }
3103
3104         /* A new tickle, we must add it to the array */
3105         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3106                                         struct ctdb_tcp_connection,
3107                                         tcparray->num+1);
3108         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3109
3110         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3111         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3112         tcparray->num++;
3113
3114         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3115                 ctdb_addr_to_str(&tcp.dst_addr),
3116                 ntohs(tcp.dst_addr.ip.sin_port),
3117                 vnn->pnn));
3118
3119         if (tcp_update_needed) {
3120                 vnn->tcp_update_needed = true;
3121         }
3122
3123         return 0;
3124 }
3125
3126
3127 /*
3128   called by a daemon to inform us of a TCP connection that one of its
3129   clients managing that should tickled with an ACK when IP takeover is
3130   done
3131  */
3132 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3133 {
3134         struct ctdb_tcp_connection *tcpp;
3135         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3136
3137         if (vnn == NULL) {
3138                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3139                         ctdb_addr_to_str(&conn->dst_addr)));
3140                 return;
3141         }
3142
3143         /* if the array is empty we cant remove it
3144            and we dont need to do anything
3145          */
3146         if (vnn->tcp_array == NULL) {
3147                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3148                         ctdb_addr_to_str(&conn->dst_addr),
3149                         ntohs(conn->dst_addr.ip.sin_port)));
3150                 return;
3151         }
3152
3153
3154         /* See if we know this connection
3155            if we dont know this connection  then we dont need to do anything
3156          */
3157         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3158         if (tcpp == NULL) {
3159                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3160                         ctdb_addr_to_str(&conn->dst_addr),
3161                         ntohs(conn->dst_addr.ip.sin_port)));
3162                 return;
3163         }
3164
3165
3166         /* We need to remove this entry from the array.
3167            Instead of allocating a new array and copying data to it
3168            we cheat and just copy the last entry in the existing array
3169            to the entry that is to be removed and just shring the 
3170            ->num field
3171          */
3172         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3173         vnn->tcp_array->num--;
3174
3175         /* If we deleted the last entry we also need to remove the entire array
3176          */
3177         if (vnn->tcp_array->num == 0) {
3178                 talloc_free(vnn->tcp_array);
3179                 vnn->tcp_array = NULL;
3180         }               
3181
3182         vnn->tcp_update_needed = true;
3183
3184         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3185                 ctdb_addr_to_str(&conn->src_addr),
3186                 ntohs(conn->src_addr.ip.sin_port)));
3187 }
3188
3189
3190 /*
3191   called by a daemon to inform us of a TCP connection that one of its
3192   clients used are no longer needed in the tickle database
3193  */
3194 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3195 {
3196         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3197
3198         /* If we don't have public IPs, tickles are useless */
3199         if (ctdb->vnn == NULL) {
3200                 return 0;
3201         }
3202
3203         ctdb_remove_tcp_connection(ctdb, conn);
3204
3205         return 0;
3206 }
3207
3208
3209 /*
3210   Called when another daemon starts - causes all tickles for all
3211   public addresses we are serving to be sent to the new node on the
3212   next check.  This actually causes the next scheduled call to
3213   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3214   doesn't require careful error handling.
3215  */
3216 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3217 {
3218         struct ctdb_vnn *vnn;
3219
3220         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3221                            (unsigned long) pnn));
3222
3223         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3224                 vnn->tcp_update_needed = true;
3225         }
3226
3227         return 0;
3228 }
3229
3230
3231 /*
3232   called when a client structure goes away - hook to remove
3233   elements from the tcp_list in all daemons
3234  */
3235 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3236 {
3237         while (client->tcp_list) {
3238                 struct ctdb_tcp_list *tcp = client->tcp_list;
3239                 DLIST_REMOVE(client->tcp_list, tcp);
3240                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3241         }
3242 }
3243
3244
3245 /*
3246   release all IPs on shutdown
3247  */
3248 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3249 {
3250         struct ctdb_vnn *vnn;
3251         int count = 0;
3252
3253         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3254                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3255                         ctdb_vnn_unassign_iface(ctdb, vnn);
3256                         continue;
3257                 }
3258                 if (!vnn->iface) {
3259                         continue;
3260                 }
3261
3262                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3263                                     ctdb_addr_to_str(&vnn->public_address),
3264                                     vnn->public_netmask_bits,
3265                                     ctdb_vnn_iface_string(vnn)));
3266
3267                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3268                                   ctdb_vnn_iface_string(vnn),
3269                                   ctdb_addr_to_str(&vnn->public_address),
3270                                   vnn->public_netmask_bits);
3271                 release_kill_clients(ctdb, &vnn->public_address);
3272                 ctdb_vnn_unassign_iface(ctdb, vnn);
3273                 count++;
3274         }
3275
3276         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3277 }
3278
3279
3280 /*
3281   get list of public IPs
3282  */
3283 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3284                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3285 {
3286         int i, num, len;
3287         struct ctdb_all_public_ips *ips;
3288         struct ctdb_vnn *vnn;
3289         bool only_available = false;
3290
3291         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3292                 only_available = true;
3293         }
3294
3295         /* count how many public ip structures we have */
3296         num = 0;
3297         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3298                 num++;
3299         }
3300
3301         len = offsetof(struct ctdb_all_public_ips, ips) + 
3302                 num*sizeof(struct ctdb_public_ip);
3303         ips = talloc_zero_size(outdata, len);
3304         CTDB_NO_MEMORY(ctdb, ips);
3305
3306         i = 0;
3307         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3308                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3309                         continue;
3310                 }
3311                 ips->ips[i].pnn  = vnn->pnn;
3312                 ips->ips[i].addr = vnn->public_address;
3313                 i++;
3314         }
3315         ips->num = i;
3316         len = offsetof(struct ctdb_all_public_ips, ips) +
3317                 i*sizeof(struct ctdb_public_ip);
3318
3319         outdata->dsize = len;
3320         outdata->dptr  = (uint8_t *)ips;
3321
3322         return 0;
3323 }
3324
3325
3326 /*
3327   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3328  */
3329 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3330                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3331 {
3332         int i, num, len;
3333         struct ctdb_all_public_ipsv4 *ips;
3334         struct ctdb_vnn *vnn;
3335
3336         /* count how many public ip structures we have */
3337         num = 0;
3338         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3339                 if (vnn->public_address.sa.sa_family != AF_INET) {
3340                         continue;
3341                 }
3342                 num++;
3343         }
3344
3345         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3346                 num*sizeof(struct ctdb_public_ipv4);
3347         ips = talloc_zero_size(outdata, len);
3348         CTDB_NO_MEMORY(ctdb, ips);
3349
3350         outdata->dsize = len;
3351         outdata->dptr  = (uint8_t *)ips;
3352
3353         ips->num = num;
3354         i = 0;
3355         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3356                 if (vnn->public_address.sa.sa_family != AF_INET) {
3357                         continue;
3358                 }
3359                 ips->ips[i].pnn = vnn->pnn;
3360                 ips->ips[i].sin = vnn->public_address.ip;
3361                 i++;
3362         }
3363
3364         return 0;
3365 }
3366
3367 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3368                                         struct ctdb_req_control *c,
3369                                         TDB_DATA indata,
3370                                         TDB_DATA *outdata)
3371 {
3372         int i, num, len;
3373         ctdb_sock_addr *addr;
3374         struct ctdb_control_public_ip_info *info;
3375         struct ctdb_vnn *vnn;
3376
3377         addr = (ctdb_sock_addr *)indata.dptr;
3378
3379         vnn = find_public_ip_vnn(ctdb, addr);
3380         if (vnn == NULL) {
3381                 /* if it is not a public ip   it could be our 'single ip' */
3382                 if (ctdb->single_ip_vnn) {
3383                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3384                                 vnn = ctdb->single_ip_vnn;
3385                         }
3386                 }
3387         }
3388         if (vnn == NULL) {
3389                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3390                                  "'%s'not a public address\n",
3391                                  ctdb_addr_to_str(addr)));
3392                 return -1;
3393         }
3394
3395         /* count how many public ip structures we have */
3396         num = 0;
3397         for (;vnn->ifaces[num];) {
3398                 num++;
3399         }
3400
3401         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3402                 num*sizeof(struct ctdb_control_iface_info);
3403         info = talloc_zero_size(outdata, len);
3404         CTDB_NO_MEMORY(ctdb, info);
3405
3406         info->ip.addr = vnn->public_address;
3407         info->ip.pnn = vnn->pnn;
3408         info->active_idx = 0xFFFFFFFF;
3409
3410         for (i=0; vnn->ifaces[i]; i++) {
3411                 struct ctdb_iface *cur;
3412
3413                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3414                 if (cur == NULL) {
3415                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3416                                            vnn->ifaces[i]));
3417                         return -1;
3418                 }
3419                 if (vnn->iface == cur) {
3420                         info->active_idx = i;
3421                 }
3422                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3423                 info->ifaces[i].link_state = cur->link_up;
3424                 info->ifaces[i].references = cur->references;
3425         }
3426         info->num = i;
3427         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3428                 i*sizeof(struct ctdb_control_iface_info);
3429
3430         outdata->dsize = len;
3431         outdata->dptr  = (uint8_t *)info;
3432
3433         return 0;
3434 }
3435
3436 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3437                                 struct ctdb_req_control *c,
3438                                 TDB_DATA *outdata)
3439 {
3440         int i, num, len;
3441         struct ctdb_control_get_ifaces *ifaces;
3442         struct ctdb_iface *cur;
3443
3444         /* count how many public ip structures we have */
3445         num = 0;
3446         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3447                 num++;
3448         }
3449
3450         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3451                 num*sizeof(struct ctdb_control_iface_info);
3452         ifaces = talloc_zero_size(outdata, len);
3453         CTDB_NO_MEMORY(ctdb, ifaces);
3454
3455         i = 0;
3456         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3457                 strcpy(ifaces->ifaces[i].name, cur->name);
3458                 ifaces->ifaces[i].link_state = cur->link_up;
3459                 ifaces->ifaces[i].references = cur->references;
3460                 i++;
3461         }
3462         ifaces->num = i;
3463         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3464                 i*sizeof(struct ctdb_control_iface_info);
3465
3466         outdata->dsize = len;
3467         outdata->dptr  = (uint8_t *)ifaces;
3468
3469         return 0;
3470 }
3471
3472 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3473                                     struct ctdb_req_control *c,
3474                                     TDB_DATA indata)
3475 {
3476         struct ctdb_control_iface_info *info;
3477         struct ctdb_iface *iface;
3478         bool link_up = false;
3479
3480         info = (struct ctdb_control_iface_info *)indata.dptr;
3481
3482         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3483                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3484                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3485                                   len, len, info->name));
3486                 return -1;
3487         }
3488
3489         switch (info->link_state) {
3490         case 0:
3491                 link_up = false;
3492                 break;
3493         case 1:
3494                 link_up = true;
3495                 break;
3496         default:
3497                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3498                                   (unsigned int)info->link_state));
3499                 return -1;
3500         }
3501
3502         if (info->references != 0) {
3503                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3504                                   (unsigned int)info->references));
3505                 return -1;
3506         }
3507
3508         iface = ctdb_find_iface(ctdb, info->name);
3509         if (iface == NULL) {
3510                 return -1;
3511         }
3512
3513         if (link_up == iface->link_up) {
3514                 return 0;
3515         }
3516
3517         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3518               ("iface[%s] has changed it's link status %s => %s\n",
3519                iface->name,
3520                iface->link_up?"up":"down",
3521                link_up?"up":"down"));
3522
3523         iface->link_up = link_up;
3524         return 0;
3525 }
3526
3527
3528 /* 
3529    structure containing the listening socket and the list of tcp connections
3530    that the ctdb daemon is to kill
3531 */
3532 struct ctdb_kill_tcp {
3533         struct ctdb_vnn *vnn;
3534         struct ctdb_context *ctdb;
3535         int capture_fd;
3536         struct fd_event *fde;
3537         trbt_tree_t *connections;
3538         void *private_data;
3539 };
3540
3541 /*
3542   a tcp connection that is to be killed
3543  */
3544 struct ctdb_killtcp_con {
3545         ctdb_sock_addr src_addr;
3546         ctdb_sock_addr dst_addr;
3547         int count;
3548         struct ctdb_kill_tcp *killtcp;
3549 };
3550
3551 /* this function is used to create a key to represent this socketpair
3552    in the killtcp tree.
3553    this key is used to insert and lookup matching socketpairs that are
3554    to be tickled and RST
3555 */
3556 #define KILLTCP_KEYLEN  10
3557 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3558 {
3559         static uint32_t key[KILLTCP_KEYLEN];
3560
3561         bzero(key, sizeof(key));
3562
3563         if (src->sa.sa_family != dst->sa.sa_family) {
3564                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3565                 return key;
3566         }
3567         
3568         switch (src->sa.sa_family) {
3569         case AF_INET:
3570                 key[0]  = dst->ip.sin_addr.s_addr;
3571                 key[1]  = src->ip.sin_addr.s_addr;
3572                 key[2]  = dst->ip.sin_port;
3573                 key[3]  = src->ip.sin_port;
3574                 break;
3575         case AF_INET6: {
3576                 uint32_t *dst6_addr32 =
3577                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3578                 uint32_t *src6_addr32 =
3579                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3580                 key[0]  = dst6_addr32[3];
3581                 key[1]  = src6_addr32[3];
3582                 key[2]  = dst6_addr32[2];
3583                 key[3]  = src6_addr32[2];
3584                 key[4]  = dst6_addr32[1];
3585                 key[5]  = src6_addr32[1];
3586                 key[6]  = dst6_addr32[0];
3587                 key[7]  = src6_addr32[0];
3588                 key[8]  = dst->ip6.sin6_port;
3589                 key[9]  = src->ip6.sin6_port;
3590                 break;
3591         }
3592         default:
3593                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3594                 return key;
3595         }
3596
3597         return key;
3598 }
3599
3600 /*
3601   called when we get a read event on the raw socket
3602  */
3603 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3604                                 uint16_t flags, void *private_data)
3605 {
3606         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3607         struct ctdb_killtcp_con *con;
3608         ctdb_sock_addr src, dst;
3609         uint32_t ack_seq, seq;
3610
3611         if (!(flags & EVENT_FD_READ)) {
3612                 return;
3613         }
3614
3615         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3616                                 killtcp->private_data,
3617                                 &src, &dst,
3618                                 &ack_seq, &seq) != 0) {
3619                 /* probably a non-tcp ACK packet */
3620                 return;
3621         }
3622
3623         /* check if we have this guy in our list of connections
3624            to kill
3625         */
3626         con = trbt_lookuparray32(killtcp->connections, 
3627                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3628         if (con == NULL) {
3629                 /* no this was some other packet we can just ignore */
3630                 return;
3631         }
3632
3633         /* This one has been tickled !
3634            now reset him and remove him from the list.
3635          */
3636         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3637                 ntohs(con->dst_addr.ip.sin_port),
3638                 ctdb_addr_to_str(&con->src_addr),
3639                 ntohs(con->src_addr.ip.sin_port)));
3640
3641         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3642         talloc_free(con);
3643 }
3644
3645
3646 /* when traversing the list of all tcp connections to send tickle acks to
3647    (so that we can capture the ack coming back and kill the connection
3648     by a RST)
3649    this callback is called for each connection we are currently trying to kill
3650 */
3651 static int tickle_connection_traverse(void *param, void *data)
3652 {
3653         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3654
3655         /* have tried too many times, just give up */
3656         if (con->count >= 5) {
3657                 /* can't delete in traverse: reparent to delete_cons */
3658                 talloc_steal(param, con);
3659                 return 0;
3660         }
3661
3662         /* othervise, try tickling it again */
3663         con->count++;
3664         ctdb_sys_send_tcp(
3665                 (ctdb_sock_addr *)&con->dst_addr,
3666                 (ctdb_sock_addr *)&con->src_addr,
3667                 0, 0, 0);
3668         return 0;
3669 }
3670
3671
3672 /* 
3673    called every second until all sentenced connections have been reset
3674  */
3675 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3676                                               struct timeval t, void *private_data)
3677 {
3678         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3679         void *delete_cons = talloc_new(NULL);
3680
3681         /* loop over all connections sending tickle ACKs */
3682         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3683
3684         /* now we've finished traverse, it's safe to do deletion. */
3685         talloc_free(delete_cons);
3686
3687         /* If there are no more connections to kill we can remove the
3688            entire killtcp structure
3689          */
3690         if ( (killtcp->connections == NULL) || 
3691              (killtcp->connections->root == NULL) ) {
3692                 talloc_free(killtcp);
3693                 return;
3694         }
3695
3696         /* try tickling them again in a seconds time
3697          */
3698         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3699                         ctdb_tickle_sentenced_connections, killtcp);
3700 }
3701
3702 /*
3703   destroy the killtcp structure
3704  */
3705 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3706 {
3707         struct ctdb_vnn *tmpvnn;
3708
3709         /* verify that this vnn is still active */
3710         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3711                 if (tmpvnn == killtcp->vnn) {
3712                         break;
3713                 }
3714         }
3715
3716         if (tmpvnn == NULL) {
3717                 return 0;
3718         }
3719
3720         if (killtcp->vnn->killtcp != killtcp) {
3721                 return 0;
3722         }
3723
3724         killtcp->vnn->killtcp = NULL;
3725
3726         return 0;
3727 }
3728
3729
/* Insert callback for the killtcp tree: nothing fancy, the new
   connection structure (parm) unconditionally replaces whatever was
   stored before (data).

   The old one is deliberately not freed here; per the original
   author's note it is talloc_stolen by the same node in the tree and
   goes away when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3741
3742 /*
3743   add a tcp socket to the list of connections we want to RST
3744  */
3745 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3746                                        ctdb_sock_addr *s,
3747                                        ctdb_sock_addr *d)
3748 {
3749         ctdb_sock_addr src, dst;
3750         struct ctdb_kill_tcp *killtcp;
3751         struct ctdb_killtcp_con *con;
3752         struct ctdb_vnn *vnn;
3753
3754         ctdb_canonicalize_ip(s, &src);
3755         ctdb_canonicalize_ip(d, &dst);
3756
3757         vnn = find_public_ip_vnn(ctdb, &dst);
3758         if (vnn == NULL) {
3759                 vnn = find_public_ip_vnn(ctdb, &src);
3760         }
3761         if (vnn == NULL) {
3762                 /* if it is not a public ip   it could be our 'single ip' */
3763                 if (ctdb->single_ip_vnn) {
3764                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3765                                 vnn = ctdb->single_ip_vnn;
3766                         }
3767                 }
3768         }
3769         if (vnn == NULL) {
3770                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3771                 return -1;
3772         }
3773
3774         killtcp = vnn->killtcp;
3775         
3776         /* If this is the first connection to kill we must allocate
3777            a new structure
3778          */
3779         if (killtcp == NULL) {
3780                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3781                 CTDB_NO_MEMORY(ctdb, killtcp);
3782
3783                 killtcp->vnn         = vnn;
3784                 killtcp->ctdb        = ctdb;
3785                 killtcp->capture_fd  = -1;
3786                 killtcp->connections = trbt_create(killtcp, 0);
3787
3788                 vnn->killtcp         = killtcp;
3789                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3790         }
3791
3792
3793
3794         /* create a structure that describes this connection we want to
3795            RST and store it in killtcp->connections
3796         */
3797         con = talloc(killtcp, struct ctdb_killtcp_con);
3798         CTDB_NO_MEMORY(ctdb, con);
3799         con->src_addr = src;
3800         con->dst_addr = dst;
3801         con->count    = 0;
3802         con->killtcp  = killtcp;
3803
3804
3805         trbt_insertarray32_callback(killtcp->connections,
3806                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3807                         add_killtcp_callback, con);
3808
3809         /* 
3810            If we dont have a socket to listen on yet we must create it
3811          */
3812         if (killtcp->capture_fd == -1) {
3813                 const char *iface = ctdb_vnn_iface_string(vnn);
3814                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3815                 if (killtcp->capture_fd == -1) {
3816                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3817                                           "socket on iface '%s' for killtcp (%s)\n",
3818                                           iface, strerror(errno)));
3819                         goto failed;
3820                 }
3821         }
3822
3823
3824         if (killtcp->fde == NULL) {
3825                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3826                                             EVENT_FD_READ,
3827                                             capture_tcp_handler, killtcp);
3828                 tevent_fd_set_auto_close(killtcp->fde);
3829
3830                 /* We also need to set up some events to tickle all these connections
3831                    until they are all reset
3832                 */
3833                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3834                                 ctdb_tickle_sentenced_connections, killtcp);
3835         }
3836
3837         /* tickle him once now */
3838         ctdb_sys_send_tcp(
3839                 &con->dst_addr,
3840                 &con->src_addr,
3841                 0, 0, 0);
3842
3843         return 0;
3844
3845 failed:
3846         talloc_free(vnn->killtcp);
3847         vnn->killtcp = NULL;
3848         return -1;
3849 }
3850
3851 /*
3852   kill a TCP connection.
3853  */
3854 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3855 {
3856         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3857
3858         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3859 }
3860
3861 /*
3862   called by a daemon to inform us of the entire list of TCP tickles for
3863   a particular public address.
3864   this control should only be sent by the node that is currently serving
3865   that public address.
3866  */
3867 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3868 {
3869         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3870         struct ctdb_tcp_array *tcparray;
3871         struct ctdb_vnn *vnn;
3872
3873         /* We must at least have tickles.num or else we cant verify the size
3874            of the received data blob
3875          */
3876         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3877                                         tickles.connections)) {
3878                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3879                 return -1;
3880         }
3881
3882         /* verify that the size of data matches what we expect */
3883         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3884                                 tickles.connections)
3885                          + sizeof(struct ctdb_tcp_connection)
3886                                  * list->tickles.num) {
3887                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3888                 return -1;
3889         }
3890
3891         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3892                            ctdb_addr_to_str(&list->addr)));
3893
3894         vnn = find_public_ip_vnn(ctdb, &list->addr);
3895         if (vnn == NULL) {
3896                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3897                         ctdb_addr_to_str(&list->addr)));
3898
3899                 return 1;
3900         }
3901
3902         /* remove any old ticklelist we might have */
3903         talloc_free(vnn->tcp_array);
3904         vnn->tcp_array = NULL;
3905
3906         tcparray = talloc(vnn, struct ctdb_tcp_array);
3907         CTDB_NO_MEMORY(ctdb, tcparray);
3908
3909         tcparray->num = list->tickles.num;
3910
3911         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3912         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3913
3914         memcpy(tcparray->connections, &list->tickles.connections[0],
3915                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3916
3917         /* We now have a new fresh tickle list array for this vnn */
3918         vnn->tcp_array = tcparray;
3919
3920         return 0;
3921 }
3922
3923 /*
3924   called to return the full list of tickles for the puclic address associated 
3925   with the provided vnn
3926  */
3927 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3928 {
3929         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3930         struct ctdb_control_tcp_tickle_list *list;
3931         struct ctdb_tcp_array *tcparray;
3932         int num;
3933         struct ctdb_vnn *vnn;
3934
3935         vnn = find_public_ip_vnn(ctdb, addr);
3936         if (vnn == NULL) {
3937                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3938                         ctdb_addr_to_str(addr)));
3939
3940                 return 1;
3941         }
3942
3943         tcparray = vnn->tcp_array;
3944         if (tcparray) {
3945                 num = tcparray->num;
3946         } else {
3947                 num = 0;
3948         }
3949
3950         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3951                                 tickles.connections)
3952                         + sizeof(struct ctdb_tcp_connection) * num;
3953
3954         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3955         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3956         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3957
3958         list->addr = *addr;
3959         list->tickles.num = num;
3960         if (num) {
3961                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3962                         sizeof(struct ctdb_tcp_connection) * num);
3963         }
3964
3965         return 0;
3966 }
3967
3968
3969 /*
3970   set the list of all tcp tickles for a public address
3971  */
3972 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3973                                             ctdb_sock_addr *addr,
3974                                             struct ctdb_tcp_array *tcparray)
3975 {
3976         int ret, num;
3977         TDB_DATA data;
3978         struct ctdb_control_tcp_tickle_list *list;
3979
3980         if (tcparray) {
3981                 num = tcparray->num;
3982         } else {
3983                 num = 0;
3984         }
3985
3986         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3987                                 tickles.connections) +
3988                         sizeof(struct ctdb_tcp_connection) * num;
3989         data.dptr = talloc_size(ctdb, data.dsize);
3990         CTDB_NO_MEMORY(ctdb, data.dptr);
3991
3992         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3993         list->addr = *addr;
3994         list->tickles.num = num;
3995         if (tcparray) {
3996                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3997         }
3998
3999         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
4000                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4001                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4002         if (ret != 0) {
4003                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4004                 return -1;
4005         }
4006
4007         talloc_free(data.dptr);
4008
4009         return ret;
4010 }
4011
4012
4013 /*
4014   perform tickle updates if required
4015  */
4016 static void ctdb_update_tcp_tickles(struct event_context *ev, 
4017                                 struct timed_event *te, 
4018                                 struct timeval t, void *private_data)
4019 {
4020         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4021         int ret;
4022         struct ctdb_vnn *vnn;
4023
4024         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4025                 /* we only send out updates for public addresses that 
4026                    we have taken over
4027                  */
4028                 if (ctdb->pnn != vnn->pnn) {
4029                         continue;
4030                 }
4031                 /* We only send out the updates if we need to */
4032                 if (!vnn->tcp_update_needed) {
4033                         continue;
4034                 }
4035                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
4036                                                        &vnn->public_address,
4037                                                        vnn->tcp_array);
4038                 if (ret != 0) {
4039                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4040                                 ctdb_addr_to_str(&vnn->public_address)));
4041                 } else {
4042                         DEBUG(DEBUG_INFO,
4043                               ("Sent tickle update for public address %s\n",
4044                                ctdb_addr_to_str(&vnn->public_address)));
4045                         vnn->tcp_update_needed = false;
4046                 }
4047         }
4048
4049         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4050                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4051                              ctdb_update_tcp_tickles, ctdb);
4052 }               
4053         
4054
4055 /*
4056   start periodic update of tcp tickles
4057  */
4058 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4059 {
4060         ctdb->tickle_update_context = talloc_new(ctdb);
4061
4062         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4063                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4064                              ctdb_update_tcp_tickles, ctdb);
4065 }
4066
4067
4068
4069
4070 struct control_gratious_arp {
4071         struct ctdb_context *ctdb;
4072         ctdb_sock_addr addr;
4073         const char *iface;
4074         int count;
4075 };
4076
4077 /*
4078   send a control_gratuitous arp
4079  */
4080 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4081                                   struct timeval t, void *private_data)
4082 {
4083         int ret;
4084         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4085                                                         struct control_gratious_arp);
4086
4087         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4088         if (ret != 0) {
4089                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4090                                  arp->iface, strerror(errno)));
4091         }
4092
4093
4094         arp->count++;
4095         if (arp->count == CTDB_ARP_REPEAT) {
4096                 talloc_free(arp);
4097                 return;
4098         }
4099
4100         event_add_timed(arp->ctdb->ev, arp, 
4101                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4102                         send_gratious_arp, arp);
4103 }
4104
4105
4106 /*
4107   send a gratious arp 
4108  */
4109 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4110 {
4111         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4112         struct control_gratious_arp *arp;
4113
4114         /* verify the size of indata */
4115         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4116                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4117                                  (unsigned)indata.dsize, 
4118                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4119                 return -1;
4120         }
4121         if (indata.dsize != 
4122                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4123                 + gratious_arp->len ) ){
4124
4125                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4126                         "but should be %u bytes\n", 
4127                          (unsigned)indata.dsize, 
4128                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4129                 return -1;
4130         }
4131
4132
4133         arp = talloc(ctdb, struct control_gratious_arp);
4134         CTDB_NO_MEMORY(ctdb, arp);
4135
4136         arp->ctdb  = ctdb;
4137         arp->addr   = gratious_arp->addr;
4138         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4139         CTDB_NO_MEMORY(ctdb, arp->iface);
4140         arp->count = 0;
4141         
4142         event_add_timed(arp->ctdb->ev, arp, 
4143                         timeval_zero(), send_gratious_arp, arp);
4144
4145         return 0;
4146 }
4147
4148 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4149 {
4150         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4151         int ret;
4152
4153         /* verify the size of indata */
4154         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4155                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4156                 return -1;
4157         }
4158         if (indata.dsize != 
4159                 ( offsetof(struct ctdb_control_ip_iface, iface)
4160                 + pub->len ) ){
4161
4162                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4163                         "but should be %u bytes\n", 
4164                          (unsigned)indata.dsize, 
4165                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4166                 return -1;
4167         }
4168
4169         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4170
4171         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4172
4173         if (ret != 0) {
4174                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4175                 return -1;
4176         }
4177
4178         return 0;
4179 }
4180
/*
  State carried from ctdb_control_del_public_address to
  delete_ip_callback.
 */
struct delete_ip_callback_state {
	/* the control request that delete_ip_callback replies to */
	struct ctdb_req_control *c;
};
4184
4185 /*
4186   called when releaseip event finishes for del_public_address
4187  */
4188 static void delete_ip_callback(struct ctdb_context *ctdb,
4189                                int32_t status, TDB_DATA data,
4190                                const char *errormsg,
4191                                void *private_data)
4192 {
4193         struct delete_ip_callback_state *state =
4194                 talloc_get_type(private_data, struct delete_ip_callback_state);
4195
4196         /* If release failed then fail. */
4197         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4198         talloc_free(private_data);
4199 }
4200
4201 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4202                                         struct ctdb_req_control *c,
4203                                         TDB_DATA indata, bool *async_reply)
4204 {
4205         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4206         struct ctdb_vnn *vnn;
4207
4208         /* verify the size of indata */
4209         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4210                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4211                 return -1;
4212         }
4213         if (indata.dsize != 
4214                 ( offsetof(struct ctdb_control_ip_iface, iface)
4215                 + pub->len ) ){
4216
4217                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4218                         "but should be %u bytes\n", 
4219                          (unsigned)indata.dsize, 
4220                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4221                 return -1;
4222         }
4223
4224         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4225
4226         /* walk over all public addresses until we find a match */
4227         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4228                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4229                         if (vnn->pnn == ctdb->pnn) {
4230                                 struct delete_ip_callback_state *state;
4231                                 struct ctdb_public_ip *ip;
4232                                 TDB_DATA data;
4233                                 int ret;
4234
4235                                 vnn->delete_pending = true;
4236
4237                                 state = talloc(ctdb,
4238                                                struct delete_ip_callback_state);
4239                                 CTDB_NO_MEMORY(ctdb, state);
4240                                 state->c = c;
4241
4242                                 ip = talloc(state, struct ctdb_public_ip);
4243                                 if (ip == NULL) {
4244                                         DEBUG(DEBUG_ERR,
4245                                               (__location__ " Out of memory\n"));
4246                                         talloc_free(state);
4247                                         return -1;
4248                                 }
4249                                 ip->pnn = -1;
4250                                 ip->addr = pub->addr;
4251
4252                                 data.dsize = sizeof(struct ctdb_public_ip);
4253                                 data.dptr = (unsigned char *)ip;
4254
4255                                 ret = ctdb_daemon_send_control(ctdb,
4256                                                                ctdb_get_pnn(ctdb),
4257                                                                0,
4258                                                                CTDB_CONTROL_RELEASE_IP,
4259                                                                0, 0,
4260                                                                data,
4261                                                                delete_ip_callback,
4262                                                                state);
4263                                 if (ret == -1) {
4264                                         DEBUG(DEBUG_ERR,
4265                                               (__location__ "Unable to send "
4266                                                "CTDB_CONTROL_RELEASE_IP\n"));
4267                                         talloc_free(state);
4268                                         return -1;
4269                                 }
4270
4271                                 state->c = talloc_steal(state, c);
4272                                 *async_reply = true;
4273                         } else {
4274                                 /* This IP is not hosted on the
4275                                  * current node so just delete it
4276                                  * now. */
4277                                 do_delete_ip(ctdb, vnn);
4278                         }
4279
4280                         return 0;
4281                 }
4282         }
4283
4284         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4285                          ctdb_addr_to_str(&pub->addr)));
4286         return -1;
4287 }
4288
4289
/*
  State carried from ctdb_control_ipreallocated to
  ctdb_ipreallocated_callback.
 */
struct ipreallocated_callback_state {
	/* the control request that the callback replies to */
	struct ctdb_req_control *c;
};
4293
4294 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4295                                         int status, void *p)
4296 {
4297         struct ipreallocated_callback_state *state =
4298                 talloc_get_type(p, struct ipreallocated_callback_state);
4299
4300         if (status != 0) {
4301                 DEBUG(DEBUG_ERR,
4302                       (" \"ipreallocated\" event script failed (status %d)\n",
4303                        status));
4304                 if (status == -ETIME) {
4305                         ctdb_ban_self(ctdb);
4306                 }
4307         }
4308
4309         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4310         talloc_free(state);
4311 }
4312
4313 /* A control to run the ipreallocated event */
4314 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4315                                    struct ctdb_req_control *c,
4316                                    bool *async_reply)
4317 {
4318         int ret;
4319         struct ipreallocated_callback_state *state;
4320
4321         state = talloc(ctdb, struct ipreallocated_callback_state);
4322         CTDB_NO_MEMORY(ctdb, state);
4323
4324         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4325
4326         ret = ctdb_event_script_callback(ctdb, state,
4327                                          ctdb_ipreallocated_callback, state,
4328                                          CTDB_EVENT_IPREALLOCATED,
4329                                          "%s", "");
4330
4331         if (ret != 0) {
4332                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4333                 talloc_free(state);
4334                 return -1;
4335         }
4336
4337         /* tell the control that we will be reply asynchronously */
4338         state->c    = talloc_steal(state, c);
4339         *async_reply = true;
4340
4341         return 0;
4342 }
4343
4344
4345 /* This function is called from the recovery daemon to verify that a remote
4346    node has the expected ip allocation.
4347    This is verified against ctdb->ip_tree
4348 */
4349 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4350                                 struct ctdb_all_public_ips *ips,
4351                                 uint32_t pnn)
4352 {
4353         struct ctdb_public_ip_list *tmp_ip; 
4354         int i;
4355
4356         if (ctdb->ip_tree == NULL) {
4357                 /* dont know the expected allocation yet, assume remote node
4358                    is correct. */
4359                 return 0;
4360         }
4361
4362         if (ips == NULL) {
4363                 return 0;
4364         }
4365
4366         for (i=0; i<ips->num; i++) {
4367                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4368                 if (tmp_ip == NULL) {
4369                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4370                         return -1;
4371                 }
4372
4373                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4374                         continue;
4375                 }
4376
4377                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4378                         DEBUG(DEBUG_ERR,
4379                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4380                                pnn,
4381                                ctdb_addr_to_str(&ips->ips[i].addr),
4382                                ips->ips[i].pnn, tmp_ip->pnn));
4383                         return -1;
4384                 }
4385         }
4386
4387         return 0;
4388 }
4389
4390 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4391 {
4392         struct ctdb_public_ip_list *tmp_ip; 
4393
4394         if (ctdb->ip_tree == NULL) {
4395                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4396                 return -1;
4397         }
4398
4399         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4400         if (tmp_ip == NULL) {
4401                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4402                 return -1;
4403         }
4404
4405         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4406         tmp_ip->pnn = ip->pnn;
4407
4408         return 0;
4409 }
4410
4411
/*
  Bookkeeping for one in-flight "reload public IPs" request: the forked
  child doing the work, the pipe it reports through and the control
  request waiting for the result.
 */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* pending control request; replied to by the destructor */
	struct ctdb_req_control *c;
	/* status sent in the reply; initialised to -1 by the control */
	int status;
	/* pipe: the child writes its result to fd[1], we read fd[0] */
	int fd[2];
	/* pid of the forked child; killed by the destructor */
	pid_t child;
	/* fd event watching fd[0] for the child's result */
	struct fd_event *fde;
};
4420
4421 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4422 {
4423         if (h == h->ctdb->reload_ips) {
4424                 h->ctdb->reload_ips = NULL;
4425         }
4426         if (h->c != NULL) {
4427                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4428                 h->c = NULL;
4429         }
4430         ctdb_kill(h->ctdb, h->child, SIGKILL);
4431         return 0;
4432 }
4433
4434 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4435                                 struct timed_event *te,
4436                                 struct timeval t, void *private_data)
4437 {
4438         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4439
4440         talloc_free(h);
4441 }       
4442
4443 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4444                              uint16_t flags, void *private_data)
4445 {
4446         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4447
4448         char res;
4449         int ret;
4450
4451         ret = sys_read(h->fd[0], &res, 1);
4452         if (ret < 1 || res != 0) {
4453                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4454                 res = 1;
4455         }
4456         h->status = res;
4457
4458         talloc_free(h);
4459 }
4460
4461 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4462 {
4463         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4464         struct ctdb_all_public_ips *ips;
4465         struct ctdb_vnn *vnn;
4466         struct client_async_data *async_data;
4467         struct timeval timeout;
4468         TDB_DATA data;
4469         struct ctdb_client_control_state *state;
4470         bool first_add;
4471         int i, ret;
4472
4473         CTDB_NO_MEMORY(ctdb, mem_ctx);
4474
4475         /* Read IPs from local node */
4476         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4477                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4478         if (ret != 0) {
4479                 DEBUG(DEBUG_ERR,
4480                       ("Unable to fetch public IPs from local node\n"));
4481                 talloc_free(mem_ctx);
4482                 return -1;
4483         }
4484
4485         /* Read IPs file - this is safe since this is a child process */
4486         ctdb->vnn = NULL;
4487         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4488                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4489                 talloc_free(mem_ctx);
4490                 return -1;
4491         }
4492
4493         async_data = talloc_zero(mem_ctx, struct client_async_data);
4494         CTDB_NO_MEMORY(ctdb, async_data);
4495
4496         /* Compare IPs between node and file for IPs to be deleted */
4497         for (i = 0; i < ips->num; i++) {
4498                 /* */
4499                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4500                         if (ctdb_same_ip(&vnn->public_address,
4501                                          &ips->ips[i].addr)) {
4502                                 /* IP is still in file */
4503                                 break;
4504                         }
4505                 }
4506
4507                 if (vnn == NULL) {
4508                         /* Delete IP ips->ips[i] */
4509                         struct ctdb_control_ip_iface *pub;
4510
4511                         DEBUG(DEBUG_NOTICE,
4512                               ("IP %s no longer configured, deleting it\n",
4513                                ctdb_addr_to_str(&ips->ips[i].addr)));
4514
4515                         pub = talloc_zero(mem_ctx,
4516                                           struct ctdb_control_ip_iface);
4517                         CTDB_NO_MEMORY(ctdb, pub);
4518
4519                         pub->addr  = ips->ips[i].addr;
4520                         pub->mask  = 0;
4521                         pub->len   = 0;
4522
4523                         timeout = TAKEOVER_TIMEOUT();
4524
4525                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4526                                               iface) + pub->len;
4527                         data.dptr = (uint8_t *)pub;
4528
4529                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4530                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4531                                                   0, data, async_data,
4532                                                   &timeout, NULL);
4533                         if (state == NULL) {
4534                                 DEBUG(DEBUG_ERR,
4535                                       (__location__
4536                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4537                                 goto failed;
4538                         }
4539
4540                         ctdb_client_async_add(async_data, state);
4541                 }
4542         }
4543
4544         /* Compare IPs between node and file for IPs to be added */
4545         first_add = true;
4546         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4547                 for (i = 0; i < ips->num; i++) {
4548                         if (ctdb_same_ip(&vnn->public_address,
4549                                          &ips->ips[i].addr)) {
4550                                 /* IP already on node */
4551                                 break;
4552                         }
4553                 }
4554                 if (i == ips->num) {
4555                         /* Add IP ips->ips[i] */
4556                         struct ctdb_control_ip_iface *pub;
4557                         const char *ifaces = NULL;
4558                         uint32_t len;
4559                         int iface = 0;
4560
4561                         DEBUG(DEBUG_NOTICE,
4562                               ("New IP %s configured, adding it\n",
4563                                ctdb_addr_to_str(&vnn->public_address)));
4564                         if (first_add) {
4565                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4566
4567                                 data.dsize = sizeof(pnn);
4568                                 data.dptr  = (uint8_t *)&pnn;
4569
4570                                 ret = ctdb_client_send_message(
4571                                         ctdb,
4572                                         CTDB_BROADCAST_CONNECTED,
4573                                         CTDB_SRVID_REBALANCE_NODE,
4574                                         data);
4575                                 if (ret != 0) {
4576                                         DEBUG(DEBUG_WARNING,
4577                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4578                                 }
4579
4580                                 first_add = false;
4581                         }
4582
4583                         ifaces = vnn->ifaces[0];
4584                         iface = 1;
4585                         while (vnn->ifaces[iface] != NULL) {
4586                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4587                                                          vnn->ifaces[iface]);
4588                                 iface++;
4589                         }
4590
4591                         len   = strlen(ifaces) + 1;
4592                         pub = talloc_zero_size(mem_ctx,
4593                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4594                         CTDB_NO_MEMORY(ctdb, pub);
4595
4596                         pub->addr  = vnn->public_address;
4597                         pub->mask  = vnn->public_netmask_bits;
4598                         pub->len   = len;
4599                         memcpy(&pub->iface[0], ifaces, pub->len);
4600
4601                         timeout = TAKEOVER_TIMEOUT();
4602
4603                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4604                                               iface) + pub->len;
4605                         data.dptr = (uint8_t *)pub;
4606
4607                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4608                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4609                                                   0, data, async_data,
4610                                                   &timeout, NULL);
4611                         if (state == NULL) {
4612                                 DEBUG(DEBUG_ERR,
4613                                       (__location__
4614                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4615                                 goto failed;
4616                         }
4617
4618                         ctdb_client_async_add(async_data, state);
4619                 }
4620         }
4621
4622         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4623                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4624                 goto failed;
4625         }
4626
4627         talloc_free(mem_ctx);
4628         return 0;
4629
4630 failed:
4631         talloc_free(mem_ctx);
4632         return -1;
4633 }
4634
4635 /* This control is sent to force the node to re-read the public addresses file
4636    and drop any addresses we should nnot longer host, and add new addresses
4637    that we are now able to host
4638 */
4639 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4640 {
4641         struct ctdb_reloadips_handle *h;
4642         pid_t parent = getpid();
4643
4644         if (ctdb->reload_ips != NULL) {
4645                 talloc_free(ctdb->reload_ips);
4646                 ctdb->reload_ips = NULL;
4647         }
4648
4649         h = talloc(ctdb, struct ctdb_reloadips_handle);
4650         CTDB_NO_MEMORY(ctdb, h);
4651         h->ctdb     = ctdb;
4652         h->c        = NULL;
4653         h->status   = -1;
4654         
4655         if (pipe(h->fd) == -1) {
4656                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4657                 talloc_free(h);
4658                 return -1;
4659         }
4660
4661         h->child = ctdb_fork(ctdb);
4662         if (h->child == (pid_t)-1) {
4663                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4664                 close(h->fd[0]);
4665                 close(h->fd[1]);
4666                 talloc_free(h);
4667                 return -1;
4668         }
4669
4670         /* child process */
4671         if (h->child == 0) {
4672                 signed char res = 0;
4673
4674                 close(h->fd[0]);
4675                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4676
4677                 ctdb_set_process_name("ctdb_reloadips");
4678                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4679                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4680                         res = -1;
4681                 } else {
4682                         res = ctdb_reloadips_child(ctdb);
4683                         if (res != 0) {
4684                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4685                         }
4686                 }
4687
4688                 sys_write(h->fd[1], &res, 1);
4689                 /* make sure we die when our parent dies */
4690                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4691                         sleep(5);
4692                 }
4693                 _exit(0);
4694         }
4695
4696         h->c             = talloc_steal(h, c);
4697
4698         close(h->fd[1]);
4699         set_close_on_exec(h->fd[0]);
4700
4701         talloc_set_destructor(h, ctdb_reloadips_destructor);
4702
4703
4704         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4705                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4706                         (void *)h);
4707         tevent_fd_set_auto_close(h->fde);
4708
4709         event_add_timed(ctdb->ev, h,
4710                         timeval_current_ofs(120, 0),
4711                         ctdb_reloadips_timeout_event, h);
4712
4713         /* we reply later */
4714         *async_reply = true;
4715         return 0;
4716 }