ctdb: Coverity fix for CID 1125630
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value built from the takeover_timeout tunable.  The
 * expansion refers to a variable named 'ctdb', which must be in scope
 * wherever the macro is used.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument given to timeval_current_ofs() when re-arming the
 * gratuitous ARP timer (presumably seconds - confirm). */
#define CTDB_ARP_INTERVAL 1
/* Number of rounds ctdb_control_send_arp() runs before it stops. */
#define CTDB_ARP_REPEAT   3
35
/*
 * Flags used in IP allocation algorithms.
 * NOTE(review): the users of these flags are outside this part of
 * the file; the names suggest "may not take over IPs" and "may not
 * host IPs" - confirm against the allocation code.
 */
struct ctdb_ipflags {
        bool noiptakeover;
        bool noiphost;
};
41
/* A local network interface that public addresses can be assigned to. */
struct ctdb_iface {
        struct ctdb_iface *prev, *next; /* linkage for the ctdb->ifaces list */
        const char *name;               /* interface name, allocated off this struct */
        bool link_up;                   /* only interfaces with this set are chosen for IPs */
        uint32_t references;            /* number of VNNs currently assigned to this interface */
};
48
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
50 {
51         if (vnn->iface) {
52                 return vnn->iface->name;
53         }
54
55         return "__none__";
56 }
57
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
59 {
60         struct ctdb_iface *i;
61
62         /* Verify that we dont have an entry for this ip yet */
63         for (i=ctdb->ifaces;i;i=i->next) {
64                 if (strcmp(i->name, iface) == 0) {
65                         return 0;
66                 }
67         }
68
69         /* create a new structure for this interface */
70         i = talloc_zero(ctdb, struct ctdb_iface);
71         CTDB_NO_MEMORY_FATAL(ctdb, i);
72         i->name = talloc_strdup(i, iface);
73         CTDB_NO_MEMORY(ctdb, i->name);
74         /*
75          * If link_up defaults to true then IPs can be allocated to a
76          * node during the first recovery.  However, then an interface
77          * could have its link marked down during the startup event,
78          * causing the IP to move almost immediately.  If link_up
79          * defaults to false then, during normal operation, IPs added
80          * to a new interface can't be assigned until a monitor cycle
81          * has occurred and marked the new interfaces up.  This makes
82          * IP allocation unpredictable.  The following is a neat
83          * compromise: early in startup link_up defaults to false, so
84          * IPs can't be assigned, and after startup IPs can be
85          * assigned immediately.
86          */
87         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
88
89         DLIST_ADD(ctdb->ifaces, i);
90
91         return 0;
92 }
93
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95                                         const char *name)
96 {
97         int n;
98
99         for (n = 0; vnn->ifaces[n] != NULL; n++) {
100                 if (strcmp(name, vnn->ifaces[n]) == 0) {
101                         return true;
102                 }
103         }
104
105         return false;
106 }
107
108 /* If any interfaces now have no possible IPs then delete them.  This
109  * implementation is naive (i.e. simple) rather than clever
110  * (i.e. complex).  Given that this is run on delip and that operation
111  * is rare, this doesn't need to be efficient - it needs to be
112  * foolproof.  One alternative is reference counting, where the logic
113  * is distributed and can, therefore, be broken in multiple places.
114  * Another alternative is to build a red-black tree of interfaces that
115  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116  * once) and then walking ctdb->ifaces once and deleting those not in
117  * the tree.  Let's go to one of those if the naive implementation
118  * causes problems...  :-)
119  */
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121                                         struct ctdb_vnn *vnn)
122 {
123         struct ctdb_iface *i, *next;
124
125         /* For each interface, check if there's an IP using it. */
126         for (i = ctdb->ifaces; i != NULL; i = next) {
127                 struct ctdb_vnn *tv;
128                 bool found;
129                 next = i->next;
130
131                 /* Only consider interfaces named in the given VNN. */
132                 if (!vnn_has_interface_with_name(vnn, i->name)) {
133                         continue;
134                 }
135
136                 /* Is the "single IP" on this interface? */
137                 if ((ctdb->single_ip_vnn != NULL) &&
138                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140                         /* Found, next interface please... */
141                         continue;
142                 }
143                 /* Search for a vnn with this interface. */
144                 found = false;
145                 for (tv=ctdb->vnn; tv; tv=tv->next) {
146                         if (vnn_has_interface_with_name(tv, i->name)) {
147                                 found = true;
148                                 break;
149                         }
150                 }
151
152                 if (!found) {
153                         /* None of the VNNs are using this interface. */
154                         DLIST_REMOVE(ctdb->ifaces, i);
155                         talloc_free(i);
156                 }
157         }
158 }
159
160
161 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
162                                           const char *iface)
163 {
164         struct ctdb_iface *i;
165
166         for (i=ctdb->ifaces;i;i=i->next) {
167                 if (strcmp(i->name, iface) == 0) {
168                         return i;
169                 }
170         }
171
172         return NULL;
173 }
174
175 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
176                                               struct ctdb_vnn *vnn)
177 {
178         int i;
179         struct ctdb_iface *cur = NULL;
180         struct ctdb_iface *best = NULL;
181
182         for (i=0; vnn->ifaces[i]; i++) {
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (!cur->link_up) {
190                         continue;
191                 }
192
193                 if (best == NULL) {
194                         best = cur;
195                         continue;
196                 }
197
198                 if (cur->references < best->references) {
199                         best = cur;
200                         continue;
201                 }
202         }
203
204         return best;
205 }
206
207 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
208                                      struct ctdb_vnn *vnn)
209 {
210         struct ctdb_iface *best = NULL;
211
212         if (vnn->iface) {
213                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
214                                    "still assigned to iface '%s'\n",
215                                    ctdb_addr_to_str(&vnn->public_address),
216                                    ctdb_vnn_iface_string(vnn)));
217                 return 0;
218         }
219
220         best = ctdb_vnn_best_iface(ctdb, vnn);
221         if (best == NULL) {
222                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
223                                   "cannot assign to iface any iface\n",
224                                   ctdb_addr_to_str(&vnn->public_address)));
225                 return -1;
226         }
227
228         vnn->iface = best;
229         best->references++;
230         vnn->pnn = ctdb->pnn;
231
232         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233                            "now assigned to iface '%s' refs[%d]\n",
234                            ctdb_addr_to_str(&vnn->public_address),
235                            ctdb_vnn_iface_string(vnn),
236                            best->references));
237         return 0;
238 }
239
240 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
241                                     struct ctdb_vnn *vnn)
242 {
243         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
244                            "now unassigned (old iface '%s' refs[%d])\n",
245                            ctdb_addr_to_str(&vnn->public_address),
246                            ctdb_vnn_iface_string(vnn),
247                            vnn->iface?vnn->iface->references:0));
248         if (vnn->iface) {
249                 vnn->iface->references--;
250         }
251         vnn->iface = NULL;
252         if (vnn->pnn == ctdb->pnn) {
253                 vnn->pnn = -1;
254         }
255 }
256
257 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
258                                struct ctdb_vnn *vnn)
259 {
260         int i;
261
262         if (vnn->delete_pending) {
263                 return false;
264         }
265
266         if (vnn->iface && vnn->iface->link_up) {
267                 return true;
268         }
269
270         for (i=0; vnn->ifaces[i]; i++) {
271                 struct ctdb_iface *cur;
272
273                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
274                 if (cur == NULL) {
275                         continue;
276                 }
277
278                 if (cur->link_up) {
279                         return true;
280                 }
281         }
282
283         return false;
284 }
285
/* State carried across the repeated runs of ctdb_control_send_arp(). */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        uint32_t count;                  /* rounds completed so far */
        ctdb_sock_addr addr;             /* address being announced */
        struct ctdb_tcp_array *tcparray; /* connections to send tickle acks for, may be NULL */
        struct ctdb_vnn *vnn;            /* VNN the announcement is for */
};
293
294
/*
  lists of tcp endpoints: one doubly linked list node per connection
 */
struct ctdb_tcp_list {
        struct ctdb_tcp_list *prev, *next;
        struct ctdb_tcp_connection connection;
};
302
/*
  list of clients to kill on IP release
 */
struct ctdb_client_ip {
        struct ctdb_client_ip *prev, *next;
        struct ctdb_context *ctdb;
        ctdb_sock_addr addr;    /* IP the client is registered with */
        uint32_t client_id;     /* id used to look up the struct ctdb_client */
};
312
313
314 /*
315   send a gratuitous arp
316  */
317 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
318                                   struct timeval t, void *private_data)
319 {
320         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
321                                                         struct ctdb_takeover_arp);
322         int i, ret;
323         struct ctdb_tcp_array *tcparray;
324         const char *iface = ctdb_vnn_iface_string(arp->vnn);
325
326         ret = ctdb_sys_send_arp(&arp->addr, iface);
327         if (ret != 0) {
328                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
329                                   iface, strerror(errno)));
330         }
331
332         tcparray = arp->tcparray;
333         if (tcparray) {
334                 for (i=0;i<tcparray->num;i++) {
335                         struct ctdb_tcp_connection *tcon;
336
337                         tcon = &tcparray->connections[i];
338                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
339                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
340                                 ctdb_addr_to_str(&tcon->src_addr),
341                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
342                         ret = ctdb_sys_send_tcp(
343                                 &tcon->src_addr, 
344                                 &tcon->dst_addr,
345                                 0, 0, 0);
346                         if (ret != 0) {
347                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
348                                         ctdb_addr_to_str(&tcon->src_addr)));
349                         }
350                 }
351         }
352
353         arp->count++;
354
355         if (arp->count == CTDB_ARP_REPEAT) {
356                 talloc_free(arp);
357                 return;
358         }
359
360         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
361                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
362                         ctdb_control_send_arp, arp);
363 }
364
365 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
366                                        struct ctdb_vnn *vnn)
367 {
368         struct ctdb_takeover_arp *arp;
369         struct ctdb_tcp_array *tcparray;
370
371         if (!vnn->takeover_ctx) {
372                 vnn->takeover_ctx = talloc_new(vnn);
373                 if (!vnn->takeover_ctx) {
374                         return -1;
375                 }
376         }
377
378         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
379         if (!arp) {
380                 return -1;
381         }
382
383         arp->ctdb = ctdb;
384         arp->addr = vnn->public_address;
385         arp->vnn  = vnn;
386
387         tcparray = vnn->tcp_array;
388         if (tcparray) {
389                 /* add all of the known tcp connections for this IP to the
390                    list of tcp connections to send tickle acks for */
391                 arp->tcparray = talloc_steal(arp, tcparray);
392
393                 vnn->tcp_array = NULL;
394                 vnn->tcp_update_needed = true;
395         }
396
397         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
398                         timeval_zero(), ctdb_control_send_arp, arp);
399
400         return 0;
401 }
402
/*
 * Callback state holding a control request, an address and a VNN.
 * NOTE(review): not used in the part of the file visible here;
 * presumably for the release-IP path - confirm.
 */
struct takeover_callback_state {
        struct ctdb_req_control *c;
        ctdb_sock_addr *addr;
        struct ctdb_vnn *vnn;
};
408
/* State handed from ctdb_do_takeip() to ctdb_do_takeip_callback(). */
struct ctdb_do_takeip_state {
        struct ctdb_req_control *c;     /* request to reply to when the event finishes */
        struct ctdb_vnn *vnn;           /* VNN being taken over */
};
413
414 /*
415   called when takeip event finishes
416  */
417 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
418                                     void *private_data)
419 {
420         struct ctdb_do_takeip_state *state =
421                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
422         int32_t ret;
423         TDB_DATA data;
424
425         if (status != 0) {
426                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
427         
428                 if (status == -ETIME) {
429                         ctdb_ban_self(ctdb);
430                 }
431                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
432                                  ctdb_addr_to_str(&state->vnn->public_address),
433                                  ctdb_vnn_iface_string(state->vnn)));
434                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
435
436                 node->flags |= NODE_FLAGS_UNHEALTHY;
437                 talloc_free(state);
438                 return;
439         }
440
441         if (ctdb->do_checkpublicip) {
442
443         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
444         if (ret != 0) {
445                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
446                 talloc_free(state);
447                 return;
448         }
449
450         }
451
452         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
453         data.dsize = strlen((char *)data.dptr) + 1;
454         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
455
456         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
457
458
459         /* the control succeeded */
460         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
461         talloc_free(state);
462         return;
463 }
464
465 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
466 {
467         state->vnn->update_in_flight = false;
468         return 0;
469 }
470
471 /*
472   take over an ip address
473  */
474 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
475                               struct ctdb_req_control *c,
476                               struct ctdb_vnn *vnn)
477 {
478         int ret;
479         struct ctdb_do_takeip_state *state;
480
481         if (vnn->update_in_flight) {
482                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
483                                     "update for this IP already in flight\n",
484                                     ctdb_addr_to_str(&vnn->public_address),
485                                     vnn->public_netmask_bits));
486                 return -1;
487         }
488
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
492                                  "assign a usable interface\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits));
495                 return -1;
496         }
497
498         state = talloc(vnn, struct ctdb_do_takeip_state);
499         CTDB_NO_MEMORY(ctdb, state);
500
501         state->c = talloc_steal(ctdb, c);
502         state->vnn   = vnn;
503
504         vnn->update_in_flight = true;
505         talloc_set_destructor(state, ctdb_takeip_destructor);
506
507         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
508                             ctdb_addr_to_str(&vnn->public_address),
509                             vnn->public_netmask_bits,
510                             ctdb_vnn_iface_string(vnn)));
511
512         ret = ctdb_event_script_callback(ctdb,
513                                          state,
514                                          ctdb_do_takeip_callback,
515                                          state,
516                                          CTDB_EVENT_TAKE_IP,
517                                          "%s %s %u",
518                                          ctdb_vnn_iface_string(vnn),
519                                          ctdb_addr_to_str(&vnn->public_address),
520                                          vnn->public_netmask_bits);
521
522         if (ret != 0) {
523                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
524                         ctdb_addr_to_str(&vnn->public_address),
525                         ctdb_vnn_iface_string(vnn)));
526                 talloc_free(state);
527                 return -1;
528         }
529
530         return 0;
531 }
532
/* State handed from ctdb_do_updateip() to ctdb_do_updateip_callback(). */
struct ctdb_do_updateip_state {
        struct ctdb_req_control *c;     /* request to reply to when the event finishes */
        struct ctdb_iface *old;         /* interface the IP is being moved away from */
        struct ctdb_vnn *vnn;           /* VNN being moved */
};
538
539 /*
540   called when updateip event finishes
541  */
542 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
543                                       void *private_data)
544 {
545         struct ctdb_do_updateip_state *state =
546                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
547         int32_t ret;
548
549         if (status != 0) {
550                 if (status == -ETIME) {
551                         ctdb_ban_self(ctdb);
552                 }
553                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
554                         ctdb_addr_to_str(&state->vnn->public_address),
555                         state->old->name,
556                         ctdb_vnn_iface_string(state->vnn)));
557
558                 /*
559                  * All we can do is reset the old interface
560                  * and let the next run fix it
561                  */
562                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
563                 state->vnn->iface = state->old;
564                 state->vnn->iface->references++;
565
566                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
567                 talloc_free(state);
568                 return;
569         }
570
571         if (ctdb->do_checkpublicip) {
572
573         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
574         if (ret != 0) {
575                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
576                 talloc_free(state);
577                 return;
578         }
579
580         }
581
582         /* the control succeeded */
583         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
584         talloc_free(state);
585         return;
586 }
587
588 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
589 {
590         state->vnn->update_in_flight = false;
591         return 0;
592 }
593
594 /*
595   update (move) an ip address
596  */
597 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
598                                 struct ctdb_req_control *c,
599                                 struct ctdb_vnn *vnn)
600 {
601         int ret;
602         struct ctdb_do_updateip_state *state;
603         struct ctdb_iface *old = vnn->iface;
604         const char *new_name;
605
606         if (vnn->update_in_flight) {
607                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
608                                     "update for this IP already in flight\n",
609                                     ctdb_addr_to_str(&vnn->public_address),
610                                     vnn->public_netmask_bits));
611                 return -1;
612         }
613
614         ctdb_vnn_unassign_iface(ctdb, vnn);
615         ret = ctdb_vnn_assign_iface(ctdb, vnn);
616         if (ret != 0) {
617                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
618                                  "assin a usable interface (old iface '%s')\n",
619                                  ctdb_addr_to_str(&vnn->public_address),
620                                  vnn->public_netmask_bits,
621                                  old->name));
622                 return -1;
623         }
624
625         new_name = ctdb_vnn_iface_string(vnn);
626         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
627                 /* A benign update from one interface onto itself.
628                  * no need to run the eventscripts in this case, just return
629                  * success.
630                  */
631                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
632                 return 0;
633         }
634
635         state = talloc(vnn, struct ctdb_do_updateip_state);
636         CTDB_NO_MEMORY(ctdb, state);
637
638         state->c = talloc_steal(ctdb, c);
639         state->old = old;
640         state->vnn = vnn;
641
642         vnn->update_in_flight = true;
643         talloc_set_destructor(state, ctdb_updateip_destructor);
644
645         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
646                             "interface %s to %s\n",
647                             ctdb_addr_to_str(&vnn->public_address),
648                             vnn->public_netmask_bits,
649                             old->name,
650                             new_name));
651
652         ret = ctdb_event_script_callback(ctdb,
653                                          state,
654                                          ctdb_do_updateip_callback,
655                                          state,
656                                          CTDB_EVENT_UPDATE_IP,
657                                          "%s %s %s %u",
658                                          state->old->name,
659                                          new_name,
660                                          ctdb_addr_to_str(&vnn->public_address),
661                                          vnn->public_netmask_bits);
662         if (ret != 0) {
663                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
664                                  ctdb_addr_to_str(&vnn->public_address),
665                                  old->name, new_name));
666                 talloc_free(state);
667                 return -1;
668         }
669
670         return 0;
671 }
672
673 /*
674   Find the vnn of the node that has a public ip address
675   returns -1 if the address is not known as a public address
676  */
677 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
678 {
679         struct ctdb_vnn *vnn;
680
681         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
682                 if (ctdb_same_ip(&vnn->public_address, addr)) {
683                         return vnn;
684                 }
685         }
686
687         return NULL;
688 }
689
690 /*
691   take over an ip address
692  */
693 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
694                                  struct ctdb_req_control *c,
695                                  TDB_DATA indata,
696                                  bool *async_reply)
697 {
698         int ret;
699         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
700         struct ctdb_vnn *vnn;
701         bool have_ip = false;
702         bool do_updateip = false;
703         bool do_takeip = false;
704         struct ctdb_iface *best_iface = NULL;
705
706         if (pip->pnn != ctdb->pnn) {
707                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
708                                  "with pnn %d, but we're node %d\n",
709                                  ctdb_addr_to_str(&pip->addr),
710                                  pip->pnn, ctdb->pnn));
711                 return -1;
712         }
713
714         /* update out vnn list */
715         vnn = find_public_ip_vnn(ctdb, &pip->addr);
716         if (vnn == NULL) {
717                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
718                         ctdb_addr_to_str(&pip->addr)));
719                 return 0;
720         }
721
722         if (ctdb->do_checkpublicip) {
723                 have_ip = ctdb_sys_have_ip(&pip->addr);
724         }
725         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
726         if (best_iface == NULL) {
727                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
728                                  "a usable interface (old %s, have_ip %d)\n",
729                                  ctdb_addr_to_str(&vnn->public_address),
730                                  vnn->public_netmask_bits,
731                                  ctdb_vnn_iface_string(vnn),
732                                  have_ip));
733                 return -1;
734         }
735
736         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
737                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
738                 have_ip = false;
739         }
740
741
742         if (vnn->iface == NULL && have_ip) {
743                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
744                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
745                                  ctdb_addr_to_str(&vnn->public_address)));
746                 return 0;
747         }
748
749         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
750                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
751                                   "and we have it on iface[%s], but it was assigned to node %d"
752                                   "and we are node %d, banning ourself\n",
753                                  ctdb_addr_to_str(&vnn->public_address),
754                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
755                 ctdb_ban_self(ctdb);
756                 return -1;
757         }
758
759         if (vnn->pnn == -1 && have_ip) {
760                 vnn->pnn = ctdb->pnn;
761                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
762                                   "and we already have it on iface[%s], update local daemon\n",
763                                  ctdb_addr_to_str(&vnn->public_address),
764                                   ctdb_vnn_iface_string(vnn)));
765                 return 0;
766         }
767
768         if (vnn->iface) {
769                 if (vnn->iface != best_iface) {
770                         if (!vnn->iface->link_up) {
771                                 do_updateip = true;
772                         } else if (vnn->iface->references > (best_iface->references + 1)) {
773                                 /* only move when the rebalance gains something */
774                                         do_updateip = true;
775                         }
776                 }
777         }
778
779         if (!have_ip) {
780                 if (do_updateip) {
781                         ctdb_vnn_unassign_iface(ctdb, vnn);
782                         do_updateip = false;
783                 }
784                 do_takeip = true;
785         }
786
787         if (do_takeip) {
788                 ret = ctdb_do_takeip(ctdb, c, vnn);
789                 if (ret != 0) {
790                         return -1;
791                 }
792         } else if (do_updateip) {
793                 ret = ctdb_do_updateip(ctdb, c, vnn);
794                 if (ret != 0) {
795                         return -1;
796                 }
797         } else {
798                 /*
799                  * The interface is up and the kernel known the ip
800                  * => do nothing
801                  */
802                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits,
805                         ctdb_vnn_iface_string(vnn)));
806                 return 0;
807         }
808
809         /* tell ctdb_control.c that we will be replying asynchronously */
810         *async_reply = true;
811
812         return 0;
813 }
814
815 /*
816   takeover an ip address old v4 style
817  */
818 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
819                                 struct ctdb_req_control *c,
820                                 TDB_DATA indata, 
821                                 bool *async_reply)
822 {
823         TDB_DATA data;
824         
825         data.dsize = sizeof(struct ctdb_public_ip);
826         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
827         CTDB_NO_MEMORY(ctdb, data.dptr);
828         
829         memcpy(data.dptr, indata.dptr, indata.dsize);
830         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
831 }
832
833 /*
834   kill any clients that are registered with a IP that is being released
835  */
836 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
837 {
838         struct ctdb_client_ip *ip;
839
840         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
841                 ctdb_addr_to_str(addr)));
842
843         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
844                 ctdb_sock_addr tmp_addr;
845
846                 tmp_addr = ip->addr;
847                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
848                         ip->client_id,
849                         ctdb_addr_to_str(&ip->addr)));
850
851                 if (ctdb_same_ip(&tmp_addr, addr)) {
852                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
853                                                                      ip->client_id, 
854                                                                      struct ctdb_client);
855                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
856                                 ip->client_id,
857                                 ctdb_addr_to_str(&ip->addr),
858                                 client->pid));
859
860                         if (client->pid != 0) {
861                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
862                                         (unsigned)client->pid,
863                                         ctdb_addr_to_str(addr),
864                                         ip->client_id));
865                                 kill(client->pid, SIGKILL);
866                         }
867                 }
868         }
869 }
870
871 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
872 {
873         DLIST_REMOVE(ctdb->vnn, vnn);
874         ctdb_vnn_unassign_iface(ctdb, vnn);
875         ctdb_remove_orphaned_ifaces(ctdb, vnn);
876         talloc_free(vnn);
877 }
878
879 /*
880   called when releaseip event finishes
881  */
882 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
883                                 void *private_data)
884 {
885         struct takeover_callback_state *state = 
886                 talloc_get_type(private_data, struct takeover_callback_state);
887         TDB_DATA data;
888
889         if (status == -ETIME) {
890                 ctdb_ban_self(ctdb);
891         }
892
893         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
894                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
895                                   ctdb_addr_to_str(state->addr)));
896                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
897                 talloc_free(state);
898                 return;
899         }
900
901         /* send a message to all clients of this node telling them
902            that the cluster has been reconfigured and they should
903            release any sockets on this IP */
904         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
905         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
906         data.dsize = strlen((char *)data.dptr)+1;
907
908         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
909
910         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
911
912         /* kill clients that have registered with this IP */
913         release_kill_clients(ctdb, state->addr);
914
915         ctdb_vnn_unassign_iface(ctdb, state->vnn);
916
917         /* Process the IP if it has been marked for deletion */
918         if (state->vnn->delete_pending) {
919                 do_delete_ip(ctdb, state->vnn);
920                 state->vnn = NULL;
921         }
922
923         /* the control succeeded */
924         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
925         talloc_free(state);
926 }
927
928 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
929 {
930         if (state->vnn != NULL) {
931                 state->vnn->update_in_flight = false;
932         }
933         return 0;
934 }
935
936 /*
937   release an ip address
938  */
939 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
940                                 struct ctdb_req_control *c,
941                                 TDB_DATA indata, 
942                                 bool *async_reply)
943 {
944         int ret;
945         struct takeover_callback_state *state;
946         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
947         struct ctdb_vnn *vnn;
948         char *iface;
949
950         /* update our vnn list */
951         vnn = find_public_ip_vnn(ctdb, &pip->addr);
952         if (vnn == NULL) {
953                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
954                         ctdb_addr_to_str(&pip->addr)));
955                 return 0;
956         }
957         vnn->pnn = pip->pnn;
958
959         /* stop any previous arps */
960         talloc_free(vnn->takeover_ctx);
961         vnn->takeover_ctx = NULL;
962
963         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
964          * lazy multicast to drop an IP from any node that isn't the
965          * intended new node.  The following causes makes ctdbd ignore
966          * a release for any address it doesn't host.
967          */
968         if (ctdb->do_checkpublicip) {
969                 if (!ctdb_sys_have_ip(&pip->addr)) {
970                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
971                                 ctdb_addr_to_str(&pip->addr),
972                                 vnn->public_netmask_bits,
973                                 ctdb_vnn_iface_string(vnn)));
974                         ctdb_vnn_unassign_iface(ctdb, vnn);
975                         return 0;
976                 }
977         } else {
978                 if (vnn->iface == NULL) {
979                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
980                                            ctdb_addr_to_str(&pip->addr),
981                                            vnn->public_netmask_bits));
982                         return 0;
983                 }
984         }
985
986         /* There is a potential race between take_ip and us because we
987          * update the VNN via a callback that run when the
988          * eventscripts have been run.  Avoid the race by allowing one
989          * update to be in flight at a time.
990          */
991         if (vnn->update_in_flight) {
992                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
993                                     "update for this IP already in flight\n",
994                                     ctdb_addr_to_str(&vnn->public_address),
995                                     vnn->public_netmask_bits));
996                 return -1;
997         }
998
999         iface = strdup(ctdb_vnn_iface_string(vnn));
1000
1001         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1002                 ctdb_addr_to_str(&pip->addr),
1003                 vnn->public_netmask_bits,
1004                 iface,
1005                 pip->pnn));
1006
1007         state = talloc(ctdb, struct takeover_callback_state);
1008         if (state == NULL) {
1009                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1010                                __FILE__, __LINE__);
1011                 free(iface);
1012                 return -1;
1013         }
1014
1015         state->c = talloc_steal(state, c);
1016         state->addr = talloc(state, ctdb_sock_addr);       
1017         if (state->addr == NULL) {
1018                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1019                                __FILE__, __LINE__);
1020                 free(iface);
1021                 talloc_free(state);
1022                 return -1;
1023         }
1024         *state->addr = pip->addr;
1025         state->vnn   = vnn;
1026
1027         vnn->update_in_flight = true;
1028         talloc_set_destructor(state, ctdb_releaseip_destructor);
1029
1030         ret = ctdb_event_script_callback(ctdb, 
1031                                          state, release_ip_callback, state,
1032                                          CTDB_EVENT_RELEASE_IP,
1033                                          "%s %s %u",
1034                                          iface,
1035                                          ctdb_addr_to_str(&pip->addr),
1036                                          vnn->public_netmask_bits);
1037         free(iface);
1038         if (ret != 0) {
1039                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1040                         ctdb_addr_to_str(&pip->addr),
1041                         ctdb_vnn_iface_string(vnn)));
1042                 talloc_free(state);
1043                 return -1;
1044         }
1045
1046         /* tell the control that we will be reply asynchronously */
1047         *async_reply = true;
1048         return 0;
1049 }
1050
1051 /*
1052   release an ip address old v4 style
1053  */
1054 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1055                                 struct ctdb_req_control *c,
1056                                 TDB_DATA indata, 
1057                                 bool *async_reply)
1058 {
1059         TDB_DATA data;
1060         
1061         data.dsize = sizeof(struct ctdb_public_ip);
1062         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1063         CTDB_NO_MEMORY(ctdb, data.dptr);
1064         
1065         memcpy(data.dptr, indata.dptr, indata.dsize);
1066         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1067 }
1068
1069
1070 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1071                                    ctdb_sock_addr *addr,
1072                                    unsigned mask, const char *ifaces,
1073                                    bool check_address)
1074 {
1075         struct ctdb_vnn      *vnn;
1076         uint32_t num = 0;
1077         char *tmp;
1078         const char *iface;
1079         int i;
1080         int ret;
1081
1082         tmp = strdup(ifaces);
1083         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1084                 if (!ctdb_sys_check_iface_exists(iface)) {
1085                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1086                         free(tmp);
1087                         return -1;
1088                 }
1089         }
1090         free(tmp);
1091
1092         /* Verify that we dont have an entry for this ip yet */
1093         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1094                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1095                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1096                                 ctdb_addr_to_str(addr)));
1097                         return -1;
1098                 }               
1099         }
1100
1101         /* create a new vnn structure for this ip address */
1102         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1103         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1104         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1105         tmp = talloc_strdup(vnn, ifaces);
1106         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1107         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1108                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1109                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1110                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1111                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1112                 num++;
1113         }
1114         talloc_free(tmp);
1115         vnn->ifaces[num] = NULL;
1116         vnn->public_address      = *addr;
1117         vnn->public_netmask_bits = mask;
1118         vnn->pnn                 = -1;
1119         if (check_address) {
1120                 if (ctdb_sys_have_ip(addr)) {
1121                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1122                         vnn->pnn = ctdb->pnn;
1123                 }
1124         }
1125
1126         for (i=0; vnn->ifaces[i]; i++) {
1127                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1128                 if (ret != 0) {
1129                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1130                                            "for public_address[%s]\n",
1131                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1132                         talloc_free(vnn);
1133                         return -1;
1134                 }
1135         }
1136
1137         DLIST_ADD(ctdb->vnn, vnn);
1138
1139         return 0;
1140 }
1141
1142 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1143                                   struct timeval t, void *private_data)
1144 {
1145         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1146                                                         struct ctdb_context);
1147         struct ctdb_vnn *vnn;
1148
1149         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1150                 int i;
1151
1152                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1153                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1154                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1155                                         vnn->ifaces[i],
1156                                         ctdb_addr_to_str(&vnn->public_address)));
1157                         }
1158                 }
1159         }
1160
1161         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1162                 timeval_current_ofs(30, 0), 
1163                 ctdb_check_interfaces_event, ctdb);
1164 }
1165
1166
1167 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1168 {
1169         if (ctdb->check_public_ifaces_ctx != NULL) {
1170                 talloc_free(ctdb->check_public_ifaces_ctx);
1171                 ctdb->check_public_ifaces_ctx = NULL;
1172         }
1173
1174         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1175         if (ctdb->check_public_ifaces_ctx == NULL) {
1176                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1177         }
1178
1179         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1180                 timeval_current_ofs(30, 0), 
1181                 ctdb_check_interfaces_event, ctdb);
1182
1183         return 0;
1184 }
1185
1186
1187 /*
1188   setup the public address lists from a file
1189 */
1190 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1191 {
1192         char **lines;
1193         int nlines;
1194         int i;
1195
1196         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1197         if (lines == NULL) {
1198                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1199                 return -1;
1200         }
1201         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1202                 nlines--;
1203         }
1204
1205         for (i=0;i<nlines;i++) {
1206                 unsigned mask;
1207                 ctdb_sock_addr addr;
1208                 const char *addrstr;
1209                 const char *ifaces;
1210                 char *tok, *line;
1211
1212                 line = lines[i];
1213                 while ((*line == ' ') || (*line == '\t')) {
1214                         line++;
1215                 }
1216                 if (*line == '#') {
1217                         continue;
1218                 }
1219                 if (strcmp(line, "") == 0) {
1220                         continue;
1221                 }
1222                 tok = strtok(line, " \t");
1223                 addrstr = tok;
1224                 tok = strtok(NULL, " \t");
1225                 if (tok == NULL) {
1226                         if (NULL == ctdb->default_public_interface) {
1227                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1228                                          i+1));
1229                                 talloc_free(lines);
1230                                 return -1;
1231                         }
1232                         ifaces = ctdb->default_public_interface;
1233                 } else {
1234                         ifaces = tok;
1235                 }
1236
1237                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1238                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1239                         talloc_free(lines);
1240                         return -1;
1241                 }
1242                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1243                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1244                         talloc_free(lines);
1245                         return -1;
1246                 }
1247         }
1248
1249
1250         talloc_free(lines);
1251         return 0;
1252 }
1253
1254 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1255                               const char *iface,
1256                               const char *ip)
1257 {
1258         struct ctdb_vnn *svnn;
1259         struct ctdb_iface *cur = NULL;
1260         bool ok;
1261         int ret;
1262
1263         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1264         CTDB_NO_MEMORY(ctdb, svnn);
1265
1266         svnn->ifaces = talloc_array(svnn, const char *, 2);
1267         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1268         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1269         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1270         svnn->ifaces[1] = NULL;
1271
1272         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1273         if (!ok) {
1274                 talloc_free(svnn);
1275                 return -1;
1276         }
1277
1278         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1279         if (ret != 0) {
1280                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1281                                    "for single_ip[%s]\n",
1282                                    svnn->ifaces[0],
1283                                    ctdb_addr_to_str(&svnn->public_address)));
1284                 talloc_free(svnn);
1285                 return -1;
1286         }
1287
1288         /* assume the single public ip interface is initially "good" */
1289         cur = ctdb_find_iface(ctdb, iface);
1290         if (cur == NULL) {
1291                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1292                 return -1;
1293         }
1294         cur->link_up = true;
1295
1296         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1297         if (ret != 0) {
1298                 talloc_free(svnn);
1299                 return -1;
1300         }
1301
1302         ctdb->single_ip_vnn = svnn;
1303         return 0;
1304 }
1305
1306 struct ctdb_public_ip_list {
1307         struct ctdb_public_ip_list *next;
1308         uint32_t pnn;
1309         ctdb_sock_addr addr;
1310 };
1311
1312 /* Given a physical node, return the number of
1313    public addresses that is currently assigned to this node.
1314 */
1315 static int node_ip_coverage(struct ctdb_context *ctdb, 
1316         int32_t pnn,
1317         struct ctdb_public_ip_list *ips)
1318 {
1319         int num=0;
1320
1321         for (;ips;ips=ips->next) {
1322                 if (ips->pnn == pnn) {
1323                         num++;
1324                 }
1325         }
1326         return num;
1327 }
1328
1329
1330 /* Can the given node host the given IP: is the public IP known to the
1331  * node and is NOIPHOST unset?
1332 */
1333 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1334                              struct ctdb_ipflags ipflags,
1335                              struct ctdb_public_ip_list *ip)
1336 {
1337         struct ctdb_all_public_ips *public_ips;
1338         int i;
1339
1340         if (ipflags.noiphost) {
1341                 return false;
1342         }
1343
1344         public_ips = ctdb->nodes[pnn]->available_public_ips;
1345
1346         if (public_ips == NULL) {
1347                 return false;
1348         }
1349
1350         for (i=0; i<public_ips->num; i++) {
1351                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1352                         /* yes, this node can serve this public ip */
1353                         return true;
1354                 }
1355         }
1356
1357         return false;
1358 }
1359
1360 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1361                                  struct ctdb_ipflags ipflags,
1362                                  struct ctdb_public_ip_list *ip)
1363 {
1364         if (ipflags.noiptakeover) {
1365                 return false;
1366         }
1367
1368         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1369 }
1370
1371 /* search the node lists list for a node to takeover this ip.
1372    pick the node that currently are serving the least number of ips
1373    so that the ips get spread out evenly.
1374 */
1375 static int find_takeover_node(struct ctdb_context *ctdb, 
1376                 struct ctdb_ipflags *ipflags,
1377                 struct ctdb_public_ip_list *ip,
1378                 struct ctdb_public_ip_list *all_ips)
1379 {
1380         int pnn, min=0, num;
1381         int i, numnodes;
1382
1383         numnodes = talloc_array_length(ipflags);
1384         pnn    = -1;
1385         for (i=0; i<numnodes; i++) {
1386                 /* verify that this node can serve this ip */
1387                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1388                         /* no it couldnt   so skip to the next node */
1389                         continue;
1390                 }
1391
1392                 num = node_ip_coverage(ctdb, i, all_ips);
1393                 /* was this the first node we checked ? */
1394                 if (pnn == -1) {
1395                         pnn = i;
1396                         min  = num;
1397                 } else {
1398                         if (num < min) {
1399                                 pnn = i;
1400                                 min  = num;
1401                         }
1402                 }
1403         }       
1404         if (pnn == -1) {
1405                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1406                         ctdb_addr_to_str(&ip->addr)));
1407
1408                 return -1;
1409         }
1410
1411         ip->pnn = pnn;
1412         return 0;
1413 }
1414
1415 #define IP_KEYLEN       4
1416 static uint32_t *ip_key(ctdb_sock_addr *ip)
1417 {
1418         static uint32_t key[IP_KEYLEN];
1419
1420         bzero(key, sizeof(key));
1421
1422         switch (ip->sa.sa_family) {
1423         case AF_INET:
1424                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1425                 break;
1426         case AF_INET6: {
1427                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1428                 key[0]  = htonl(s6_a32[0]);
1429                 key[1]  = htonl(s6_a32[1]);
1430                 key[2]  = htonl(s6_a32[2]);
1431                 key[3]  = htonl(s6_a32[3]);
1432                 break;
1433         }
1434         default:
1435                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1436                 return key;
1437         }
1438
1439         return key;
1440 }
1441
1442 static void *add_ip_callback(void *parm, void *data)
1443 {
1444         struct ctdb_public_ip_list *this_ip = parm; 
1445         struct ctdb_public_ip_list *prev_ip = data; 
1446
1447         if (prev_ip == NULL) {
1448                 return parm;
1449         }
1450         if (this_ip->pnn == -1) {
1451                 this_ip->pnn = prev_ip->pnn;
1452         }
1453
1454         return parm;
1455 }
1456
1457 static int getips_count_callback(void *param, void *data)
1458 {
1459         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1460         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1461
1462         new_ip->next = *ip_list;
1463         *ip_list     = new_ip;
1464         return 0;
1465 }
1466
1467 static struct ctdb_public_ip_list *
1468 create_merged_ip_list(struct ctdb_context *ctdb)
1469 {
1470         int i, j;
1471         struct ctdb_public_ip_list *ip_list;
1472         struct ctdb_all_public_ips *public_ips;
1473
1474         if (ctdb->ip_tree != NULL) {
1475                 talloc_free(ctdb->ip_tree);
1476                 ctdb->ip_tree = NULL;
1477         }
1478         ctdb->ip_tree = trbt_create(ctdb, 0);
1479
1480         for (i=0;i<ctdb->num_nodes;i++) {
1481                 public_ips = ctdb->nodes[i]->known_public_ips;
1482
1483                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1484                         continue;
1485                 }
1486
1487                 /* there were no public ips for this node */
1488                 if (public_ips == NULL) {
1489                         continue;
1490                 }               
1491
1492                 for (j=0;j<public_ips->num;j++) {
1493                         struct ctdb_public_ip_list *tmp_ip; 
1494
1495                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1496                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1497                         /* Do not use information about IP addresses hosted
1498                          * on other nodes, it may not be accurate */
1499                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1500                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1501                         } else {
1502                                 tmp_ip->pnn = -1;
1503                         }
1504                         tmp_ip->addr = public_ips->ips[j].addr;
1505                         tmp_ip->next = NULL;
1506
1507                         trbt_insertarray32_callback(ctdb->ip_tree,
1508                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1509                                 add_ip_callback,
1510                                 tmp_ip);
1511                 }
1512         }
1513
1514         ip_list = NULL;
1515         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1516
1517         return ip_list;
1518 }
1519
1520 /* 
1521  * This is the length of the longtest common prefix between the IPs.
1522  * It is calculated by XOR-ing the 2 IPs together and counting the
1523  * number of leading zeroes.  The implementation means that all
1524  * addresses end up being 128 bits long.
1525  *
1526  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1527  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1528  * lots of nodes and IP addresses?
1529  */
1530 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1531 {
1532         uint32_t ip1_k[IP_KEYLEN];
1533         uint32_t *t;
1534         int i;
1535         uint32_t x;
1536
1537         uint32_t distance = 0;
1538
1539         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1540         t = ip_key(ip2);
1541         for (i=0; i<IP_KEYLEN; i++) {
1542                 x = ip1_k[i] ^ t[i];
1543                 if (x == 0) {
1544                         distance += 32;
1545                 } else {
1546                         /* Count number of leading zeroes. 
1547                          * FIXME? This could be optimised...
1548                          */
1549                         while ((x & (1 << 31)) == 0) {
1550                                 x <<= 1;
1551                                 distance += 1;
1552                         }
1553                 }
1554         }
1555
1556         return distance;
1557 }
1558
1559 /* Calculate the IP distance for the given IP relative to IPs on the
1560    given node.  The ips argument is generally the all_ips variable
1561    used in the main part of the algorithm.
1562  */
1563 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1564                                   struct ctdb_public_ip_list *ips,
1565                                   int pnn)
1566 {
1567         struct ctdb_public_ip_list *t;
1568         uint32_t d;
1569
1570         uint32_t sum = 0;
1571
1572         for (t=ips; t != NULL; t=t->next) {
1573                 if (t->pnn != pnn) {
1574                         continue;
1575                 }
1576
1577                 /* Optimisation: We never calculate the distance
1578                  * between an address and itself.  This allows us to
1579                  * calculate the effect of removing an address from a
1580                  * node by simply calculating the distance between
1581                  * that address and all of the exitsing addresses.
1582                  * Moreover, we assume that we're only ever dealing
1583                  * with addresses from all_ips so we can identify an
1584                  * address via a pointer rather than doing a more
1585                  * expensive address comparison. */
1586                 if (&(t->addr) == ip) {
1587                         continue;
1588                 }
1589
1590                 d = ip_distance(ip, &(t->addr));
1591                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1592         }
1593
1594         return sum;
1595 }
1596
1597 /* Return the LCP2 imbalance metric for addresses currently assigned
1598    to the given node.
1599  */
1600 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1601 {
1602         struct ctdb_public_ip_list *t;
1603
1604         uint32_t imbalance = 0;
1605
1606         for (t=all_ips; t!=NULL; t=t->next) {
1607                 if (t->pnn != pnn) {
1608                         continue;
1609                 }
1610                 /* Pass the rest of the IPs rather than the whole
1611                    all_ips input list.
1612                 */
1613                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1614         }
1615
1616         return imbalance;
1617 }
1618
1619 /* Allocate any unassigned IPs just by looping through the IPs and
1620  * finding the best node for each.
1621  */
1622 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1623                                       struct ctdb_ipflags *ipflags,
1624                                       struct ctdb_public_ip_list *all_ips)
1625 {
1626         struct ctdb_public_ip_list *tmp_ip;
1627
1628         /* loop over all ip's and find a physical node to cover for 
1629            each unassigned ip.
1630         */
1631         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1632                 if (tmp_ip->pnn == -1) {
1633                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1634                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1635                                         ctdb_addr_to_str(&tmp_ip->addr)));
1636                         }
1637                 }
1638         }
1639 }
1640
1641 /* Basic non-deterministic rebalancing algorithm.
1642  */
1643 static void basic_failback(struct ctdb_context *ctdb,
1644                            struct ctdb_ipflags *ipflags,
1645                            struct ctdb_public_ip_list *all_ips,
1646                            int num_ips)
1647 {
1648         int i, numnodes;
1649         int maxnode, maxnum, minnode, minnum, num, retries;
1650         struct ctdb_public_ip_list *tmp_ip;
1651
1652         numnodes = talloc_array_length(ipflags);
1653         retries = 0;
1654
1655 try_again:
1656         maxnum=0;
1657         minnum=0;
1658
1659         /* for each ip address, loop over all nodes that can serve
1660            this ip and make sure that the difference between the node
1661            serving the most and the node serving the least ip's are
1662            not greater than 1.
1663         */
1664         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1665                 if (tmp_ip->pnn == -1) {
1666                         continue;
1667                 }
1668
1669                 /* Get the highest and lowest number of ips's served by any 
1670                    valid node which can serve this ip.
1671                 */
1672                 maxnode = -1;
1673                 minnode = -1;
1674                 for (i=0; i<numnodes; i++) {
1675                         /* only check nodes that can actually serve this ip */
1676                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1677                                 /* no it couldnt   so skip to the next node */
1678                                 continue;
1679                         }
1680
1681                         num = node_ip_coverage(ctdb, i, all_ips);
1682                         if (maxnode == -1) {
1683                                 maxnode = i;
1684                                 maxnum  = num;
1685                         } else {
1686                                 if (num > maxnum) {
1687                                         maxnode = i;
1688                                         maxnum  = num;
1689                                 }
1690                         }
1691                         if (minnode == -1) {
1692                                 minnode = i;
1693                                 minnum  = num;
1694                         } else {
1695                                 if (num < minnum) {
1696                                         minnode = i;
1697                                         minnum  = num;
1698                                 }
1699                         }
1700                 }
1701                 if (maxnode == -1) {
1702                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1703                                 ctdb_addr_to_str(&tmp_ip->addr)));
1704
1705                         continue;
1706                 }
1707
1708                 /* if the spread between the smallest and largest coverage by
1709                    a node is >=2 we steal one of the ips from the node with
1710                    most coverage to even things out a bit.
1711                    try to do this a limited number of times since we dont
1712                    want to spend too much time balancing the ip coverage.
1713                 */
1714                 if ( (maxnum > minnum+1)
1715                      && (retries < (num_ips + 5)) ){
1716                         struct ctdb_public_ip_list *tmp;
1717
1718                         /* Reassign one of maxnode's VNNs */
1719                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1720                                 if (tmp->pnn == maxnode) {
1721                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1722                                         retries++;
1723                                         goto try_again;;
1724                                 }
1725                         }
1726                 }
1727         }
1728 }
1729
1730 static void lcp2_init(struct ctdb_context *tmp_ctx,
1731                       struct ctdb_ipflags *ipflags,
1732                       struct ctdb_public_ip_list *all_ips,
1733                       uint32_t *force_rebalance_nodes,
1734                       uint32_t **lcp2_imbalances,
1735                       bool **rebalance_candidates)
1736 {
1737         int i, numnodes;
1738         struct ctdb_public_ip_list *tmp_ip;
1739
1740         numnodes = talloc_array_length(ipflags);
1741
1742         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1743         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1744         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1745         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1746
1747         for (i=0; i<numnodes; i++) {
1748                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1749                 /* First step: assume all nodes are candidates */
1750                 (*rebalance_candidates)[i] = true;
1751         }
1752
1753         /* 2nd step: if a node has IPs assigned then it must have been
1754          * healthy before, so we remove it from consideration.  This
1755          * is overkill but is all we have because we don't maintain
1756          * state between takeover runs.  An alternative would be to
1757          * keep state and invalidate it every time the recovery master
1758          * changes.
1759          */
1760         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1761                 if (tmp_ip->pnn != -1) {
1762                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1763                 }
1764         }
1765
1766         /* 3rd step: if a node is forced to re-balance then
1767            we allow failback onto the node */
1768         if (force_rebalance_nodes == NULL) {
1769                 return;
1770         }
1771         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1772                 uint32_t pnn = force_rebalance_nodes[i];
1773                 if (pnn >= numnodes) {
1774                         DEBUG(DEBUG_ERR,
1775                               (__location__ "unknown node %u\n", pnn));
1776                         continue;
1777                 }
1778
1779                 DEBUG(DEBUG_NOTICE,
1780                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1781                 (*rebalance_candidates)[pnn] = true;
1782         }
1783 }
1784
1785 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1786  * the IP/node combination that will cost the least.
1787  */
1788 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1789                                      struct ctdb_ipflags *ipflags,
1790                                      struct ctdb_public_ip_list *all_ips,
1791                                      uint32_t *lcp2_imbalances)
1792 {
1793         struct ctdb_public_ip_list *tmp_ip;
1794         int dstnode, numnodes;
1795
1796         int minnode;
1797         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1798         struct ctdb_public_ip_list *minip;
1799
1800         bool should_loop = true;
1801         bool have_unassigned = true;
1802
1803         numnodes = talloc_array_length(ipflags);
1804
1805         while (have_unassigned && should_loop) {
1806                 should_loop = false;
1807
1808                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1809                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1810
1811                 minnode = -1;
1812                 mindsum = 0;
1813                 minip = NULL;
1814
1815                 /* loop over each unassigned ip. */
1816                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1817                         if (tmp_ip->pnn != -1) {
1818                                 continue;
1819                         }
1820
1821                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1822                                 /* only check nodes that can actually takeover this ip */
1823                                 if (!can_node_takeover_ip(ctdb, dstnode,
1824                                                           ipflags[dstnode],
1825                                                           tmp_ip)) {
1826                                         /* no it couldnt   so skip to the next node */
1827                                         continue;
1828                                 }
1829
1830                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1831                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1832                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1833                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1834                                                    dstnode,
1835                                                    dstimbl - lcp2_imbalances[dstnode]));
1836
1837
1838                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1839                                         minnode = dstnode;
1840                                         minimbl = dstimbl;
1841                                         mindsum = dstdsum;
1842                                         minip = tmp_ip;
1843                                         should_loop = true;
1844                                 }
1845                         }
1846                 }
1847
1848                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1849
1850                 /* If we found one then assign it to the given node. */
1851                 if (minnode != -1) {
1852                         minip->pnn = minnode;
1853                         lcp2_imbalances[minnode] = minimbl;
1854                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1855                                           ctdb_addr_to_str(&(minip->addr)),
1856                                           minnode,
1857                                           mindsum));
1858                 }
1859
1860                 /* There might be a better way but at least this is clear. */
1861                 have_unassigned = false;
1862                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1863                         if (tmp_ip->pnn == -1) {
1864                                 have_unassigned = true;
1865                         }
1866                 }
1867         }
1868
1869         /* We know if we have an unassigned addresses so we might as
1870          * well optimise.
1871          */
1872         if (have_unassigned) {
1873                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1874                         if (tmp_ip->pnn == -1) {
1875                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1876                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1877                         }
1878                 }
1879         }
1880 }
1881
1882 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1883  * to move IPs from, determines the best IP/destination node
1884  * combination to move from the source node.
1885  */
1886 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1887                                     struct ctdb_ipflags *ipflags,
1888                                     struct ctdb_public_ip_list *all_ips,
1889                                     int srcnode,
1890                                     uint32_t *lcp2_imbalances,
1891                                     bool *rebalance_candidates)
1892 {
1893         int dstnode, mindstnode, numnodes;
1894         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1895         uint32_t minsrcimbl, mindstimbl;
1896         struct ctdb_public_ip_list *minip;
1897         struct ctdb_public_ip_list *tmp_ip;
1898
1899         /* Find an IP and destination node that best reduces imbalance. */
1900         srcimbl = 0;
1901         minip = NULL;
1902         minsrcimbl = 0;
1903         mindstnode = -1;
1904         mindstimbl = 0;
1905
1906         numnodes = talloc_array_length(ipflags);
1907
1908         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1909         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1910                            srcnode, lcp2_imbalances[srcnode]));
1911
1912         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1913                 /* Only consider addresses on srcnode. */
1914                 if (tmp_ip->pnn != srcnode) {
1915                         continue;
1916                 }
1917
1918                 /* What is this IP address costing the source node? */
1919                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1920                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1921
1922                 /* Consider this IP address would cost each potential
1923                  * destination node.  Destination nodes are limited to
1924                  * those that are newly healthy, since we don't want
1925                  * to do gratuitous failover of IPs just to make minor
1926                  * balance improvements.
1927                  */
1928                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1929                         if (!rebalance_candidates[dstnode]) {
1930                                 continue;
1931                         }
1932
1933                         /* only check nodes that can actually takeover this ip */
1934                         if (!can_node_takeover_ip(ctdb, dstnode,
1935                                                   ipflags[dstnode], tmp_ip)) {
1936                                 /* no it couldnt   so skip to the next node */
1937                                 continue;
1938                         }
1939
1940                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1941                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1942                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1943                                            srcnode, -srcdsum,
1944                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1945                                            dstnode, dstdsum));
1946
1947                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1948                             (dstdsum < srcdsum) &&                      \
1949                             ((mindstnode == -1) ||                              \
1950                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1951
1952                                 minip = tmp_ip;
1953                                 minsrcimbl = srcimbl;
1954                                 mindstnode = dstnode;
1955                                 mindstimbl = dstimbl;
1956                         }
1957                 }
1958         }
1959         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1960
1961         if (mindstnode != -1) {
1962                 /* We found a move that makes things better... */
1963                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1964                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1965                                   ctdb_addr_to_str(&(minip->addr)),
1966                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1967
1968
1969                 lcp2_imbalances[srcnode] = minsrcimbl;
1970                 lcp2_imbalances[mindstnode] = mindstimbl;
1971                 minip->pnn = mindstnode;
1972
1973                 return true;
1974         }
1975
1976         return false;
1977         
1978 }
1979
/* A node number paired with its LCP2 imbalance, so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.
 *
 * Orders entries by descending imbalance.  Entries with equal
 * imbalance are ordered by ascending pnn: qsort() does not specify the
 * relative order of elements that compare equal, so without this
 * tie-break the result could differ between C libraries.
 *
 * Returns <0, 0 or >0 as a sorts before, equal to, or after b.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
	const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;

	/* Primary key: larger imbalance first. */
	if (lipa->imbalance > lipb->imbalance) {
		return -1;
	}
	if (lipa->imbalance < lipb->imbalance) {
		return 1;
	}

	/* Secondary key: smaller node number first. */
	if (lipa->pnn < lipb->pnn) {
		return -1;
	}
	if (lipa->pnn > lipb->pnn) {
		return 1;
	}

	return 0;
}
1998
1999 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
2000  * node with the highest LCP2 imbalance, and then determines the best
2001  * IP/destination node combination to move from the source node.
2002  */
2003 static void lcp2_failback(struct ctdb_context *ctdb,
2004                           struct ctdb_ipflags *ipflags,
2005                           struct ctdb_public_ip_list *all_ips,
2006                           uint32_t *lcp2_imbalances,
2007                           bool *rebalance_candidates)
2008 {
2009         int i, numnodes;
2010         struct lcp2_imbalance_pnn * lips;
2011         bool again;
2012
2013         numnodes = talloc_array_length(ipflags);
2014
2015 try_again:
2016         /* Put the imbalances and nodes into an array, sort them and
2017          * iterate through candidates.  Usually the 1st one will be
2018          * used, so this doesn't cost much...
2019          */
2020         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2021         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2022         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2023         for (i=0; i<numnodes; i++) {
2024                 lips[i].imbalance = lcp2_imbalances[i];
2025                 lips[i].pnn = i;
2026                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2027         }
2028         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2029               lcp2_cmp_imbalance_pnn);
2030
2031         again = false;
2032         for (i=0; i<numnodes; i++) {
2033                 /* This means that all nodes had 0 or 1 addresses, so
2034                  * can't be imbalanced.
2035                  */
2036                 if (lips[i].imbalance == 0) {
2037                         break;
2038                 }
2039
2040                 if (lcp2_failback_candidate(ctdb,
2041                                             ipflags,
2042                                             all_ips,
2043                                             lips[i].pnn,
2044                                             lcp2_imbalances,
2045                                             rebalance_candidates)) {
2046                         again = true;
2047                         break;
2048                 }
2049         }
2050
2051         talloc_free(lips);
2052         if (again) {
2053                 goto try_again;
2054         }
2055 }
2056
2057 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2058                                     struct ctdb_ipflags *ipflags,
2059                                     struct ctdb_public_ip_list *all_ips)
2060 {
2061         struct ctdb_public_ip_list *tmp_ip;
2062
2063         /* verify that the assigned nodes can serve that public ip
2064            and set it to -1 if not
2065         */
2066         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2067                 if (tmp_ip->pnn == -1) {
2068                         continue;
2069                 }
2070                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2071                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2072                         /* this node can not serve this ip. */
2073                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2074                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2075                                            tmp_ip->pnn));
2076                         tmp_ip->pnn = -1;
2077                 }
2078         }
2079 }
2080
2081 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2082                                        struct ctdb_ipflags *ipflags,
2083                                        struct ctdb_public_ip_list *all_ips)
2084 {
2085         struct ctdb_public_ip_list *tmp_ip;
2086         int i, numnodes;
2087
2088         numnodes = talloc_array_length(ipflags);
2089
2090         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2091        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2092         *  always be allocated the same way for a specific set of
2093         *  available/unavailable nodes.
2094         */
2095
2096         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2097                 tmp_ip->pnn = i % numnodes;
2098         }
2099
2100         /* IP failback doesn't make sense with deterministic
2101          * IPs, since the modulo step above implicitly fails
2102          * back IPs to their "home" node.
2103          */
2104         if (1 == ctdb->tunable.no_ip_failback) {
2105                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2106         }
2107
2108         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2109
2110         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2111
2112         /* No failback here! */
2113 }
2114
2115 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2116                                           struct ctdb_ipflags *ipflags,
2117                                           struct ctdb_public_ip_list *all_ips)
2118 {
2119         /* This should be pushed down into basic_failback. */
2120         struct ctdb_public_ip_list *tmp_ip;
2121         int num_ips = 0;
2122         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2123                 num_ips++;
2124         }
2125
2126         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2127
2128         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2129
2130         /* If we don't want IPs to fail back then don't rebalance IPs. */
2131         if (1 == ctdb->tunable.no_ip_failback) {
2132                 return;
2133         }
2134
2135         /* Now, try to make sure the ip adresses are evenly distributed
2136            across the nodes.
2137         */
2138         basic_failback(ctdb, ipflags, all_ips, num_ips);
2139 }
2140
2141 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2142                           struct ctdb_ipflags *ipflags,
2143                           struct ctdb_public_ip_list *all_ips,
2144                           uint32_t *force_rebalance_nodes)
2145 {
2146         uint32_t *lcp2_imbalances;
2147         bool *rebalance_candidates;
2148         int numnodes, num_rebalance_candidates, i;
2149
2150         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2151
2152         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2153
2154         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2155                   &lcp2_imbalances, &rebalance_candidates);
2156
2157         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2158
2159         /* If we don't want IPs to fail back then don't rebalance IPs. */
2160         if (1 == ctdb->tunable.no_ip_failback) {
2161                 goto finished;
2162         }
2163
2164         /* It is only worth continuing if we have suitable target
2165          * nodes to transfer IPs to.  This check is much cheaper than
2166          * continuing on...
2167          */
2168         numnodes = talloc_array_length(ipflags);
2169         num_rebalance_candidates = 0;
2170         for (i=0; i<numnodes; i++) {
2171                 if (rebalance_candidates[i]) {
2172                         num_rebalance_candidates++;
2173                 }
2174         }
2175         if (num_rebalance_candidates == 0) {
2176                 goto finished;
2177         }
2178
2179         /* Now, try to make sure the ip adresses are evenly distributed
2180            across the nodes.
2181         */
2182         lcp2_failback(ctdb, ipflags, all_ips,
2183                       lcp2_imbalances, rebalance_candidates);
2184
2185 finished:
2186         talloc_free(tmp_ctx);
2187 }
2188
2189 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2190 {
2191         int i;
2192
2193         for (i=0;i<nodemap->num;i++) {
2194                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2195                         /* Found one completely healthy node */
2196                         return false;
2197                 }
2198         }
2199
2200         return true;
2201 }
2202
2203 /* The calculation part of the IP allocation algorithm. */
2204 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2205                                    struct ctdb_ipflags *ipflags,
2206                                    struct ctdb_public_ip_list **all_ips_p,
2207                                    uint32_t *force_rebalance_nodes)
2208 {
2209         /* since nodes only know about those public addresses that
2210            can be served by that particular node, no single node has
2211            a full list of all public addresses that exist in the cluster.
2212            Walk over all node structures and create a merged list of
2213            all public addresses that exist in the cluster.
2214
2215            keep the tree of ips around as ctdb->ip_tree
2216         */
2217         *all_ips_p = create_merged_ip_list(ctdb);
2218
2219         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2220                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2221         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2222                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2223         } else {
2224                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2225         }
2226
2227         /* at this point ->pnn is the node which will own each IP
2228            or -1 if there is no node that can cover this ip
2229         */
2230
2231         return;
2232 }
2233
/* State shared between get_tunable_from_nodes() and its reply/failure
 * callbacks.
 */
struct get_tunable_callback_data {
	/* Name of the tunable being fetched; used in log messages. */
	const char *tunable;
	/* Result array, indexed by the pnn of the replying node. */
	uint32_t *out;
	/* Set by a callback when the results must be discarded. */
	bool fatal;
};
2239
2240 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2241                                  int32_t res, TDB_DATA outdata,
2242                                  void *callback)
2243 {
2244         struct get_tunable_callback_data *cd =
2245                 (struct get_tunable_callback_data *)callback;
2246         int size;
2247
2248         if (res != 0) {
2249                 /* Already handled in fail callback */
2250                 return;
2251         }
2252
2253         if (outdata.dsize != sizeof(uint32_t)) {
2254                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2255                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2256                                  (int)outdata.dsize));
2257                 cd->fatal = true;
2258                 return;
2259         }
2260
2261         size = talloc_array_length(cd->out);
2262         if (pnn >= size) {
2263                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2264                                  cd->tunable, pnn, size));
2265                 return;
2266         }
2267
2268                 
2269         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2270 }
2271
2272 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2273                                        int32_t res, TDB_DATA outdata,
2274                                        void *callback)
2275 {
2276         struct get_tunable_callback_data *cd =
2277                 (struct get_tunable_callback_data *)callback;
2278
2279         switch (res) {
2280         case -ETIME:
2281                 DEBUG(DEBUG_ERR,
2282                       ("Timed out getting tunable \"%s\" from node %d\n",
2283                        cd->tunable, pnn));
2284                 cd->fatal = true;
2285                 break;
2286         case -EINVAL:
2287         case -1:
2288                 DEBUG(DEBUG_WARNING,
2289                       ("Tunable \"%s\" not implemented on node %d\n",
2290                        cd->tunable, pnn));
2291                 break;
2292         default:
2293                 DEBUG(DEBUG_ERR,
2294                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2295                        cd->tunable, pnn));
2296                 cd->fatal = true;
2297         }
2298 }
2299
2300 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2301                                         TALLOC_CTX *tmp_ctx,
2302                                         struct ctdb_node_map *nodemap,
2303                                         const char *tunable,
2304                                         uint32_t default_value)
2305 {
2306         TDB_DATA data;
2307         struct ctdb_control_get_tunable *t;
2308         uint32_t *nodes;
2309         uint32_t *tvals;
2310         struct get_tunable_callback_data callback_data;
2311         int i;
2312
2313         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2314         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2315         for (i=0; i<nodemap->num; i++) {
2316                 tvals[i] = default_value;
2317         }
2318                 
2319         callback_data.out = tvals;
2320         callback_data.tunable = tunable;
2321         callback_data.fatal = false;
2322
2323         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2324         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2325         t = (struct ctdb_control_get_tunable *)data.dptr;
2326         t->length = strlen(tunable)+1;
2327         memcpy(t->name, tunable, t->length);
2328         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2329         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2330                                       nodes, 0, TAKEOVER_TIMEOUT(),
2331                                       false, data,
2332                                       get_tunable_callback,
2333                                       get_tunable_fail_callback,
2334                                       &callback_data) != 0) {
2335                 if (callback_data.fatal) {
2336                         talloc_free(tvals);
2337                         tvals = NULL;
2338                 }
2339         }
2340         talloc_free(nodes);
2341         talloc_free(data.dptr);
2342
2343         return tvals;
2344 }
2345
/* State handed to the runstate reply/failure callbacks. */
struct get_runstate_callback_data {
	/* Result array, indexed by the pnn of the replying node. */
	enum ctdb_runstate *out;
	/* Set by a callback on a malformed reply or a timeout. */
	bool fatal;
};
2350
2351 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2352                                   int32_t res, TDB_DATA outdata,
2353                                   void *callback_data)
2354 {
2355         struct get_runstate_callback_data *cd =
2356                 (struct get_runstate_callback_data *)callback_data;
2357         int size;
2358
2359         if (res != 0) {
2360                 /* Already handled in fail callback */
2361                 return;
2362         }
2363
2364         if (outdata.dsize != sizeof(uint32_t)) {
2365                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2366                                  pnn, (int)sizeof(uint32_t),
2367                                  (int)outdata.dsize));
2368                 cd->fatal = true;
2369                 return;
2370         }
2371
2372         size = talloc_array_length(cd->out);
2373         if (pnn >= size) {
2374                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2375                                  pnn, size));
2376                 return;
2377         }
2378
2379         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2380 }
2381
2382 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2383                                        int32_t res, TDB_DATA outdata,
2384                                        void *callback)
2385 {
2386         struct get_runstate_callback_data *cd =
2387                 (struct get_runstate_callback_data *)callback;
2388
2389         switch (res) {
2390         case -ETIME:
2391                 DEBUG(DEBUG_ERR,
2392                       ("Timed out getting runstate from node %d\n", pnn));
2393                 cd->fatal = true;
2394                 break;
2395         default:
2396                 DEBUG(DEBUG_WARNING,
2397                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2398                        pnn));
2399         }
2400 }
2401
2402 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2403                                                     TALLOC_CTX *tmp_ctx,
2404                                                     struct ctdb_node_map *nodemap,
2405                                                     enum ctdb_runstate default_value)
2406 {
2407         uint32_t *nodes;
2408         enum ctdb_runstate *rs;
2409         struct get_runstate_callback_data callback_data;
2410         int i;
2411
2412         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2413         CTDB_NO_MEMORY_NULL(ctdb, rs);
2414         for (i=0; i<nodemap->num; i++) {
2415                 rs[i] = default_value;
2416         }
2417
2418         callback_data.out = rs;
2419         callback_data.fatal = false;
2420
2421         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2422         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2423                                       nodes, 0, TAKEOVER_TIMEOUT(),
2424                                       true, tdb_null,
2425                                       get_runstate_callback,
2426                                       get_runstate_fail_callback,
2427                                       &callback_data) != 0) {
2428                 if (callback_data.fatal) {
2429                         free(rs);
2430                         rs = NULL;
2431                 }
2432         }
2433         talloc_free(nodes);
2434
2435         return rs;
2436 }
2437
2438 /* Set internal flags for IP allocation:
2439  *   Clear ip flags
2440  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2441  *   Set NOIPHOST ip flag for each INACTIVE node
2442  *   if all nodes are disabled:
2443  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2444  *   else
2445  *     Set NOIPHOST ip flags for disabled nodes
2446  */
2447 static struct ctdb_ipflags *
2448 set_ipflags_internal(struct ctdb_context *ctdb,
2449                      TALLOC_CTX *tmp_ctx,
2450                      struct ctdb_node_map *nodemap,
2451                      uint32_t *tval_noiptakeover,
2452                      uint32_t *tval_noiphostonalldisabled,
2453                      enum ctdb_runstate *runstate)
2454 {
2455         int i;
2456         struct ctdb_ipflags *ipflags;
2457
2458         /* Clear IP flags - implicit due to talloc_zero */
2459         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2460         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2461
2462         for (i=0;i<nodemap->num;i++) {
2463                 /* Can not take IPs on node with NoIPTakeover set */
2464                 if (tval_noiptakeover[i] != 0) {
2465                         ipflags[i].noiptakeover = true;
2466                 }
2467
2468                 /* Can not host IPs on node not in RUNNING state */
2469                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2470                         ipflags[i].noiphost = true;
2471                         continue;
2472                 }
2473                 /* Can not host IPs on INACTIVE node */
2474                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2475                         ipflags[i].noiphost = true;
2476                 }
2477         }
2478
2479         if (all_nodes_are_disabled(nodemap)) {
2480                 /* If all nodes are disabled, can not host IPs on node
2481                  * with NoIPHostOnAllDisabled set
2482                  */
2483                 for (i=0;i<nodemap->num;i++) {
2484                         if (tval_noiphostonalldisabled[i] != 0) {
2485                                 ipflags[i].noiphost = true;
2486                         }
2487                 }
2488         } else {
2489                 /* If some nodes are not disabled, then can not host
2490                  * IPs on DISABLED node
2491                  */
2492                 for (i=0;i<nodemap->num;i++) {
2493                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2494                                 ipflags[i].noiphost = true;
2495                         }
2496                 }
2497         }
2498
2499         return ipflags;
2500 }
2501
2502 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2503                                         TALLOC_CTX *tmp_ctx,
2504                                         struct ctdb_node_map *nodemap)
2505 {
2506         uint32_t *tval_noiptakeover;
2507         uint32_t *tval_noiphostonalldisabled;
2508         struct ctdb_ipflags *ipflags;
2509         enum ctdb_runstate *runstate;
2510
2511
2512         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2513                                                    "NoIPTakeover", 0);
2514         if (tval_noiptakeover == NULL) {
2515                 return NULL;
2516         }
2517
2518         tval_noiphostonalldisabled =
2519                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2520                                        "NoIPHostOnAllDisabled", 0);
2521         if (tval_noiphostonalldisabled == NULL) {
2522                 /* Caller frees tmp_ctx */
2523                 return NULL;
2524         }
2525
2526         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2527          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2528          * reasonable behaviour on a mixed cluster during upgrade.
2529          */
2530         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2531                                            CTDB_RUNSTATE_RUNNING);
2532         if (runstate == NULL) {
2533                 /* Caller frees tmp_ctx */
2534                 return NULL;
2535         }
2536
2537         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2538                                        tval_noiptakeover,
2539                                        tval_noiphostonalldisabled,
2540                                        runstate);
2541
2542         talloc_free(tval_noiptakeover);
2543         talloc_free(tval_noiphostonalldisabled);
2544         talloc_free(runstate);
2545
2546         return ipflags;
2547 }
2548
2549 struct iprealloc_callback_data {
2550         bool *retry_nodes;
2551         int retry_count;
2552         client_async_callback fail_callback;
2553         void *fail_callback_data;
2554         struct ctdb_node_map *nodemap;
2555 };
2556
2557 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2558                                         int32_t res, TDB_DATA outdata,
2559                                         void *callback)
2560 {
2561         int numnodes;
2562         struct iprealloc_callback_data *cd =
2563                 (struct iprealloc_callback_data *)callback;
2564
2565         numnodes = talloc_array_length(cd->retry_nodes);
2566         if (pnn > numnodes) {
2567                 DEBUG(DEBUG_ERR,
2568                       ("ipreallocated failure from node %d, "
2569                        "but only %d nodes in nodemap\n",
2570                        pnn, numnodes));
2571                 return;
2572         }
2573
2574         /* Can't run the "ipreallocated" event on a INACTIVE node */
2575         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2576                 DEBUG(DEBUG_WARNING,
2577                       ("ipreallocated failed on inactive node %d, ignoring\n",
2578                        pnn));
2579                 return;
2580         }
2581
2582         switch (res) {
2583         case -ETIME:
2584                 /* If the control timed out then that's a real error,
2585                  * so call the real fail callback
2586                  */
2587                 if (cd->fail_callback) {
2588                         cd->fail_callback(ctdb, pnn, res, outdata,
2589                                           cd->fail_callback_data);
2590                 } else {
2591                         DEBUG(DEBUG_WARNING,
2592                               ("iprealloc timed out but no callback registered\n"));
2593                 }
2594                 break;
2595         default:
2596                 /* If not a timeout then either the ipreallocated
2597                  * eventscript (or some setup) failed.  This might
2598                  * have failed because the IPREALLOCATED control isn't
2599                  * implemented - right now there is no way of knowing
2600                  * because the error codes are all folded down to -1.
2601                  * Consider retrying using EVENTSCRIPT control...
2602                  */
2603                 DEBUG(DEBUG_WARNING,
2604                       ("ipreallocated failure from node %d, flagging retry\n",
2605                        pnn));
2606                 cd->retry_nodes[pnn] = true;
2607                 cd->retry_count++;
2608         }
2609 }
2610
2611 struct takeover_callback_data {
2612         bool *node_failed;
2613         client_async_callback fail_callback;
2614         void *fail_callback_data;
2615         struct ctdb_node_map *nodemap;
2616 };
2617
2618 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2619                                        uint32_t node_pnn, int32_t res,
2620                                        TDB_DATA outdata, void *callback_data)
2621 {
2622         struct takeover_callback_data *cd =
2623                 talloc_get_type_abort(callback_data,
2624                                       struct takeover_callback_data);
2625         int i;
2626
2627         for (i = 0; i < cd->nodemap->num; i++) {
2628                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2629                         break;
2630                 }
2631         }
2632
2633         if (i == cd->nodemap->num) {
2634                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2635                 return;
2636         }
2637
2638         if (!cd->node_failed[i]) {
2639                 cd->node_failed[i] = true;
2640                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2641                                   cd->fail_callback_data);
2642         }
2643 }
2644
2645 /*
2646   make any IP alias changes for public addresses that are necessary 
2647  */
2648 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2649                       uint32_t *force_rebalance_nodes,
2650                       client_async_callback fail_callback, void *callback_data)
2651 {
2652         int i, j, ret;
2653         struct ctdb_public_ip ip;
2654         struct ctdb_public_ipv4 ipv4;
2655         uint32_t *nodes;
2656         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2657         TDB_DATA data;
2658         struct timeval timeout;
2659         struct client_async_data *async_data;
2660         struct ctdb_client_control_state *state;
2661         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2662         struct ctdb_ipflags *ipflags;
2663         struct takeover_callback_data *takeover_data;
2664         struct iprealloc_callback_data iprealloc_data;
2665         bool *retry_data;
2666
2667         /*
2668          * ip failover is completely disabled, just send out the 
2669          * ipreallocated event.
2670          */
2671         if (ctdb->tunable.disable_ip_failover != 0) {
2672                 goto ipreallocated;
2673         }
2674
2675         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2676         if (ipflags == NULL) {
2677                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2678                 talloc_free(tmp_ctx);
2679                 return -1;
2680         }
2681
2682         /* Do the IP reassignment calculations */
2683         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2684
2685         /* Now tell all nodes to release any public IPs should not
2686          * host.  This will be a NOOP on nodes that don't currently
2687          * hold the given IP.
2688          */
2689         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2690         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2691
2692         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2693                                                        bool, nodemap->num);
2694         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2695         takeover_data->fail_callback = fail_callback;
2696         takeover_data->fail_callback_data = callback_data;
2697         takeover_data->nodemap = nodemap;
2698
2699         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2700         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2701
2702         async_data->fail_callback = takeover_run_fail_callback;
2703         async_data->callback_data = takeover_data;
2704
2705         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2706
2707         /* Send a RELEASE_IP to all nodes that should not be hosting
2708          * each IP.  For each IP, all but one of these will be
2709          * redundant.  However, the redundant ones are used to tell
2710          * nodes which node should be hosting the IP so that commands
2711          * like "ctdb ip" can display a particular nodes idea of who
2712          * is hosting what. */
2713         for (i=0;i<nodemap->num;i++) {
2714                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2715                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2716                         continue;
2717                 }
2718
2719                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2720                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2721                                 /* This node should be serving this
2722                                    vnn so dont tell it to release the ip
2723                                 */
2724                                 continue;
2725                         }
2726                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2727                                 ipv4.pnn = tmp_ip->pnn;
2728                                 ipv4.sin = tmp_ip->addr.ip;
2729
2730                                 timeout = TAKEOVER_TIMEOUT();
2731                                 data.dsize = sizeof(ipv4);
2732                                 data.dptr  = (uint8_t *)&ipv4;
2733                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2734                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2735                                                 data, async_data,
2736                                                 &timeout, NULL);
2737                         } else {
2738                                 ip.pnn  = tmp_ip->pnn;
2739                                 ip.addr = tmp_ip->addr;
2740
2741                                 timeout = TAKEOVER_TIMEOUT();
2742                                 data.dsize = sizeof(ip);
2743                                 data.dptr  = (uint8_t *)&ip;
2744                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2745                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2746                                                 data, async_data,
2747                                                 &timeout, NULL);
2748                         }
2749
2750                         if (state == NULL) {
2751                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2752                                 talloc_free(tmp_ctx);
2753                                 return -1;
2754                         }
2755                 
2756                         ctdb_client_async_add(async_data, state);
2757                 }
2758         }
2759         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2760                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2761                 talloc_free(tmp_ctx);
2762                 return -1;
2763         }
2764         talloc_free(async_data);
2765
2766
2767         /* For each IP, send a TAKOVER_IP to the node that should be
2768          * hosting it.  Many of these will often be redundant (since
2769          * the allocation won't have changed) but they can be useful
2770          * to recover from inconsistencies. */
2771         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2772         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2773
2774         async_data->fail_callback = fail_callback;
2775         async_data->callback_data = callback_data;
2776
2777         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2778                 if (tmp_ip->pnn == -1) {
2779                         /* this IP won't be taken over */
2780                         continue;
2781                 }
2782
2783                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2784                         ipv4.pnn = tmp_ip->pnn;
2785                         ipv4.sin = tmp_ip->addr.ip;
2786
2787                         timeout = TAKEOVER_TIMEOUT();
2788                         data.dsize = sizeof(ipv4);
2789                         data.dptr  = (uint8_t *)&ipv4;
2790                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2791                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2792                                         data, async_data,
2793                                         &timeout, NULL);
2794                 } else {
2795                         ip.pnn  = tmp_ip->pnn;
2796                         ip.addr = tmp_ip->addr;
2797
2798                         timeout = TAKEOVER_TIMEOUT();
2799                         data.dsize = sizeof(ip);
2800                         data.dptr  = (uint8_t *)&ip;
2801                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2802                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2803                                         data, async_data,
2804                                         &timeout, NULL);
2805                 }
2806                 if (state == NULL) {
2807                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2808                         talloc_free(tmp_ctx);
2809                         return -1;
2810                 }
2811                 
2812                 ctdb_client_async_add(async_data, state);
2813         }
2814         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2815                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2816                 talloc_free(tmp_ctx);
2817                 return -1;
2818         }
2819
2820 ipreallocated:
2821         /*
2822          * Tell all nodes to run eventscripts to process the
2823          * "ipreallocated" event.  This can do a lot of things,
2824          * including restarting services to reconfigure them if public
2825          * IPs have moved.  Once upon a time this event only used to
2826          * update natgw.
2827          */
2828         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2829         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2830         iprealloc_data.retry_nodes = retry_data;
2831         iprealloc_data.retry_count = 0;
2832         iprealloc_data.fail_callback = fail_callback;
2833         iprealloc_data.fail_callback_data = callback_data;
2834         iprealloc_data.nodemap = nodemap;
2835
2836         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2837         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2838                                         nodes, 0, TAKEOVER_TIMEOUT(),
2839                                         false, tdb_null,
2840                                         NULL, iprealloc_fail_callback,
2841                                         &iprealloc_data);
2842         if (ret != 0) {
2843                 /* If the control failed then we should retry to any
2844                  * nodes flagged by iprealloc_fail_callback using the
2845                  * EVENTSCRIPT control.  This is a best-effort at
2846                  * backward compatiblity when running a mixed cluster
2847                  * where some nodes have not yet been upgraded to
2848                  * support the IPREALLOCATED control.
2849                  */
2850                 DEBUG(DEBUG_WARNING,
2851                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2852
2853                 nodes = talloc_array(tmp_ctx, uint32_t,
2854                                      iprealloc_data.retry_count);
2855                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2856
2857                 j = 0;
2858                 for (i=0; i<nodemap->num; i++) {
2859                         if (iprealloc_data.retry_nodes[i]) {
2860                                 nodes[j] = i;
2861                                 j++;
2862                         }
2863                 }
2864
2865                 data.dptr  = discard_const("ipreallocated");
2866                 data.dsize = strlen((char *)data.dptr) + 1; 
2867                 ret = ctdb_client_async_control(ctdb,
2868                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2869                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2870                                                 false, data,
2871                                                 NULL, fail_callback,
2872                                                 callback_data);
2873                 if (ret != 0) {
2874                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2875                 }
2876         }
2877
2878         talloc_free(tmp_ctx);
2879         return ret;
2880 }
2881
2882
2883 /*
2884   destroy a ctdb_client_ip structure
2885  */
2886 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2887 {
2888         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2889                 ctdb_addr_to_str(&ip->addr),
2890                 ntohs(ip->addr.ip.sin_port),
2891                 ip->client_id));
2892
2893         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2894         return 0;
2895 }
2896
2897 /*
2898   called by a client to inform us of a TCP connection that it is managing
2899   that should tickled with an ACK when IP takeover is done
2900   we handle both the old ipv4 style of packets as well as the new ipv4/6
2901   pdus.
2902  */
2903 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2904                                 TDB_DATA indata)
2905 {
2906         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2907         struct ctdb_control_tcp *old_addr = NULL;
2908         struct ctdb_control_tcp_addr new_addr;
2909         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2910         struct ctdb_tcp_list *tcp;
2911         struct ctdb_tcp_connection t;
2912         int ret;
2913         TDB_DATA data;
2914         struct ctdb_client_ip *ip;
2915         struct ctdb_vnn *vnn;
2916         ctdb_sock_addr addr;
2917
2918         /* If we don't have public IPs, tickles are useless */
2919         if (ctdb->vnn == NULL) {
2920                 return 0;
2921         }
2922
2923         switch (indata.dsize) {
2924         case sizeof(struct ctdb_control_tcp):
2925                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2926                 ZERO_STRUCT(new_addr);
2927                 tcp_sock = &new_addr;
2928                 tcp_sock->src.ip  = old_addr->src;
2929                 tcp_sock->dest.ip = old_addr->dest;
2930                 break;
2931         case sizeof(struct ctdb_control_tcp_addr):
2932                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2933                 break;
2934         default:
2935                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2936                                  "to ctdb_control_tcp_client. size was %d but "
2937                                  "only allowed sizes are %lu and %lu\n",
2938                                  (int)indata.dsize,
2939                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2940                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2941                 return -1;
2942         }
2943
2944         addr = tcp_sock->src;
2945         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2946         addr = tcp_sock->dest;
2947         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2948
2949         ZERO_STRUCT(addr);
2950         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2951         vnn = find_public_ip_vnn(ctdb, &addr);
2952         if (vnn == NULL) {
2953                 switch (addr.sa.sa_family) {
2954                 case AF_INET:
2955                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2956                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2957                                         ctdb_addr_to_str(&addr)));
2958                         }
2959                         break;
2960                 case AF_INET6:
2961                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2962                                 ctdb_addr_to_str(&addr)));
2963                         break;
2964                 default:
2965                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2966                 }
2967
2968                 return 0;
2969         }
2970
2971         if (vnn->pnn != ctdb->pnn) {
2972                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2973                         ctdb_addr_to_str(&addr),
2974                         client_id, client->pid));
2975                 /* failing this call will tell smbd to die */
2976                 return -1;
2977         }
2978
2979         ip = talloc(client, struct ctdb_client_ip);
2980         CTDB_NO_MEMORY(ctdb, ip);
2981
2982         ip->ctdb      = ctdb;
2983         ip->addr      = addr;
2984         ip->client_id = client_id;
2985         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2986         DLIST_ADD(ctdb->client_ip_list, ip);
2987
2988         tcp = talloc(client, struct ctdb_tcp_list);
2989         CTDB_NO_MEMORY(ctdb, tcp);
2990
2991         tcp->connection.src_addr = tcp_sock->src;
2992         tcp->connection.dst_addr = tcp_sock->dest;
2993
2994         DLIST_ADD(client->tcp_list, tcp);
2995
2996         t.src_addr = tcp_sock->src;
2997         t.dst_addr = tcp_sock->dest;
2998
2999         data.dptr = (uint8_t *)&t;
3000         data.dsize = sizeof(t);
3001
3002         switch (addr.sa.sa_family) {
3003         case AF_INET:
3004                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3005                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
3006                         ctdb_addr_to_str(&tcp_sock->src),
3007                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
3008                 break;
3009         case AF_INET6:
3010                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3011                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
3012                         ctdb_addr_to_str(&tcp_sock->src),
3013                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
3014                 break;
3015         default:
3016                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3017         }
3018
3019
3020         /* tell all nodes about this tcp connection */
3021         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3022                                        CTDB_CONTROL_TCP_ADD,
3023                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3024         if (ret != 0) {
3025                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3026                 return -1;
3027         }
3028
3029         return 0;
3030 }
3031
3032 /*
3033   find a tcp address on a list
3034  */
3035 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
3036                                            struct ctdb_tcp_connection *tcp)
3037 {
3038         int i;
3039
3040         if (array == NULL) {
3041                 return NULL;
3042         }
3043
3044         for (i=0;i<array->num;i++) {
3045                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3046                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3047                         return &array->connections[i];
3048                 }
3049         }
3050         return NULL;
3051 }
3052
3053
3054
3055 /*
3056   called by a daemon to inform us of a TCP connection that one of its
3057   clients managing that should tickled with an ACK when IP takeover is
3058   done
3059  */
3060 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3061 {
3062         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3063         struct ctdb_tcp_array *tcparray;
3064         struct ctdb_tcp_connection tcp;
3065         struct ctdb_vnn *vnn;
3066
3067         /* If we don't have public IPs, tickles are useless */
3068         if (ctdb->vnn == NULL) {
3069                 return 0;
3070         }
3071
3072         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3073         if (vnn == NULL) {
3074                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3075                         ctdb_addr_to_str(&p->dst_addr)));
3076
3077                 return -1;
3078         }
3079
3080
3081         tcparray = vnn->tcp_array;
3082
3083         /* If this is the first tickle */
3084         if (tcparray == NULL) {
3085                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3086                 CTDB_NO_MEMORY(ctdb, tcparray);
3087                 vnn->tcp_array = tcparray;
3088
3089                 tcparray->num = 0;
3090                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3091                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3092
3093                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3094                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3095                 tcparray->num++;
3096
3097                 if (tcp_update_needed) {
3098                         vnn->tcp_update_needed = true;
3099                 }
3100                 return 0;
3101         }
3102
3103
3104         /* Do we already have this tickle ?*/
3105         tcp.src_addr = p->src_addr;
3106         tcp.dst_addr = p->dst_addr;
3107         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3108                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3109                         ctdb_addr_to_str(&tcp.dst_addr),
3110                         ntohs(tcp.dst_addr.ip.sin_port),
3111                         vnn->pnn));
3112                 return 0;
3113         }
3114
3115         /* A new tickle, we must add it to the array */
3116         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3117                                         struct ctdb_tcp_connection,
3118                                         tcparray->num+1);
3119         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3120
3121         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3122         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3123         tcparray->num++;
3124
3125         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3126                 ctdb_addr_to_str(&tcp.dst_addr),
3127                 ntohs(tcp.dst_addr.ip.sin_port),
3128                 vnn->pnn));
3129
3130         if (tcp_update_needed) {
3131                 vnn->tcp_update_needed = true;
3132         }
3133
3134         return 0;
3135 }
3136
3137
3138 /*
3139   called by a daemon to inform us of a TCP connection that one of its
3140   clients managing that should tickled with an ACK when IP takeover is
3141   done
3142  */
3143 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3144 {
3145         struct ctdb_tcp_connection *tcpp;
3146         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3147
3148         if (vnn == NULL) {
3149                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3150                         ctdb_addr_to_str(&conn->dst_addr)));
3151                 return;
3152         }
3153
3154         /* if the array is empty we cant remove it
3155            and we dont need to do anything
3156          */
3157         if (vnn->tcp_array == NULL) {
3158                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3159                         ctdb_addr_to_str(&conn->dst_addr),
3160                         ntohs(conn->dst_addr.ip.sin_port)));
3161                 return;
3162         }
3163
3164
3165         /* See if we know this connection
3166            if we dont know this connection  then we dont need to do anything
3167          */
3168         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3169         if (tcpp == NULL) {
3170                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3171                         ctdb_addr_to_str(&conn->dst_addr),
3172                         ntohs(conn->dst_addr.ip.sin_port)));
3173                 return;
3174         }
3175
3176
3177         /* We need to remove this entry from the array.
3178            Instead of allocating a new array and copying data to it
3179            we cheat and just copy the last entry in the existing array
3180            to the entry that is to be removed and just shring the 
3181            ->num field
3182          */
3183         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3184         vnn->tcp_array->num--;
3185
3186         /* If we deleted the last entry we also need to remove the entire array
3187          */
3188         if (vnn->tcp_array->num == 0) {
3189                 talloc_free(vnn->tcp_array);
3190                 vnn->tcp_array = NULL;
3191         }               
3192
3193         vnn->tcp_update_needed = true;
3194
3195         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3196                 ctdb_addr_to_str(&conn->src_addr),
3197                 ntohs(conn->src_addr.ip.sin_port)));
3198 }
3199
3200
3201 /*
3202   called by a daemon to inform us of a TCP connection that one of its
3203   clients used are no longer needed in the tickle database
3204  */
3205 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3206 {
3207         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3208
3209         /* If we don't have public IPs, tickles are useless */
3210         if (ctdb->vnn == NULL) {
3211                 return 0;
3212         }
3213
3214         ctdb_remove_tcp_connection(ctdb, conn);
3215
3216         return 0;
3217 }
3218
3219
3220 /*
3221   Called when another daemon starts - causes all tickles for all
3222   public addresses we are serving to be sent to the new node on the
3223   next check.  This actually causes the next scheduled call to
3224   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3225   doesn't require careful error handling.
3226  */
3227 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3228 {
3229         struct ctdb_vnn *vnn;
3230
3231         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3232                            (unsigned long) pnn));
3233
3234         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3235                 vnn->tcp_update_needed = true;
3236         }
3237
3238         return 0;
3239 }
3240
3241
3242 /*
3243   called when a client structure goes away - hook to remove
3244   elements from the tcp_list in all daemons
3245  */
3246 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3247 {
3248         while (client->tcp_list) {
3249                 struct ctdb_tcp_list *tcp = client->tcp_list;
3250                 DLIST_REMOVE(client->tcp_list, tcp);
3251                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3252         }
3253 }
3254
3255
3256 /*
3257   release all IPs on shutdown
3258  */
3259 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3260 {
3261         struct ctdb_vnn *vnn;
3262         int count = 0;
3263
3264         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3265                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3266                         ctdb_vnn_unassign_iface(ctdb, vnn);
3267                         continue;
3268                 }
3269                 if (!vnn->iface) {
3270                         continue;
3271                 }
3272
3273                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3274                                     ctdb_addr_to_str(&vnn->public_address),
3275                                     vnn->public_netmask_bits,
3276                                     ctdb_vnn_iface_string(vnn)));
3277
3278                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3279                                   ctdb_vnn_iface_string(vnn),
3280                                   ctdb_addr_to_str(&vnn->public_address),
3281                                   vnn->public_netmask_bits);
3282                 release_kill_clients(ctdb, &vnn->public_address);
3283                 ctdb_vnn_unassign_iface(ctdb, vnn);
3284                 count++;
3285         }
3286
3287         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3288 }
3289
3290
3291 /*
3292   get list of public IPs
3293  */
3294 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3295                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3296 {
3297         int i, num, len;
3298         struct ctdb_all_public_ips *ips;
3299         struct ctdb_vnn *vnn;
3300         bool only_available = false;
3301
3302         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3303                 only_available = true;
3304         }
3305
3306         /* count how many public ip structures we have */
3307         num = 0;
3308         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3309                 num++;
3310         }
3311
3312         len = offsetof(struct ctdb_all_public_ips, ips) + 
3313                 num*sizeof(struct ctdb_public_ip);
3314         ips = talloc_zero_size(outdata, len);
3315         CTDB_NO_MEMORY(ctdb, ips);
3316
3317         i = 0;
3318         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3319                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3320                         continue;
3321                 }
3322                 ips->ips[i].pnn  = vnn->pnn;
3323                 ips->ips[i].addr = vnn->public_address;
3324                 i++;
3325         }
3326         ips->num = i;
3327         len = offsetof(struct ctdb_all_public_ips, ips) +
3328                 i*sizeof(struct ctdb_public_ip);
3329
3330         outdata->dsize = len;
3331         outdata->dptr  = (uint8_t *)ips;
3332
3333         return 0;
3334 }
3335
3336
3337 /*
3338   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3339  */
3340 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3341                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3342 {
3343         int i, num, len;
3344         struct ctdb_all_public_ipsv4 *ips;
3345         struct ctdb_vnn *vnn;
3346
3347         /* count how many public ip structures we have */
3348         num = 0;
3349         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3350                 if (vnn->public_address.sa.sa_family != AF_INET) {
3351                         continue;
3352                 }
3353                 num++;
3354         }
3355
3356         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3357                 num*sizeof(struct ctdb_public_ipv4);
3358         ips = talloc_zero_size(outdata, len);
3359         CTDB_NO_MEMORY(ctdb, ips);
3360
3361         outdata->dsize = len;
3362         outdata->dptr  = (uint8_t *)ips;
3363
3364         ips->num = num;
3365         i = 0;
3366         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3367                 if (vnn->public_address.sa.sa_family != AF_INET) {
3368                         continue;
3369                 }
3370                 ips->ips[i].pnn = vnn->pnn;
3371                 ips->ips[i].sin = vnn->public_address.ip;
3372                 i++;
3373         }
3374
3375         return 0;
3376 }
3377
3378 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3379                                         struct ctdb_req_control *c,
3380                                         TDB_DATA indata,
3381                                         TDB_DATA *outdata)
3382 {
3383         int i, num, len;
3384         ctdb_sock_addr *addr;
3385         struct ctdb_control_public_ip_info *info;
3386         struct ctdb_vnn *vnn;
3387
3388         addr = (ctdb_sock_addr *)indata.dptr;
3389
3390         vnn = find_public_ip_vnn(ctdb, addr);
3391         if (vnn == NULL) {
3392                 /* if it is not a public ip   it could be our 'single ip' */
3393                 if (ctdb->single_ip_vnn) {
3394                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3395                                 vnn = ctdb->single_ip_vnn;
3396                         }
3397                 }
3398         }
3399         if (vnn == NULL) {
3400                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3401                                  "'%s'not a public address\n",
3402                                  ctdb_addr_to_str(addr)));
3403                 return -1;
3404         }
3405
3406         /* count how many public ip structures we have */
3407         num = 0;
3408         for (;vnn->ifaces[num];) {
3409                 num++;
3410         }
3411
3412         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3413                 num*sizeof(struct ctdb_control_iface_info);
3414         info = talloc_zero_size(outdata, len);
3415         CTDB_NO_MEMORY(ctdb, info);
3416
3417         info->ip.addr = vnn->public_address;
3418         info->ip.pnn = vnn->pnn;
3419         info->active_idx = 0xFFFFFFFF;
3420
3421         for (i=0; vnn->ifaces[i]; i++) {
3422                 struct ctdb_iface *cur;
3423
3424                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3425                 if (cur == NULL) {
3426                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3427                                            vnn->ifaces[i]));
3428                         return -1;
3429                 }
3430                 if (vnn->iface == cur) {
3431                         info->active_idx = i;
3432                 }
3433                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3434                 info->ifaces[i].link_state = cur->link_up;
3435                 info->ifaces[i].references = cur->references;
3436         }
3437         info->num = i;
3438         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3439                 i*sizeof(struct ctdb_control_iface_info);
3440
3441         outdata->dsize = len;
3442         outdata->dptr  = (uint8_t *)info;
3443
3444         return 0;
3445 }
3446
3447 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3448                                 struct ctdb_req_control *c,
3449                                 TDB_DATA *outdata)
3450 {
3451         int i, num, len;
3452         struct ctdb_control_get_ifaces *ifaces;
3453         struct ctdb_iface *cur;
3454
3455         /* count how many public ip structures we have */
3456         num = 0;
3457         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3458                 num++;
3459         }
3460
3461         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3462                 num*sizeof(struct ctdb_control_iface_info);
3463         ifaces = talloc_zero_size(outdata, len);
3464         CTDB_NO_MEMORY(ctdb, ifaces);
3465
3466         i = 0;
3467         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3468                 strcpy(ifaces->ifaces[i].name, cur->name);
3469                 ifaces->ifaces[i].link_state = cur->link_up;
3470                 ifaces->ifaces[i].references = cur->references;
3471                 i++;
3472         }
3473         ifaces->num = i;
3474         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3475                 i*sizeof(struct ctdb_control_iface_info);
3476
3477         outdata->dsize = len;
3478         outdata->dptr  = (uint8_t *)ifaces;
3479
3480         return 0;
3481 }
3482
3483 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3484                                     struct ctdb_req_control *c,
3485                                     TDB_DATA indata)
3486 {
3487         struct ctdb_control_iface_info *info;
3488         struct ctdb_iface *iface;
3489         bool link_up = false;
3490
3491         info = (struct ctdb_control_iface_info *)indata.dptr;
3492
3493         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3494                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3495                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3496                                   len, len, info->name));
3497                 return -1;
3498         }
3499
3500         switch (info->link_state) {
3501         case 0:
3502                 link_up = false;
3503                 break;
3504         case 1:
3505                 link_up = true;
3506                 break;
3507         default:
3508                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3509                                   (unsigned int)info->link_state));
3510                 return -1;
3511         }
3512
3513         if (info->references != 0) {
3514                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3515                                   (unsigned int)info->references));
3516                 return -1;
3517         }
3518
3519         iface = ctdb_find_iface(ctdb, info->name);
3520         if (iface == NULL) {
3521                 return -1;
3522         }
3523
3524         if (link_up == iface->link_up) {
3525                 return 0;
3526         }
3527
3528         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3529               ("iface[%s] has changed it's link status %s => %s\n",
3530                iface->name,
3531                iface->link_up?"up":"down",
3532                link_up?"up":"down"));
3533
3534         iface->link_up = link_up;
3535         return 0;
3536 }
3537
3538
/*
   Per public address (vnn) state for killing TCP connections: the
   capture socket and the set of tcp connections that the ctdb daemon
   is to kill.
*/
struct ctdb_kill_tcp {
        /* the public address this state belongs to */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* capture socket, -1 while it has not been opened */
        int capture_fd;
        /* read event on capture_fd, handled by capture_tcp_handler() */
        struct fd_event *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* filled in by ctdb_sys_open_capture_socket() and handed back to
           ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3551
/*
  A single tcp connection that is to be killed.
 */
struct ctdb_killtcp_con {
        /* endpoints, stored after ctdb_canonicalize_ip() */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* tickles sent by the periodic timer so far; the connection is
           given up on once this reaches 5 */
        int count;
        /* the per-vnn state this connection hangs off */
        struct ctdb_kill_tcp *killtcp;
};
3561
3562 /* this function is used to create a key to represent this socketpair
3563    in the killtcp tree.
3564    this key is used to insert and lookup matching socketpairs that are
3565    to be tickled and RST
3566 */
3567 #define KILLTCP_KEYLEN  10
3568 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3569 {
3570         static uint32_t key[KILLTCP_KEYLEN];
3571
3572         bzero(key, sizeof(key));
3573
3574         if (src->sa.sa_family != dst->sa.sa_family) {
3575                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3576                 return key;
3577         }
3578         
3579         switch (src->sa.sa_family) {
3580         case AF_INET:
3581                 key[0]  = dst->ip.sin_addr.s_addr;
3582                 key[1]  = src->ip.sin_addr.s_addr;
3583                 key[2]  = dst->ip.sin_port;
3584                 key[3]  = src->ip.sin_port;
3585                 break;
3586         case AF_INET6: {
3587                 uint32_t *dst6_addr32 =
3588                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3589                 uint32_t *src6_addr32 =
3590                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3591                 key[0]  = dst6_addr32[3];
3592                 key[1]  = src6_addr32[3];
3593                 key[2]  = dst6_addr32[2];
3594                 key[3]  = src6_addr32[2];
3595                 key[4]  = dst6_addr32[1];
3596                 key[5]  = src6_addr32[1];
3597                 key[6]  = dst6_addr32[0];
3598                 key[7]  = src6_addr32[0];
3599                 key[8]  = dst->ip6.sin6_port;
3600                 key[9]  = src->ip6.sin6_port;
3601                 break;
3602         }
3603         default:
3604                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3605                 return key;
3606         }
3607
3608         return key;
3609 }
3610
3611 /*
3612   called when we get a read event on the raw socket
3613  */
3614 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3615                                 uint16_t flags, void *private_data)
3616 {
3617         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3618         struct ctdb_killtcp_con *con;
3619         ctdb_sock_addr src, dst;
3620         uint32_t ack_seq, seq;
3621
3622         if (!(flags & EVENT_FD_READ)) {
3623                 return;
3624         }
3625
3626         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3627                                 killtcp->private_data,
3628                                 &src, &dst,
3629                                 &ack_seq, &seq) != 0) {
3630                 /* probably a non-tcp ACK packet */
3631                 return;
3632         }
3633
3634         /* check if we have this guy in our list of connections
3635            to kill
3636         */
3637         con = trbt_lookuparray32(killtcp->connections, 
3638                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3639         if (con == NULL) {
3640                 /* no this was some other packet we can just ignore */
3641                 return;
3642         }
3643
3644         /* This one has been tickled !
3645            now reset him and remove him from the list.
3646          */
3647         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3648                 ntohs(con->dst_addr.ip.sin_port),
3649                 ctdb_addr_to_str(&con->src_addr),
3650                 ntohs(con->src_addr.ip.sin_port)));
3651
3652         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3653         talloc_free(con);
3654 }
3655
3656
3657 /* when traversing the list of all tcp connections to send tickle acks to
3658    (so that we can capture the ack coming back and kill the connection
3659     by a RST)
3660    this callback is called for each connection we are currently trying to kill
3661 */
3662 static int tickle_connection_traverse(void *param, void *data)
3663 {
3664         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3665
3666         /* have tried too many times, just give up */
3667         if (con->count >= 5) {
3668                 /* can't delete in traverse: reparent to delete_cons */
3669                 talloc_steal(param, con);
3670                 return 0;
3671         }
3672
3673         /* othervise, try tickling it again */
3674         con->count++;
3675         ctdb_sys_send_tcp(
3676                 (ctdb_sock_addr *)&con->dst_addr,
3677                 (ctdb_sock_addr *)&con->src_addr,
3678                 0, 0, 0);
3679         return 0;
3680 }
3681
3682
3683 /* 
3684    called every second until all sentenced connections have been reset
3685  */
3686 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3687                                               struct timeval t, void *private_data)
3688 {
3689         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3690         void *delete_cons = talloc_new(NULL);
3691
3692         /* loop over all connections sending tickle ACKs */
3693         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3694
3695         /* now we've finished traverse, it's safe to do deletion. */
3696         talloc_free(delete_cons);
3697
3698         /* If there are no more connections to kill we can remove the
3699            entire killtcp structure
3700          */
3701         if ( (killtcp->connections == NULL) || 
3702              (killtcp->connections->root == NULL) ) {
3703                 talloc_free(killtcp);
3704                 return;
3705         }
3706
3707         /* try tickling them again in a seconds time
3708          */
3709         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3710                         ctdb_tickle_sentenced_connections, killtcp);
3711 }
3712
3713 /*
3714   destroy the killtcp structure
3715  */
3716 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3717 {
3718         struct ctdb_vnn *tmpvnn;
3719
3720         /* verify that this vnn is still active */
3721         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3722                 if (tmpvnn == killtcp->vnn) {
3723                         break;
3724                 }
3725         }
3726
3727         if (tmpvnn == NULL) {
3728                 return 0;
3729         }
3730
3731         if (killtcp->vnn->killtcp != killtcp) {
3732                 return 0;
3733         }
3734
3735         killtcp->vnn->killtcp = NULL;
3736
3737         return 0;
3738 }
3739
3740
/* Insert callback for the killtcp tree: the new connection structure
   (parm) always replaces whatever was stored before (data).

   The old structure is deliberately not freed here; per the original
   note it is talloc_stolen by the same tree node and goes away when
   the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;

        return replacement;
}
3752
3753 /*
3754   add a tcp socket to the list of connections we want to RST
3755  */
3756 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3757                                        ctdb_sock_addr *s,
3758                                        ctdb_sock_addr *d)
3759 {
3760         ctdb_sock_addr src, dst;
3761         struct ctdb_kill_tcp *killtcp;
3762         struct ctdb_killtcp_con *con;
3763         struct ctdb_vnn *vnn;
3764
3765         ctdb_canonicalize_ip(s, &src);
3766         ctdb_canonicalize_ip(d, &dst);
3767
3768         vnn = find_public_ip_vnn(ctdb, &dst);
3769         if (vnn == NULL) {
3770                 vnn = find_public_ip_vnn(ctdb, &src);
3771         }
3772         if (vnn == NULL) {
3773                 /* if it is not a public ip   it could be our 'single ip' */
3774                 if (ctdb->single_ip_vnn) {
3775                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3776                                 vnn = ctdb->single_ip_vnn;
3777                         }
3778                 }
3779         }
3780         if (vnn == NULL) {
3781                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3782                 return -1;
3783         }
3784
3785         killtcp = vnn->killtcp;
3786         
3787         /* If this is the first connection to kill we must allocate
3788            a new structure
3789          */
3790         if (killtcp == NULL) {
3791                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3792                 CTDB_NO_MEMORY(ctdb, killtcp);
3793
3794                 killtcp->vnn         = vnn;
3795                 killtcp->ctdb        = ctdb;
3796                 killtcp->capture_fd  = -1;
3797                 killtcp->connections = trbt_create(killtcp, 0);
3798
3799                 vnn->killtcp         = killtcp;
3800                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3801         }
3802
3803
3804
3805         /* create a structure that describes this connection we want to
3806            RST and store it in killtcp->connections
3807         */
3808         con = talloc(killtcp, struct ctdb_killtcp_con);
3809         CTDB_NO_MEMORY(ctdb, con);
3810         con->src_addr = src;
3811         con->dst_addr = dst;
3812         con->count    = 0;
3813         con->killtcp  = killtcp;
3814
3815
3816         trbt_insertarray32_callback(killtcp->connections,
3817                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3818                         add_killtcp_callback, con);
3819
3820         /* 
3821            If we dont have a socket to listen on yet we must create it
3822          */
3823         if (killtcp->capture_fd == -1) {
3824                 const char *iface = ctdb_vnn_iface_string(vnn);
3825                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3826                 if (killtcp->capture_fd == -1) {
3827                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3828                                           "socket on iface '%s' for killtcp (%s)\n",
3829                                           iface, strerror(errno)));
3830                         goto failed;
3831                 }
3832         }
3833
3834
3835         if (killtcp->fde == NULL) {
3836                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3837                                             EVENT_FD_READ,
3838                                             capture_tcp_handler, killtcp);
3839                 tevent_fd_set_auto_close(killtcp->fde);
3840
3841                 /* We also need to set up some events to tickle all these connections
3842                    until they are all reset
3843                 */
3844                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3845                                 ctdb_tickle_sentenced_connections, killtcp);
3846         }
3847
3848         /* tickle him once now */
3849         ctdb_sys_send_tcp(
3850                 &con->dst_addr,
3851                 &con->src_addr,
3852                 0, 0, 0);
3853
3854         return 0;
3855
3856 failed:
3857         talloc_free(vnn->killtcp);
3858         vnn->killtcp = NULL;
3859         return -1;
3860 }
3861
3862 /*
3863   kill a TCP connection.
3864  */
3865 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3866 {
3867         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3868
3869         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3870 }
3871
3872 /*
3873   called by a daemon to inform us of the entire list of TCP tickles for
3874   a particular public address.
3875   this control should only be sent by the node that is currently serving
3876   that public address.
3877  */
3878 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3879 {
3880         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3881         struct ctdb_tcp_array *tcparray;
3882         struct ctdb_vnn *vnn;
3883
3884         /* We must at least have tickles.num or else we cant verify the size
3885            of the received data blob
3886          */
3887         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3888                                         tickles.connections)) {
3889                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3890                 return -1;
3891         }
3892
3893         /* verify that the size of data matches what we expect */
3894         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3895                                 tickles.connections)
3896                          + sizeof(struct ctdb_tcp_connection)
3897                                  * list->tickles.num) {
3898                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3899                 return -1;
3900         }
3901
3902         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3903                            ctdb_addr_to_str(&list->addr)));
3904
3905         vnn = find_public_ip_vnn(ctdb, &list->addr);
3906         if (vnn == NULL) {
3907                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3908                         ctdb_addr_to_str(&list->addr)));
3909
3910                 return 1;
3911         }
3912
3913         /* remove any old ticklelist we might have */
3914         talloc_free(vnn->tcp_array);
3915         vnn->tcp_array = NULL;
3916
3917         tcparray = talloc(vnn, struct ctdb_tcp_array);
3918         CTDB_NO_MEMORY(ctdb, tcparray);
3919
3920         tcparray->num = list->tickles.num;
3921
3922         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3923         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3924
3925         memcpy(tcparray->connections, &list->tickles.connections[0],
3926                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3927
3928         /* We now have a new fresh tickle list array for this vnn */
3929         vnn->tcp_array = tcparray;
3930
3931         return 0;
3932 }
3933
3934 /*
3935   called to return the full list of tickles for the puclic address associated 
3936   with the provided vnn
3937  */
3938 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3939 {
3940         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3941         struct ctdb_control_tcp_tickle_list *list;
3942         struct ctdb_tcp_array *tcparray;
3943         int num;
3944         struct ctdb_vnn *vnn;
3945
3946         vnn = find_public_ip_vnn(ctdb, addr);
3947         if (vnn == NULL) {
3948                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3949                         ctdb_addr_to_str(addr)));
3950
3951                 return 1;
3952         }
3953
3954         tcparray = vnn->tcp_array;
3955         if (tcparray) {
3956                 num = tcparray->num;
3957         } else {
3958                 num = 0;
3959         }
3960
3961         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3962                                 tickles.connections)
3963                         + sizeof(struct ctdb_tcp_connection) * num;
3964
3965         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3966         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3967         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3968
3969         list->addr = *addr;
3970         list->tickles.num = num;
3971         if (num) {
3972                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3973                         sizeof(struct ctdb_tcp_connection) * num);
3974         }
3975
3976         return 0;
3977 }
3978
3979
3980 /*
3981   set the list of all tcp tickles for a public address
3982  */
3983 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3984                                             ctdb_sock_addr *addr,
3985                                             struct ctdb_tcp_array *tcparray)
3986 {
3987         int ret, num;
3988         TDB_DATA data;
3989         struct ctdb_control_tcp_tickle_list *list;
3990
3991         if (tcparray) {
3992                 num = tcparray->num;
3993         } else {
3994                 num = 0;
3995         }
3996
3997         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3998                                 tickles.connections) +
3999                         sizeof(struct ctdb_tcp_connection) * num;
4000         data.dptr = talloc_size(ctdb, data.dsize);
4001         CTDB_NO_MEMORY(ctdb, data.dptr);
4002
4003         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
4004         list->addr = *addr;
4005         list->tickles.num = num;
4006         if (tcparray) {
4007                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
4008         }
4009
4010         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
4011                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4012                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4013         if (ret != 0) {
4014                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4015                 return -1;
4016         }
4017
4018         talloc_free(data.dptr);
4019
4020         return ret;
4021 }
4022
4023
4024 /*
4025   perform tickle updates if required
4026  */
4027 static void ctdb_update_tcp_tickles(struct event_context *ev, 
4028                                 struct timed_event *te, 
4029                                 struct timeval t, void *private_data)
4030 {
4031         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4032         int ret;
4033         struct ctdb_vnn *vnn;
4034
4035         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4036                 /* we only send out updates for public addresses that 
4037                    we have taken over
4038                  */
4039                 if (ctdb->pnn != vnn->pnn) {
4040                         continue;
4041                 }
4042                 /* We only send out the updates if we need to */
4043                 if (!vnn->tcp_update_needed) {
4044                         continue;
4045                 }
4046                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
4047                                                        &vnn->public_address,
4048                                                        vnn->tcp_array);
4049                 if (ret != 0) {
4050                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4051                                 ctdb_addr_to_str(&vnn->public_address)));
4052                 } else {
4053                         DEBUG(DEBUG_INFO,
4054                               ("Sent tickle update for public address %s\n",
4055                                ctdb_addr_to_str(&vnn->public_address)));
4056                         vnn->tcp_update_needed = false;
4057                 }
4058         }
4059
4060         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4061                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4062                              ctdb_update_tcp_tickles, ctdb);
4063 }               
4064         
4065
4066 /*
4067   start periodic update of tcp tickles
4068  */
4069 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4070 {
4071         ctdb->tickle_update_context = talloc_new(ctdb);
4072
4073         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4074                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4075                              ctdb_update_tcp_tickles, ctdb);
4076 }
4077
4078
4079
4080
4081 struct control_gratious_arp {
4082         struct ctdb_context *ctdb;
4083         ctdb_sock_addr addr;
4084         const char *iface;
4085         int count;
4086 };
4087
4088 /*
4089   send a control_gratuitous arp
4090  */
4091 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4092                                   struct timeval t, void *private_data)
4093 {
4094         int ret;
4095         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4096                                                         struct control_gratious_arp);
4097
4098         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4099         if (ret != 0) {
4100                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4101                                  arp->iface, strerror(errno)));
4102         }
4103
4104
4105         arp->count++;
4106         if (arp->count == CTDB_ARP_REPEAT) {
4107                 talloc_free(arp);
4108                 return;
4109         }
4110
4111         event_add_timed(arp->ctdb->ev, arp, 
4112                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4113                         send_gratious_arp, arp);
4114 }
4115
4116
4117 /*
4118   send a gratious arp 
4119  */
4120 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4121 {
4122         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4123         struct control_gratious_arp *arp;
4124
4125         /* verify the size of indata */
4126         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4127                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4128                                  (unsigned)indata.dsize, 
4129                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4130                 return -1;
4131         }
4132         if (indata.dsize != 
4133                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4134                 + gratious_arp->len ) ){
4135
4136                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4137                         "but should be %u bytes\n", 
4138                          (unsigned)indata.dsize, 
4139                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4140                 return -1;
4141         }
4142
4143
4144         arp = talloc(ctdb, struct control_gratious_arp);
4145         CTDB_NO_MEMORY(ctdb, arp);
4146
4147         arp->ctdb  = ctdb;
4148         arp->addr   = gratious_arp->addr;
4149         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4150         CTDB_NO_MEMORY(ctdb, arp->iface);
4151         arp->count = 0;
4152         
4153         event_add_timed(arp->ctdb->ev, arp, 
4154                         timeval_zero(), send_gratious_arp, arp);
4155
4156         return 0;
4157 }
4158
4159 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4160 {
4161         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4162         int ret;
4163
4164         /* verify the size of indata */
4165         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4166                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4167                 return -1;
4168         }
4169         if (indata.dsize != 
4170                 ( offsetof(struct ctdb_control_ip_iface, iface)
4171                 + pub->len ) ){
4172
4173                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4174                         "but should be %u bytes\n", 
4175                          (unsigned)indata.dsize, 
4176                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4177                 return -1;
4178         }
4179
4180         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4181
4182         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4183
4184         if (ret != 0) {
4185                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4186                 return -1;
4187         }
4188
4189         return 0;
4190 }
4191
/*
  State carried from ctdb_control_del_public_address() to
  delete_ip_callback().
 */
struct delete_ip_callback_state {
	/* the del_public_address request to answer once release finishes */
	struct ctdb_req_control *c;
};
4195
4196 /*
4197   called when releaseip event finishes for del_public_address
4198  */
4199 static void delete_ip_callback(struct ctdb_context *ctdb,
4200                                int32_t status, TDB_DATA data,
4201                                const char *errormsg,
4202                                void *private_data)
4203 {
4204         struct delete_ip_callback_state *state =
4205                 talloc_get_type(private_data, struct delete_ip_callback_state);
4206
4207         /* If release failed then fail. */
4208         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4209         talloc_free(private_data);
4210 }
4211
4212 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4213                                         struct ctdb_req_control *c,
4214                                         TDB_DATA indata, bool *async_reply)
4215 {
4216         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4217         struct ctdb_vnn *vnn;
4218
4219         /* verify the size of indata */
4220         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4221                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4222                 return -1;
4223         }
4224         if (indata.dsize != 
4225                 ( offsetof(struct ctdb_control_ip_iface, iface)
4226                 + pub->len ) ){
4227
4228                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4229                         "but should be %u bytes\n", 
4230                          (unsigned)indata.dsize, 
4231                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4232                 return -1;
4233         }
4234
4235         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4236
4237         /* walk over all public addresses until we find a match */
4238         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4239                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4240                         if (vnn->pnn == ctdb->pnn) {
4241                                 struct delete_ip_callback_state *state;
4242                                 struct ctdb_public_ip *ip;
4243                                 TDB_DATA data;
4244                                 int ret;
4245
4246                                 vnn->delete_pending = true;
4247
4248                                 state = talloc(ctdb,
4249                                                struct delete_ip_callback_state);
4250                                 CTDB_NO_MEMORY(ctdb, state);
4251                                 state->c = c;
4252
4253                                 ip = talloc(state, struct ctdb_public_ip);
4254                                 if (ip == NULL) {
4255                                         DEBUG(DEBUG_ERR,
4256                                               (__location__ " Out of memory\n"));
4257                                         talloc_free(state);
4258                                         return -1;
4259                                 }
4260                                 ip->pnn = -1;
4261                                 ip->addr = pub->addr;
4262
4263                                 data.dsize = sizeof(struct ctdb_public_ip);
4264                                 data.dptr = (unsigned char *)ip;
4265
4266                                 ret = ctdb_daemon_send_control(ctdb,
4267                                                                ctdb_get_pnn(ctdb),
4268                                                                0,
4269                                                                CTDB_CONTROL_RELEASE_IP,
4270                                                                0, 0,
4271                                                                data,
4272                                                                delete_ip_callback,
4273                                                                state);
4274                                 if (ret == -1) {
4275                                         DEBUG(DEBUG_ERR,
4276                                               (__location__ "Unable to send "
4277                                                "CTDB_CONTROL_RELEASE_IP\n"));
4278                                         talloc_free(state);
4279                                         return -1;
4280                                 }
4281
4282                                 state->c = talloc_steal(state, c);
4283                                 *async_reply = true;
4284                         } else {
4285                                 /* This IP is not hosted on the
4286                                  * current node so just delete it
4287                                  * now. */
4288                                 do_delete_ip(ctdb, vnn);
4289                         }
4290
4291                         return 0;
4292                 }
4293         }
4294
4295         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4296                          ctdb_addr_to_str(&pub->addr)));
4297         return -1;
4298 }
4299
4300
/*
  State carried from ctdb_control_ipreallocated() to
  ctdb_ipreallocated_callback().
 */
struct ipreallocated_callback_state {
	/* the ipreallocated request to answer once the event has run */
	struct ctdb_req_control *c;
};
4304
4305 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4306                                         int status, void *p)
4307 {
4308         struct ipreallocated_callback_state *state =
4309                 talloc_get_type(p, struct ipreallocated_callback_state);
4310
4311         if (status != 0) {
4312                 DEBUG(DEBUG_ERR,
4313                       (" \"ipreallocated\" event script failed (status %d)\n",
4314                        status));
4315                 if (status == -ETIME) {
4316                         ctdb_ban_self(ctdb);
4317                 }
4318         }
4319
4320         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4321         talloc_free(state);
4322 }
4323
4324 /* A control to run the ipreallocated event */
4325 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4326                                    struct ctdb_req_control *c,
4327                                    bool *async_reply)
4328 {
4329         int ret;
4330         struct ipreallocated_callback_state *state;
4331
4332         state = talloc(ctdb, struct ipreallocated_callback_state);
4333         CTDB_NO_MEMORY(ctdb, state);
4334
4335         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4336
4337         ret = ctdb_event_script_callback(ctdb, state,
4338                                          ctdb_ipreallocated_callback, state,
4339                                          CTDB_EVENT_IPREALLOCATED,
4340                                          "%s", "");
4341
4342         if (ret != 0) {
4343                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4344                 talloc_free(state);
4345                 return -1;
4346         }
4347
4348         /* tell the control that we will be reply asynchronously */
4349         state->c    = talloc_steal(state, c);
4350         *async_reply = true;
4351
4352         return 0;
4353 }
4354
4355
4356 /* This function is called from the recovery daemon to verify that a remote
4357    node has the expected ip allocation.
4358    This is verified against ctdb->ip_tree
4359 */
4360 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4361                                 struct ctdb_all_public_ips *ips,
4362                                 uint32_t pnn)
4363 {
4364         struct ctdb_public_ip_list *tmp_ip; 
4365         int i;
4366
4367         if (ctdb->ip_tree == NULL) {
4368                 /* dont know the expected allocation yet, assume remote node
4369                    is correct. */
4370                 return 0;
4371         }
4372
4373         if (ips == NULL) {
4374                 return 0;
4375         }
4376
4377         for (i=0; i<ips->num; i++) {
4378                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4379                 if (tmp_ip == NULL) {
4380                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4381                         return -1;
4382                 }
4383
4384                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4385                         continue;
4386                 }
4387
4388                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4389                         DEBUG(DEBUG_ERR,
4390                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4391                                pnn,
4392                                ctdb_addr_to_str(&ips->ips[i].addr),
4393                                ips->ips[i].pnn, tmp_ip->pnn));
4394                         return -1;
4395                 }
4396         }
4397
4398         return 0;
4399 }
4400
4401 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4402 {
4403         struct ctdb_public_ip_list *tmp_ip; 
4404
4405         if (ctdb->ip_tree == NULL) {
4406                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4407                 return -1;
4408         }
4409
4410         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4411         if (tmp_ip == NULL) {
4412                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4413                 return -1;
4414         }
4415
4416         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4417         tmp_ip->pnn = ip->pnn;
4418
4419         return 0;
4420 }
4421
4422
/*
  Bookkeeping for one "reload public ips" run: the forked child, the pipe
  it reports on, and the control request that is answered when the handle
  is destroyed.
 */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;	/* owning daemon context */
	struct ctdb_req_control *c;	/* request to answer; NULL once answered */
	int status;			/* status sent in the reply */
	int fd[2];			/* pipe: [0] read by parent, [1] written by child */
	pid_t child;			/* pid of the forked helper */
	struct fd_event *fde;		/* read event on fd[0] */
};
4431
4432 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4433 {
4434         if (h == h->ctdb->reload_ips) {
4435                 h->ctdb->reload_ips = NULL;
4436         }
4437         if (h->c != NULL) {
4438                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4439                 h->c = NULL;
4440         }
4441         ctdb_kill(h->ctdb, h->child, SIGKILL);
4442         return 0;
4443 }
4444
4445 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4446                                 struct timed_event *te,
4447                                 struct timeval t, void *private_data)
4448 {
4449         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4450
4451         talloc_free(h);
4452 }       
4453
4454 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4455                              uint16_t flags, void *private_data)
4456 {
4457         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4458
4459         char res;
4460         int ret;
4461
4462         ret = sys_read(h->fd[0], &res, 1);
4463         if (ret < 1 || res != 0) {
4464                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4465                 res = 1;
4466         }
4467         h->status = res;
4468
4469         talloc_free(h);
4470 }
4471
4472 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4473 {
4474         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4475         struct ctdb_all_public_ips *ips;
4476         struct ctdb_vnn *vnn;
4477         struct client_async_data *async_data;
4478         struct timeval timeout;
4479         TDB_DATA data;
4480         struct ctdb_client_control_state *state;
4481         bool first_add;
4482         int i, ret;
4483
4484         CTDB_NO_MEMORY(ctdb, mem_ctx);
4485
4486         /* Read IPs from local node */
4487         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4488                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4489         if (ret != 0) {
4490                 DEBUG(DEBUG_ERR,
4491                       ("Unable to fetch public IPs from local node\n"));
4492                 talloc_free(mem_ctx);
4493                 return -1;
4494         }
4495
4496         /* Read IPs file - this is safe since this is a child process */
4497         ctdb->vnn = NULL;
4498         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4499                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4500                 talloc_free(mem_ctx);
4501                 return -1;
4502         }
4503
4504         async_data = talloc_zero(mem_ctx, struct client_async_data);
4505         CTDB_NO_MEMORY(ctdb, async_data);
4506
4507         /* Compare IPs between node and file for IPs to be deleted */
4508         for (i = 0; i < ips->num; i++) {
4509                 /* */
4510                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4511                         if (ctdb_same_ip(&vnn->public_address,
4512                                          &ips->ips[i].addr)) {
4513                                 /* IP is still in file */
4514                                 break;
4515                         }
4516                 }
4517
4518                 if (vnn == NULL) {
4519                         /* Delete IP ips->ips[i] */
4520                         struct ctdb_control_ip_iface *pub;
4521
4522                         DEBUG(DEBUG_NOTICE,
4523                               ("IP %s no longer configured, deleting it\n",
4524                                ctdb_addr_to_str(&ips->ips[i].addr)));
4525
4526                         pub = talloc_zero(mem_ctx,
4527                                           struct ctdb_control_ip_iface);
4528                         CTDB_NO_MEMORY(ctdb, pub);
4529
4530                         pub->addr  = ips->ips[i].addr;
4531                         pub->mask  = 0;
4532                         pub->len   = 0;
4533
4534                         timeout = TAKEOVER_TIMEOUT();
4535
4536                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4537                                               iface) + pub->len;
4538                         data.dptr = (uint8_t *)pub;
4539
4540                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4541                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4542                                                   0, data, async_data,
4543                                                   &timeout, NULL);
4544                         if (state == NULL) {
4545                                 DEBUG(DEBUG_ERR,
4546                                       (__location__
4547                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4548                                 goto failed;
4549                         }
4550
4551                         ctdb_client_async_add(async_data, state);
4552                 }
4553         }
4554
4555         /* Compare IPs between node and file for IPs to be added */
4556         first_add = true;
4557         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4558                 for (i = 0; i < ips->num; i++) {
4559                         if (ctdb_same_ip(&vnn->public_address,
4560                                          &ips->ips[i].addr)) {
4561                                 /* IP already on node */
4562                                 break;
4563                         }
4564                 }
4565                 if (i == ips->num) {
4566                         /* Add IP ips->ips[i] */
4567                         struct ctdb_control_ip_iface *pub;
4568                         const char *ifaces = NULL;
4569                         uint32_t len;
4570                         int iface = 0;
4571
4572                         DEBUG(DEBUG_NOTICE,
4573                               ("New IP %s configured, adding it\n",
4574                                ctdb_addr_to_str(&vnn->public_address)));
4575                         if (first_add) {
4576                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4577
4578                                 data.dsize = sizeof(pnn);
4579                                 data.dptr  = (uint8_t *)&pnn;
4580
4581                                 ret = ctdb_client_send_message(
4582                                         ctdb,
4583                                         CTDB_BROADCAST_CONNECTED,
4584                                         CTDB_SRVID_REBALANCE_NODE,
4585                                         data);
4586                                 if (ret != 0) {
4587                                         DEBUG(DEBUG_WARNING,
4588                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4589                                 }
4590
4591                                 first_add = false;
4592                         }
4593
4594                         ifaces = vnn->ifaces[0];
4595                         iface = 1;
4596                         while (vnn->ifaces[iface] != NULL) {
4597                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4598                                                          vnn->ifaces[iface]);
4599                                 iface++;
4600                         }
4601
4602                         len   = strlen(ifaces) + 1;
4603                         pub = talloc_zero_size(mem_ctx,
4604                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4605                         CTDB_NO_MEMORY(ctdb, pub);
4606
4607                         pub->addr  = vnn->public_address;
4608                         pub->mask  = vnn->public_netmask_bits;
4609                         pub->len   = len;
4610                         memcpy(&pub->iface[0], ifaces, pub->len);
4611
4612                         timeout = TAKEOVER_TIMEOUT();
4613
4614                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4615                                               iface) + pub->len;
4616                         data.dptr = (uint8_t *)pub;
4617
4618                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4619                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4620                                                   0, data, async_data,
4621                                                   &timeout, NULL);
4622                         if (state == NULL) {
4623                                 DEBUG(DEBUG_ERR,
4624                                       (__location__
4625                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4626                                 goto failed;
4627                         }
4628
4629                         ctdb_client_async_add(async_data, state);
4630                 }
4631         }
4632
4633         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4634                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4635                 goto failed;
4636         }
4637
4638         talloc_free(mem_ctx);
4639         return 0;
4640
4641 failed:
4642         talloc_free(mem_ctx);
4643         return -1;
4644 }
4645
4646 /* This control is sent to force the node to re-read the public addresses file
4647    and drop any addresses we should nnot longer host, and add new addresses
4648    that we are now able to host
4649 */
4650 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4651 {
4652         struct ctdb_reloadips_handle *h;
4653         pid_t parent = getpid();
4654
4655         if (ctdb->reload_ips != NULL) {
4656                 talloc_free(ctdb->reload_ips);
4657                 ctdb->reload_ips = NULL;
4658         }
4659
4660         h = talloc(ctdb, struct ctdb_reloadips_handle);
4661         CTDB_NO_MEMORY(ctdb, h);
4662         h->ctdb     = ctdb;
4663         h->c        = NULL;
4664         h->status   = -1;
4665         
4666         if (pipe(h->fd) == -1) {
4667                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4668                 talloc_free(h);
4669                 return -1;
4670         }
4671
4672         h->child = ctdb_fork(ctdb);
4673         if (h->child == (pid_t)-1) {
4674                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4675                 close(h->fd[0]);
4676                 close(h->fd[1]);
4677                 talloc_free(h);
4678                 return -1;
4679         }
4680
4681         /* child process */
4682         if (h->child == 0) {
4683                 signed char res = 0;
4684
4685                 close(h->fd[0]);
4686                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4687
4688                 ctdb_set_process_name("ctdb_reloadips");
4689                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4690                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4691                         res = -1;
4692                 } else {
4693                         res = ctdb_reloadips_child(ctdb);
4694                         if (res != 0) {
4695                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4696                         }
4697                 }
4698
4699                 sys_write(h->fd[1], &res, 1);
4700                 /* make sure we die when our parent dies */
4701                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4702                         sleep(5);
4703                 }
4704                 _exit(0);
4705         }
4706
4707         h->c             = talloc_steal(h, c);
4708
4709         close(h->fd[1]);
4710         set_close_on_exec(h->fd[0]);
4711
4712         talloc_set_destructor(h, ctdb_reloadips_destructor);
4713
4714
4715         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4716                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4717                         (void *)h);
4718         tevent_fd_set_auto_close(h->fde);
4719
4720         event_add_timed(ctdb->ev, h,
4721                         timeval_current_ofs(120, 0),
4722                         ctdb_reloadips_timeout_event, h);
4723
4724         /* we reply later */
4725         *async_reply = true;
4726         return 0;
4727 }