ctdb-daemon: Remove older data structure that supports only IPv4 addresses
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
32
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT   3
35
36 /*
 * Flags used in IP allocation algorithms.
 *
 * NOTE(review): the code that reads these fields is outside this
 * excerpt; the per-field notes are inferred from the names only and
 * should be confirmed against the allocation code.
 */
37 struct ctdb_ipflags {
38         bool noiptakeover; /* presumably: node may not take over addresses - TODO confirm */
39         bool noiphost; /* presumably: node may not host addresses at all - TODO confirm */
40         enum ctdb_runstate runstate;
41 };
42
/*
 * One local network interface known to the daemon.  Instances are
 * linked into the list headed by ctdb->ifaces.
 */
43 struct ctdb_iface {
44         struct ctdb_iface *prev, *next; /* list linkage, maintained by DLIST_ADD/DLIST_REMOVE */
45         const char *name; /* interface name, duplicated as a talloc child of this struct */
46         bool link_up; /* only interfaces with link_up set are chosen for addresses */
47         uint32_t references; /* count of public addresses currently assigned to this interface */
48 };
49
50 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
51 {
52         if (vnn->iface) {
53                 return vnn->iface->name;
54         }
55
56         return "__none__";
57 }
58
59 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
60 {
61         struct ctdb_iface *i;
62
63         /* Verify that we dont have an entry for this ip yet */
64         for (i=ctdb->ifaces;i;i=i->next) {
65                 if (strcmp(i->name, iface) == 0) {
66                         return 0;
67                 }
68         }
69
70         /* create a new structure for this interface */
71         i = talloc_zero(ctdb, struct ctdb_iface);
72         CTDB_NO_MEMORY_FATAL(ctdb, i);
73         i->name = talloc_strdup(i, iface);
74         CTDB_NO_MEMORY(ctdb, i->name);
75         /*
76          * If link_up defaults to true then IPs can be allocated to a
77          * node during the first recovery.  However, then an interface
78          * could have its link marked down during the startup event,
79          * causing the IP to move almost immediately.  If link_up
80          * defaults to false then, during normal operation, IPs added
81          * to a new interface can't be assigned until a monitor cycle
82          * has occurred and marked the new interfaces up.  This makes
83          * IP allocation unpredictable.  The following is a neat
84          * compromise: early in startup link_up defaults to false, so
85          * IPs can't be assigned, and after startup IPs can be
86          * assigned immediately.
87          */
88         i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
89
90         DLIST_ADD(ctdb->ifaces, i);
91
92         return 0;
93 }
94
95 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
96                                         const char *name)
97 {
98         int n;
99
100         for (n = 0; vnn->ifaces[n] != NULL; n++) {
101                 if (strcmp(name, vnn->ifaces[n]) == 0) {
102                         return true;
103                 }
104         }
105
106         return false;
107 }
108
109 /* If any interfaces now have no possible IPs then delete them.  This
110  * implementation is naive (i.e. simple) rather than clever
111  * (i.e. complex).  Given that this is run on delip and that operation
112  * is rare, this doesn't need to be efficient - it needs to be
113  * foolproof.  One alternative is reference counting, where the logic
114  * is distributed and can, therefore, be broken in multiple places.
115  * Another alternative is to build a red-black tree of interfaces that
116  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
117  * once) and then walking ctdb->ifaces once and deleting those not in
118  * the tree.  Let's go to one of those if the naive implementation
119  * causes problems...  :-)
120  */
121 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
122                                         struct ctdb_vnn *vnn)
123 {
124         struct ctdb_iface *i, *next;
125
126         /* For each interface, check if there's an IP using it. */
127         for (i = ctdb->ifaces; i != NULL; i = next) {
128                 struct ctdb_vnn *tv;
129                 bool found;
130                 next = i->next;
131
132                 /* Only consider interfaces named in the given VNN. */
133                 if (!vnn_has_interface_with_name(vnn, i->name)) {
134                         continue;
135                 }
136
137                 /* Is the "single IP" on this interface? */
138                 if ((ctdb->single_ip_vnn != NULL) &&
139                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
140                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
141                         /* Found, next interface please... */
142                         continue;
143                 }
144                 /* Search for a vnn with this interface. */
145                 found = false;
146                 for (tv=ctdb->vnn; tv; tv=tv->next) {
147                         if (vnn_has_interface_with_name(tv, i->name)) {
148                                 found = true;
149                                 break;
150                         }
151                 }
152
153                 if (!found) {
154                         /* None of the VNNs are using this interface. */
155                         DLIST_REMOVE(ctdb->ifaces, i);
156                         talloc_free(i);
157                 }
158         }
159 }
160
161
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163                                           const char *iface)
164 {
165         struct ctdb_iface *i;
166
167         for (i=ctdb->ifaces;i;i=i->next) {
168                 if (strcmp(i->name, iface) == 0) {
169                         return i;
170                 }
171         }
172
173         return NULL;
174 }
175
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177                                               struct ctdb_vnn *vnn)
178 {
179         int i;
180         struct ctdb_iface *cur = NULL;
181         struct ctdb_iface *best = NULL;
182
183         for (i=0; vnn->ifaces[i]; i++) {
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (!cur->link_up) {
191                         continue;
192                 }
193
194                 if (best == NULL) {
195                         best = cur;
196                         continue;
197                 }
198
199                 if (cur->references < best->references) {
200                         best = cur;
201                         continue;
202                 }
203         }
204
205         return best;
206 }
207
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209                                      struct ctdb_vnn *vnn)
210 {
211         struct ctdb_iface *best = NULL;
212
213         if (vnn->iface) {
214                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215                                    "still assigned to iface '%s'\n",
216                                    ctdb_addr_to_str(&vnn->public_address),
217                                    ctdb_vnn_iface_string(vnn)));
218                 return 0;
219         }
220
221         best = ctdb_vnn_best_iface(ctdb, vnn);
222         if (best == NULL) {
223                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224                                   "cannot assign to iface any iface\n",
225                                   ctdb_addr_to_str(&vnn->public_address)));
226                 return -1;
227         }
228
229         vnn->iface = best;
230         best->references++;
231         vnn->pnn = ctdb->pnn;
232
233         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234                            "now assigned to iface '%s' refs[%d]\n",
235                            ctdb_addr_to_str(&vnn->public_address),
236                            ctdb_vnn_iface_string(vnn),
237                            best->references));
238         return 0;
239 }
240
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242                                     struct ctdb_vnn *vnn)
243 {
244         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245                            "now unassigned (old iface '%s' refs[%d])\n",
246                            ctdb_addr_to_str(&vnn->public_address),
247                            ctdb_vnn_iface_string(vnn),
248                            vnn->iface?vnn->iface->references:0));
249         if (vnn->iface) {
250                 vnn->iface->references--;
251         }
252         vnn->iface = NULL;
253         if (vnn->pnn == ctdb->pnn) {
254                 vnn->pnn = -1;
255         }
256 }
257
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259                                struct ctdb_vnn *vnn)
260 {
261         int i;
262
263         if (vnn->delete_pending) {
264                 return false;
265         }
266
267         if (vnn->iface && vnn->iface->link_up) {
268                 return true;
269         }
270
271         for (i=0; vnn->ifaces[i]; i++) {
272                 struct ctdb_iface *cur;
273
274                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
275                 if (cur == NULL) {
276                         continue;
277                 }
278
279                 if (cur->link_up) {
280                         return true;
281                 }
282         }
283
284         return false;
285 }
286
/*
 * State of one round of address announcements for a public address:
 * repeated gratuitous ARPs plus TCP tickle ACKs, driven by the timed
 * event handler ctdb_control_send_arp().
 */
287 struct ctdb_takeover_arp {
288         struct ctdb_context *ctdb; /* daemon context; supplies the event context for re-arming */
289         uint32_t count; /* announcements sent so far; stops at CTDB_ARP_REPEAT */
290         ctdb_sock_addr addr; /* the public address being announced */
291         struct ctdb_tcp_array *tcparray; /* connections to tickle; taken over from vnn->tcp_array, may be NULL */
292         struct ctdb_vnn *vnn; /* the public address record this announcement belongs to */
293 };
294
295
296 /*
297   lists of tcp endpoints
    (doubly-linked via prev/next; the users of this list are outside
    this excerpt)
298  */
299 struct ctdb_tcp_list {
300         struct ctdb_tcp_list *prev, *next; /* list linkage */
301         struct ctdb_tcp_connection connection; /* the endpoint pair held by this node */
302 };
303
304 /*
305   list of clients to kill on IP release
    (entries on ctdb->client_ip_list are scanned by
    release_kill_clients())
306  */
307 struct ctdb_client_ip {
308         struct ctdb_client_ip *prev, *next; /* list linkage */
309         struct ctdb_context *ctdb;
310         ctdb_sock_addr addr; /* public address the client registered with */
311         uint32_t client_id; /* id resolved to a struct ctdb_client via ctdb_reqid_find() */
312 };
313
314
315 /*
316   send a gratuitous arp
317  */
318 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
319                                   struct timeval t, void *private_data)
320 {
321         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
322                                                         struct ctdb_takeover_arp);
323         int i, ret;
324         struct ctdb_tcp_array *tcparray;
325         const char *iface = ctdb_vnn_iface_string(arp->vnn);
326
327         ret = ctdb_sys_send_arp(&arp->addr, iface);
328         if (ret != 0) {
329                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
330                                   iface, strerror(errno)));
331         }
332
333         tcparray = arp->tcparray;
334         if (tcparray) {
335                 for (i=0;i<tcparray->num;i++) {
336                         struct ctdb_tcp_connection *tcon;
337
338                         tcon = &tcparray->connections[i];
339                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
340                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
341                                 ctdb_addr_to_str(&tcon->src_addr),
342                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
343                         ret = ctdb_sys_send_tcp(
344                                 &tcon->src_addr, 
345                                 &tcon->dst_addr,
346                                 0, 0, 0);
347                         if (ret != 0) {
348                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
349                                         ctdb_addr_to_str(&tcon->src_addr)));
350                         }
351                 }
352         }
353
354         arp->count++;
355
356         if (arp->count == CTDB_ARP_REPEAT) {
357                 talloc_free(arp);
358                 return;
359         }
360
361         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
362                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
363                         ctdb_control_send_arp, arp);
364 }
365
366 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
367                                        struct ctdb_vnn *vnn)
368 {
369         struct ctdb_takeover_arp *arp;
370         struct ctdb_tcp_array *tcparray;
371
372         if (!vnn->takeover_ctx) {
373                 vnn->takeover_ctx = talloc_new(vnn);
374                 if (!vnn->takeover_ctx) {
375                         return -1;
376                 }
377         }
378
379         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
380         if (!arp) {
381                 return -1;
382         }
383
384         arp->ctdb = ctdb;
385         arp->addr = vnn->public_address;
386         arp->vnn  = vnn;
387
388         tcparray = vnn->tcp_array;
389         if (tcparray) {
390                 /* add all of the known tcp connections for this IP to the
391                    list of tcp connections to send tickle acks for */
392                 arp->tcparray = talloc_steal(arp, tcparray);
393
394                 vnn->tcp_array = NULL;
395                 vnn->tcp_update_needed = true;
396         }
397
398         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
399                         timeval_zero(), ctdb_control_send_arp, arp);
400
401         return 0;
402 }
403
/*
 * Per-request state handed to an event-script callback.
 * NOTE(review): no user of this struct is visible in this excerpt;
 * presumably it belongs to the release-IP path - confirm.
 */
404 struct takeover_callback_state {
405         struct ctdb_req_control *c;
406         ctdb_sock_addr *addr;
407         struct ctdb_vnn *vnn;
408 };
409
/*
 * State carried from ctdb_do_takeip() to ctdb_do_takeip_callback().
 * Freeing it clears vnn->update_in_flight (see ctdb_takeip_destructor).
 */
410 struct ctdb_do_takeip_state {
411         struct ctdb_req_control *c; /* control request to answer once the event finishes */
412         struct ctdb_vnn *vnn; /* public address being taken over */
413 };
414
415 /*
416   called when takeip event finishes
417  */
418 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
419                                     void *private_data)
420 {
421         struct ctdb_do_takeip_state *state =
422                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
423         int32_t ret;
424         TDB_DATA data;
425
426         if (status != 0) {
427                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
428         
429                 if (status == -ETIME) {
430                         ctdb_ban_self(ctdb);
431                 }
432                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
433                                  ctdb_addr_to_str(&state->vnn->public_address),
434                                  ctdb_vnn_iface_string(state->vnn)));
435                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
436
437                 node->flags |= NODE_FLAGS_UNHEALTHY;
438                 talloc_free(state);
439                 return;
440         }
441
442         if (ctdb->do_checkpublicip) {
443
444         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
445         if (ret != 0) {
446                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
447                 talloc_free(state);
448                 return;
449         }
450
451         }
452
453         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
454         data.dsize = strlen((char *)data.dptr) + 1;
455         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
456
457         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
458
459
460         /* the control succeeded */
461         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
462         talloc_free(state);
463         return;
464 }
465
466 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
467 {
468         state->vnn->update_in_flight = false;
469         return 0;
470 }
471
472 /*
473   take over an ip address
474  */
475 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
476                               struct ctdb_req_control *c,
477                               struct ctdb_vnn *vnn)
478 {
479         int ret;
480         struct ctdb_do_takeip_state *state;
481
482         if (vnn->update_in_flight) {
483                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
484                                     "update for this IP already in flight\n",
485                                     ctdb_addr_to_str(&vnn->public_address),
486                                     vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         ret = ctdb_vnn_assign_iface(ctdb, vnn);
491         if (ret != 0) {
492                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
493                                  "assign a usable interface\n",
494                                  ctdb_addr_to_str(&vnn->public_address),
495                                  vnn->public_netmask_bits));
496                 return -1;
497         }
498
499         state = talloc(vnn, struct ctdb_do_takeip_state);
500         CTDB_NO_MEMORY(ctdb, state);
501
502         state->c = talloc_steal(ctdb, c);
503         state->vnn   = vnn;
504
505         vnn->update_in_flight = true;
506         talloc_set_destructor(state, ctdb_takeip_destructor);
507
508         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
509                             ctdb_addr_to_str(&vnn->public_address),
510                             vnn->public_netmask_bits,
511                             ctdb_vnn_iface_string(vnn)));
512
513         ret = ctdb_event_script_callback(ctdb,
514                                          state,
515                                          ctdb_do_takeip_callback,
516                                          state,
517                                          CTDB_EVENT_TAKE_IP,
518                                          "%s %s %u",
519                                          ctdb_vnn_iface_string(vnn),
520                                          ctdb_addr_to_str(&vnn->public_address),
521                                          vnn->public_netmask_bits);
522
523         if (ret != 0) {
524                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
525                         ctdb_addr_to_str(&vnn->public_address),
526                         ctdb_vnn_iface_string(vnn)));
527                 talloc_free(state);
528                 return -1;
529         }
530
531         return 0;
532 }
533
/*
 * State carried from ctdb_do_updateip() to ctdb_do_updateip_callback().
 * Freeing it clears vnn->update_in_flight (see ctdb_updateip_destructor).
 */
534 struct ctdb_do_updateip_state {
535         struct ctdb_req_control *c; /* control request to answer once the event finishes */
536         struct ctdb_iface *old; /* interface the address was on before the move; restored on failure */
537         struct ctdb_vnn *vnn; /* public address being moved */
538 };
539
540 /*
541   called when updateip event finishes
542  */
543 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
544                                       void *private_data)
545 {
546         struct ctdb_do_updateip_state *state =
547                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
548         int32_t ret;
549
550         if (status != 0) {
551                 if (status == -ETIME) {
552                         ctdb_ban_self(ctdb);
553                 }
554                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
555                         ctdb_addr_to_str(&state->vnn->public_address),
556                         state->old->name,
557                         ctdb_vnn_iface_string(state->vnn)));
558
559                 /*
560                  * All we can do is reset the old interface
561                  * and let the next run fix it
562                  */
563                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
564                 state->vnn->iface = state->old;
565                 state->vnn->iface->references++;
566
567                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
568                 talloc_free(state);
569                 return;
570         }
571
572         if (ctdb->do_checkpublicip) {
573
574         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
575         if (ret != 0) {
576                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
577                 talloc_free(state);
578                 return;
579         }
580
581         }
582
583         /* the control succeeded */
584         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
585         talloc_free(state);
586         return;
587 }
588
589 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
590 {
591         state->vnn->update_in_flight = false;
592         return 0;
593 }
594
595 /*
596   update (move) an ip address
597  */
598 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
599                                 struct ctdb_req_control *c,
600                                 struct ctdb_vnn *vnn)
601 {
602         int ret;
603         struct ctdb_do_updateip_state *state;
604         struct ctdb_iface *old = vnn->iface;
605         const char *new_name;
606
607         if (vnn->update_in_flight) {
608                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
609                                     "update for this IP already in flight\n",
610                                     ctdb_addr_to_str(&vnn->public_address),
611                                     vnn->public_netmask_bits));
612                 return -1;
613         }
614
615         ctdb_vnn_unassign_iface(ctdb, vnn);
616         ret = ctdb_vnn_assign_iface(ctdb, vnn);
617         if (ret != 0) {
618                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
619                                  "assin a usable interface (old iface '%s')\n",
620                                  ctdb_addr_to_str(&vnn->public_address),
621                                  vnn->public_netmask_bits,
622                                  old->name));
623                 return -1;
624         }
625
626         new_name = ctdb_vnn_iface_string(vnn);
627         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
628                 /* A benign update from one interface onto itself.
629                  * no need to run the eventscripts in this case, just return
630                  * success.
631                  */
632                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
633                 return 0;
634         }
635
636         state = talloc(vnn, struct ctdb_do_updateip_state);
637         CTDB_NO_MEMORY(ctdb, state);
638
639         state->c = talloc_steal(ctdb, c);
640         state->old = old;
641         state->vnn = vnn;
642
643         vnn->update_in_flight = true;
644         talloc_set_destructor(state, ctdb_updateip_destructor);
645
646         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
647                             "interface %s to %s\n",
648                             ctdb_addr_to_str(&vnn->public_address),
649                             vnn->public_netmask_bits,
650                             old->name,
651                             new_name));
652
653         ret = ctdb_event_script_callback(ctdb,
654                                          state,
655                                          ctdb_do_updateip_callback,
656                                          state,
657                                          CTDB_EVENT_UPDATE_IP,
658                                          "%s %s %s %u",
659                                          state->old->name,
660                                          new_name,
661                                          ctdb_addr_to_str(&vnn->public_address),
662                                          vnn->public_netmask_bits);
663         if (ret != 0) {
664                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
665                                  ctdb_addr_to_str(&vnn->public_address),
666                                  old->name, new_name));
667                 talloc_free(state);
668                 return -1;
669         }
670
671         return 0;
672 }
673
674 /*
675   Find the vnn of the node that has a public ip address
676   returns -1 if the address is not known as a public address
677  */
678 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
679 {
680         struct ctdb_vnn *vnn;
681
682         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
683                 if (ctdb_same_ip(&vnn->public_address, addr)) {
684                         return vnn;
685                 }
686         }
687
688         return NULL;
689 }
690
691 /*
692   take over an ip address
693  */
694 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
695                                  struct ctdb_req_control *c,
696                                  TDB_DATA indata,
697                                  bool *async_reply)
698 {
699         int ret;
700         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
701         struct ctdb_vnn *vnn;
702         bool have_ip = false;
703         bool do_updateip = false;
704         bool do_takeip = false;
705         struct ctdb_iface *best_iface = NULL;
706
707         if (pip->pnn != ctdb->pnn) {
708                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
709                                  "with pnn %d, but we're node %d\n",
710                                  ctdb_addr_to_str(&pip->addr),
711                                  pip->pnn, ctdb->pnn));
712                 return -1;
713         }
714
715         /* update out vnn list */
716         vnn = find_public_ip_vnn(ctdb, &pip->addr);
717         if (vnn == NULL) {
718                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
719                         ctdb_addr_to_str(&pip->addr)));
720                 return 0;
721         }
722
723         if (ctdb->do_checkpublicip) {
724                 have_ip = ctdb_sys_have_ip(&pip->addr);
725         }
726         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
727         if (best_iface == NULL) {
728                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
729                                  "a usable interface (old %s, have_ip %d)\n",
730                                  ctdb_addr_to_str(&vnn->public_address),
731                                  vnn->public_netmask_bits,
732                                  ctdb_vnn_iface_string(vnn),
733                                  have_ip));
734                 return -1;
735         }
736
737         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
738                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
739                 have_ip = false;
740         }
741
742
743         if (vnn->iface == NULL && have_ip) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
746                                  ctdb_addr_to_str(&vnn->public_address)));
747                 return 0;
748         }
749
750         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
751                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752                                   "and we have it on iface[%s], but it was assigned to node %d"
753                                   "and we are node %d, banning ourself\n",
754                                  ctdb_addr_to_str(&vnn->public_address),
755                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
756                 ctdb_ban_self(ctdb);
757                 return -1;
758         }
759
760         if (vnn->pnn == -1 && have_ip) {
761                 vnn->pnn = ctdb->pnn;
762                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
763                                   "and we already have it on iface[%s], update local daemon\n",
764                                  ctdb_addr_to_str(&vnn->public_address),
765                                   ctdb_vnn_iface_string(vnn)));
766                 return 0;
767         }
768
769         if (vnn->iface) {
770                 if (vnn->iface != best_iface) {
771                         if (!vnn->iface->link_up) {
772                                 do_updateip = true;
773                         } else if (vnn->iface->references > (best_iface->references + 1)) {
774                                 /* only move when the rebalance gains something */
775                                         do_updateip = true;
776                         }
777                 }
778         }
779
780         if (!have_ip) {
781                 if (do_updateip) {
782                         ctdb_vnn_unassign_iface(ctdb, vnn);
783                         do_updateip = false;
784                 }
785                 do_takeip = true;
786         }
787
788         if (do_takeip) {
789                 ret = ctdb_do_takeip(ctdb, c, vnn);
790                 if (ret != 0) {
791                         return -1;
792                 }
793         } else if (do_updateip) {
794                 ret = ctdb_do_updateip(ctdb, c, vnn);
795                 if (ret != 0) {
796                         return -1;
797                 }
798         } else {
799                 /*
800                  * The interface is up and the kernel known the ip
801                  * => do nothing
802                  */
803                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
804                         ctdb_addr_to_str(&pip->addr),
805                         vnn->public_netmask_bits,
806                         ctdb_vnn_iface_string(vnn)));
807                 return 0;
808         }
809
810         /* tell ctdb_control.c that we will be replying asynchronously */
811         *async_reply = true;
812
813         return 0;
814 }
815
816 /*
817   takeover an ip address old v4 style
818  */
819 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
820                                 struct ctdb_req_control *c,
821                                 TDB_DATA indata, 
822                                 bool *async_reply)
823 {
824         TDB_DATA data;
825         
826         data.dsize = sizeof(struct ctdb_public_ip);
827         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
828         CTDB_NO_MEMORY(ctdb, data.dptr);
829         
830         memcpy(data.dptr, indata.dptr, indata.dsize);
831         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
832 }
833
834 /*
835   kill any clients that are registered with a IP that is being released
836  */
837 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
838 {
839         struct ctdb_client_ip *ip;
840
841         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
842                 ctdb_addr_to_str(addr)));
843
844         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
845                 ctdb_sock_addr tmp_addr;
846
847                 tmp_addr = ip->addr;
848                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
849                         ip->client_id,
850                         ctdb_addr_to_str(&ip->addr)));
851
852                 if (ctdb_same_ip(&tmp_addr, addr)) {
853                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
854                                                                      ip->client_id, 
855                                                                      struct ctdb_client);
856                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
857                                 ip->client_id,
858                                 ctdb_addr_to_str(&ip->addr),
859                                 client->pid));
860
861                         if (client->pid != 0) {
862                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
863                                         (unsigned)client->pid,
864                                         ctdb_addr_to_str(addr),
865                                         ip->client_id));
866                                 kill(client->pid, SIGKILL);
867                         }
868                 }
869         }
870 }
871
872 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
873 {
874         DLIST_REMOVE(ctdb->vnn, vnn);
875         ctdb_vnn_unassign_iface(ctdb, vnn);
876         ctdb_remove_orphaned_ifaces(ctdb, vnn);
877         talloc_free(vnn);
878 }
879
880 /*
881   called when releaseip event finishes
882  */
883 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
884                                 void *private_data)
885 {
886         struct takeover_callback_state *state = 
887                 talloc_get_type(private_data, struct takeover_callback_state);
888         TDB_DATA data;
889
890         if (status == -ETIME) {
891                 ctdb_ban_self(ctdb);
892         }
893
894         if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
895                 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
896                                   ctdb_addr_to_str(state->addr)));
897                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
898                 talloc_free(state);
899                 return;
900         }
901
902         /* send a message to all clients of this node telling them
903            that the cluster has been reconfigured and they should
904            release any sockets on this IP */
905         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
906         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
907         data.dsize = strlen((char *)data.dptr)+1;
908
909         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
910
911         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
912
913         /* kill clients that have registered with this IP */
914         release_kill_clients(ctdb, state->addr);
915
916         ctdb_vnn_unassign_iface(ctdb, state->vnn);
917
918         /* Process the IP if it has been marked for deletion */
919         if (state->vnn->delete_pending) {
920                 do_delete_ip(ctdb, state->vnn);
921                 state->vnn = NULL;
922         }
923
924         /* the control succeeded */
925         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
926         talloc_free(state);
927 }
928
929 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
930 {
931         if (state->vnn != NULL) {
932                 state->vnn->update_in_flight = false;
933         }
934         return 0;
935 }
936
937 /*
938   release an ip address
939  */
940 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
941                                 struct ctdb_req_control *c,
942                                 TDB_DATA indata, 
943                                 bool *async_reply)
944 {
945         int ret;
946         struct takeover_callback_state *state;
947         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
948         struct ctdb_vnn *vnn;
949         char *iface;
950
951         /* update our vnn list */
952         vnn = find_public_ip_vnn(ctdb, &pip->addr);
953         if (vnn == NULL) {
954                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
955                         ctdb_addr_to_str(&pip->addr)));
956                 return 0;
957         }
958         vnn->pnn = pip->pnn;
959
960         /* stop any previous arps */
961         talloc_free(vnn->takeover_ctx);
962         vnn->takeover_ctx = NULL;
963
964         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
965          * lazy multicast to drop an IP from any node that isn't the
966          * intended new node.  The following causes makes ctdbd ignore
967          * a release for any address it doesn't host.
968          */
969         if (ctdb->do_checkpublicip) {
970                 if (!ctdb_sys_have_ip(&pip->addr)) {
971                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
972                                 ctdb_addr_to_str(&pip->addr),
973                                 vnn->public_netmask_bits,
974                                 ctdb_vnn_iface_string(vnn)));
975                         ctdb_vnn_unassign_iface(ctdb, vnn);
976                         return 0;
977                 }
978         } else {
979                 if (vnn->iface == NULL) {
980                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
981                                            ctdb_addr_to_str(&pip->addr),
982                                            vnn->public_netmask_bits));
983                         return 0;
984                 }
985         }
986
987         /* There is a potential race between take_ip and us because we
988          * update the VNN via a callback that run when the
989          * eventscripts have been run.  Avoid the race by allowing one
990          * update to be in flight at a time.
991          */
992         if (vnn->update_in_flight) {
993                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
994                                     "update for this IP already in flight\n",
995                                     ctdb_addr_to_str(&vnn->public_address),
996                                     vnn->public_netmask_bits));
997                 return -1;
998         }
999
1000         iface = strdup(ctdb_vnn_iface_string(vnn));
1001
1002         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
1003                 ctdb_addr_to_str(&pip->addr),
1004                 vnn->public_netmask_bits,
1005                 iface,
1006                 pip->pnn));
1007
1008         state = talloc(ctdb, struct takeover_callback_state);
1009         if (state == NULL) {
1010                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1011                                __FILE__, __LINE__);
1012                 free(iface);
1013                 return -1;
1014         }
1015
1016         state->c = talloc_steal(state, c);
1017         state->addr = talloc(state, ctdb_sock_addr);       
1018         if (state->addr == NULL) {
1019                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1020                                __FILE__, __LINE__);
1021                 free(iface);
1022                 talloc_free(state);
1023                 return -1;
1024         }
1025         *state->addr = pip->addr;
1026         state->vnn   = vnn;
1027
1028         vnn->update_in_flight = true;
1029         talloc_set_destructor(state, ctdb_releaseip_destructor);
1030
1031         ret = ctdb_event_script_callback(ctdb, 
1032                                          state, release_ip_callback, state,
1033                                          CTDB_EVENT_RELEASE_IP,
1034                                          "%s %s %u",
1035                                          iface,
1036                                          ctdb_addr_to_str(&pip->addr),
1037                                          vnn->public_netmask_bits);
1038         free(iface);
1039         if (ret != 0) {
1040                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1041                         ctdb_addr_to_str(&pip->addr),
1042                         ctdb_vnn_iface_string(vnn)));
1043                 talloc_free(state);
1044                 return -1;
1045         }
1046
1047         /* tell the control that we will be reply asynchronously */
1048         *async_reply = true;
1049         return 0;
1050 }
1051
1052 /*
1053   release an ip address old v4 style
1054  */
1055 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1056                                 struct ctdb_req_control *c,
1057                                 TDB_DATA indata, 
1058                                 bool *async_reply)
1059 {
1060         TDB_DATA data;
1061         
1062         data.dsize = sizeof(struct ctdb_public_ip);
1063         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1064         CTDB_NO_MEMORY(ctdb, data.dptr);
1065         
1066         memcpy(data.dptr, indata.dptr, indata.dsize);
1067         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1068 }
1069
1070
1071 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1072                                    ctdb_sock_addr *addr,
1073                                    unsigned mask, const char *ifaces,
1074                                    bool check_address)
1075 {
1076         struct ctdb_vnn      *vnn;
1077         uint32_t num = 0;
1078         char *tmp;
1079         const char *iface;
1080         int i;
1081         int ret;
1082
1083         tmp = strdup(ifaces);
1084         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1085                 if (!ctdb_sys_check_iface_exists(iface)) {
1086                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1087                         free(tmp);
1088                         return -1;
1089                 }
1090         }
1091         free(tmp);
1092
1093         /* Verify that we dont have an entry for this ip yet */
1094         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1095                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1096                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1097                                 ctdb_addr_to_str(addr)));
1098                         return -1;
1099                 }               
1100         }
1101
1102         /* create a new vnn structure for this ip address */
1103         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1104         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1105         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1106         tmp = talloc_strdup(vnn, ifaces);
1107         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1108         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1109                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1110                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1111                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1112                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1113                 num++;
1114         }
1115         talloc_free(tmp);
1116         vnn->ifaces[num] = NULL;
1117         vnn->public_address      = *addr;
1118         vnn->public_netmask_bits = mask;
1119         vnn->pnn                 = -1;
1120         if (check_address) {
1121                 if (ctdb_sys_have_ip(addr)) {
1122                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1123                         vnn->pnn = ctdb->pnn;
1124                 }
1125         }
1126
1127         for (i=0; vnn->ifaces[i]; i++) {
1128                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1129                 if (ret != 0) {
1130                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1131                                            "for public_address[%s]\n",
1132                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1133                         talloc_free(vnn);
1134                         return -1;
1135                 }
1136         }
1137
1138         DLIST_ADD(ctdb->vnn, vnn);
1139
1140         return 0;
1141 }
1142
1143 /*
1144   setup the public address lists from a file
1145 */
1146 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1147 {
1148         char **lines;
1149         int nlines;
1150         int i;
1151
1152         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1153         if (lines == NULL) {
1154                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1155                 return -1;
1156         }
1157         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1158                 nlines--;
1159         }
1160
1161         for (i=0;i<nlines;i++) {
1162                 unsigned mask;
1163                 ctdb_sock_addr addr;
1164                 const char *addrstr;
1165                 const char *ifaces;
1166                 char *tok, *line;
1167
1168                 line = lines[i];
1169                 while ((*line == ' ') || (*line == '\t')) {
1170                         line++;
1171                 }
1172                 if (*line == '#') {
1173                         continue;
1174                 }
1175                 if (strcmp(line, "") == 0) {
1176                         continue;
1177                 }
1178                 tok = strtok(line, " \t");
1179                 addrstr = tok;
1180                 tok = strtok(NULL, " \t");
1181                 if (tok == NULL) {
1182                         if (NULL == ctdb->default_public_interface) {
1183                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1184                                          i+1));
1185                                 talloc_free(lines);
1186                                 return -1;
1187                         }
1188                         ifaces = ctdb->default_public_interface;
1189                 } else {
1190                         ifaces = tok;
1191                 }
1192
1193                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1194                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1195                         talloc_free(lines);
1196                         return -1;
1197                 }
1198                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1199                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1200                         talloc_free(lines);
1201                         return -1;
1202                 }
1203         }
1204
1205
1206         talloc_free(lines);
1207         return 0;
1208 }
1209
1210 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1211                               const char *iface,
1212                               const char *ip)
1213 {
1214         struct ctdb_vnn *svnn;
1215         struct ctdb_iface *cur = NULL;
1216         bool ok;
1217         int ret;
1218
1219         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1220         CTDB_NO_MEMORY(ctdb, svnn);
1221
1222         svnn->ifaces = talloc_array(svnn, const char *, 2);
1223         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1224         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1225         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1226         svnn->ifaces[1] = NULL;
1227
1228         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1229         if (!ok) {
1230                 talloc_free(svnn);
1231                 return -1;
1232         }
1233
1234         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1235         if (ret != 0) {
1236                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1237                                    "for single_ip[%s]\n",
1238                                    svnn->ifaces[0],
1239                                    ctdb_addr_to_str(&svnn->public_address)));
1240                 talloc_free(svnn);
1241                 return -1;
1242         }
1243
1244         /* assume the single public ip interface is initially "good" */
1245         cur = ctdb_find_iface(ctdb, iface);
1246         if (cur == NULL) {
1247                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1248                 return -1;
1249         }
1250         cur->link_up = true;
1251
1252         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1253         if (ret != 0) {
1254                 talloc_free(svnn);
1255                 return -1;
1256         }
1257
1258         ctdb->single_ip_vnn = svnn;
1259         return 0;
1260 }
1261
1262 struct ctdb_public_ip_list {
1263         struct ctdb_public_ip_list *next;
1264         uint32_t pnn;
1265         ctdb_sock_addr addr;
1266 };
1267
1268 /* Given a physical node, return the number of
1269    public addresses that is currently assigned to this node.
1270 */
1271 static int node_ip_coverage(struct ctdb_context *ctdb, 
1272         int32_t pnn,
1273         struct ctdb_public_ip_list *ips)
1274 {
1275         int num=0;
1276
1277         for (;ips;ips=ips->next) {
1278                 if (ips->pnn == pnn) {
1279                         num++;
1280                 }
1281         }
1282         return num;
1283 }
1284
1285
1286 /* Can the given node host the given IP: is the public IP known to the
1287  * node and is NOIPHOST unset?
1288 */
1289 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1290                              struct ctdb_ipflags ipflags,
1291                              struct ctdb_public_ip_list *ip)
1292 {
1293         struct ctdb_all_public_ips *public_ips;
1294         int i;
1295
1296         if (ipflags.noiphost) {
1297                 return false;
1298         }
1299
1300         public_ips = ctdb->nodes[pnn]->available_public_ips;
1301
1302         if (public_ips == NULL) {
1303                 return false;
1304         }
1305
1306         for (i=0; i<public_ips->num; i++) {
1307                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1308                         /* yes, this node can serve this public ip */
1309                         return true;
1310                 }
1311         }
1312
1313         return false;
1314 }
1315
1316 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1317                                  struct ctdb_ipflags ipflags,
1318                                  struct ctdb_public_ip_list *ip)
1319 {
1320         if (ipflags.noiptakeover) {
1321                 return false;
1322         }
1323
1324         return can_node_host_ip(ctdb, pnn, ipflags, ip);
1325 }
1326
1327 /* search the node lists list for a node to takeover this ip.
1328    pick the node that currently are serving the least number of ips
1329    so that the ips get spread out evenly.
1330 */
1331 static int find_takeover_node(struct ctdb_context *ctdb, 
1332                 struct ctdb_ipflags *ipflags,
1333                 struct ctdb_public_ip_list *ip,
1334                 struct ctdb_public_ip_list *all_ips)
1335 {
1336         int pnn, min=0, num;
1337         int i, numnodes;
1338
1339         numnodes = talloc_array_length(ipflags);
1340         pnn    = -1;
1341         for (i=0; i<numnodes; i++) {
1342                 /* verify that this node can serve this ip */
1343                 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1344                         /* no it couldnt   so skip to the next node */
1345                         continue;
1346                 }
1347
1348                 num = node_ip_coverage(ctdb, i, all_ips);
1349                 /* was this the first node we checked ? */
1350                 if (pnn == -1) {
1351                         pnn = i;
1352                         min  = num;
1353                 } else {
1354                         if (num < min) {
1355                                 pnn = i;
1356                                 min  = num;
1357                         }
1358                 }
1359         }       
1360         if (pnn == -1) {
1361                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1362                         ctdb_addr_to_str(&ip->addr)));
1363
1364                 return -1;
1365         }
1366
1367         ip->pnn = pnn;
1368         return 0;
1369 }
1370
1371 #define IP_KEYLEN       4
1372 static uint32_t *ip_key(ctdb_sock_addr *ip)
1373 {
1374         static uint32_t key[IP_KEYLEN];
1375
1376         bzero(key, sizeof(key));
1377
1378         switch (ip->sa.sa_family) {
1379         case AF_INET:
1380                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1381                 break;
1382         case AF_INET6: {
1383                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1384                 key[0]  = htonl(s6_a32[0]);
1385                 key[1]  = htonl(s6_a32[1]);
1386                 key[2]  = htonl(s6_a32[2]);
1387                 key[3]  = htonl(s6_a32[3]);
1388                 break;
1389         }
1390         default:
1391                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1392                 return key;
1393         }
1394
1395         return key;
1396 }
1397
1398 static void *add_ip_callback(void *parm, void *data)
1399 {
1400         struct ctdb_public_ip_list *this_ip = parm; 
1401         struct ctdb_public_ip_list *prev_ip = data; 
1402
1403         if (prev_ip == NULL) {
1404                 return parm;
1405         }
1406         if (this_ip->pnn == -1) {
1407                 this_ip->pnn = prev_ip->pnn;
1408         }
1409
1410         return parm;
1411 }
1412
1413 static int getips_count_callback(void *param, void *data)
1414 {
1415         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1416         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1417
1418         new_ip->next = *ip_list;
1419         *ip_list     = new_ip;
1420         return 0;
1421 }
1422
1423 static struct ctdb_public_ip_list *
1424 create_merged_ip_list(struct ctdb_context *ctdb)
1425 {
1426         int i, j;
1427         struct ctdb_public_ip_list *ip_list;
1428         struct ctdb_all_public_ips *public_ips;
1429
1430         if (ctdb->ip_tree != NULL) {
1431                 talloc_free(ctdb->ip_tree);
1432                 ctdb->ip_tree = NULL;
1433         }
1434         ctdb->ip_tree = trbt_create(ctdb, 0);
1435
1436         for (i=0;i<ctdb->num_nodes;i++) {
1437                 public_ips = ctdb->nodes[i]->known_public_ips;
1438
1439                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1440                         continue;
1441                 }
1442
1443                 /* there were no public ips for this node */
1444                 if (public_ips == NULL) {
1445                         continue;
1446                 }               
1447
1448                 for (j=0;j<public_ips->num;j++) {
1449                         struct ctdb_public_ip_list *tmp_ip; 
1450
1451                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1452                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1453                         /* Do not use information about IP addresses hosted
1454                          * on other nodes, it may not be accurate */
1455                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1456                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1457                         } else {
1458                                 tmp_ip->pnn = -1;
1459                         }
1460                         tmp_ip->addr = public_ips->ips[j].addr;
1461                         tmp_ip->next = NULL;
1462
1463                         trbt_insertarray32_callback(ctdb->ip_tree,
1464                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1465                                 add_ip_callback,
1466                                 tmp_ip);
1467                 }
1468         }
1469
1470         ip_list = NULL;
1471         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1472
1473         return ip_list;
1474 }
1475
1476 /* 
1477  * This is the length of the longtest common prefix between the IPs.
1478  * It is calculated by XOR-ing the 2 IPs together and counting the
1479  * number of leading zeroes.  The implementation means that all
1480  * addresses end up being 128 bits long.
1481  *
1482  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1483  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1484  * lots of nodes and IP addresses?
1485  */
1486 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1487 {
1488         uint32_t ip1_k[IP_KEYLEN];
1489         uint32_t *t;
1490         int i;
1491         uint32_t x;
1492
1493         uint32_t distance = 0;
1494
1495         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1496         t = ip_key(ip2);
1497         for (i=0; i<IP_KEYLEN; i++) {
1498                 x = ip1_k[i] ^ t[i];
1499                 if (x == 0) {
1500                         distance += 32;
1501                 } else {
1502                         /* Count number of leading zeroes. 
1503                          * FIXME? This could be optimised...
1504                          */
1505                         while ((x & (1 << 31)) == 0) {
1506                                 x <<= 1;
1507                                 distance += 1;
1508                         }
1509                 }
1510         }
1511
1512         return distance;
1513 }
1514
1515 /* Calculate the IP distance for the given IP relative to IPs on the
1516    given node.  The ips argument is generally the all_ips variable
1517    used in the main part of the algorithm.
1518  */
1519 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1520                                   struct ctdb_public_ip_list *ips,
1521                                   int pnn)
1522 {
1523         struct ctdb_public_ip_list *t;
1524         uint32_t d;
1525
1526         uint32_t sum = 0;
1527
1528         for (t=ips; t != NULL; t=t->next) {
1529                 if (t->pnn != pnn) {
1530                         continue;
1531                 }
1532
1533                 /* Optimisation: We never calculate the distance
1534                  * between an address and itself.  This allows us to
1535                  * calculate the effect of removing an address from a
1536                  * node by simply calculating the distance between
1537                  * that address and all of the exitsing addresses.
1538                  * Moreover, we assume that we're only ever dealing
1539                  * with addresses from all_ips so we can identify an
1540                  * address via a pointer rather than doing a more
1541                  * expensive address comparison. */
1542                 if (&(t->addr) == ip) {
1543                         continue;
1544                 }
1545
1546                 d = ip_distance(ip, &(t->addr));
1547                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1548         }
1549
1550         return sum;
1551 }
1552
1553 /* Return the LCP2 imbalance metric for addresses currently assigned
1554    to the given node.
1555  */
1556 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1557 {
1558         struct ctdb_public_ip_list *t;
1559
1560         uint32_t imbalance = 0;
1561
1562         for (t=all_ips; t!=NULL; t=t->next) {
1563                 if (t->pnn != pnn) {
1564                         continue;
1565                 }
1566                 /* Pass the rest of the IPs rather than the whole
1567                    all_ips input list.
1568                 */
1569                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1570         }
1571
1572         return imbalance;
1573 }
1574
1575 /* Allocate any unassigned IPs just by looping through the IPs and
1576  * finding the best node for each.
1577  */
1578 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1579                                       struct ctdb_ipflags *ipflags,
1580                                       struct ctdb_public_ip_list *all_ips)
1581 {
1582         struct ctdb_public_ip_list *tmp_ip;
1583
1584         /* loop over all ip's and find a physical node to cover for 
1585            each unassigned ip.
1586         */
1587         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1588                 if (tmp_ip->pnn == -1) {
1589                         if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1590                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1591                                         ctdb_addr_to_str(&tmp_ip->addr)));
1592                         }
1593                 }
1594         }
1595 }
1596
1597 /* Basic non-deterministic rebalancing algorithm.
1598  */
1599 static void basic_failback(struct ctdb_context *ctdb,
1600                            struct ctdb_ipflags *ipflags,
1601                            struct ctdb_public_ip_list *all_ips,
1602                            int num_ips)
1603 {
1604         int i, numnodes;
1605         int maxnode, maxnum, minnode, minnum, num, retries;
1606         struct ctdb_public_ip_list *tmp_ip;
1607
1608         numnodes = talloc_array_length(ipflags);
1609         retries = 0;
1610
1611 try_again:
1612         maxnum=0;
1613         minnum=0;
1614
1615         /* for each ip address, loop over all nodes that can serve
1616            this ip and make sure that the difference between the node
1617            serving the most and the node serving the least ip's are
1618            not greater than 1.
1619         */
1620         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1621                 if (tmp_ip->pnn == -1) {
1622                         continue;
1623                 }
1624
1625                 /* Get the highest and lowest number of ips's served by any 
1626                    valid node which can serve this ip.
1627                 */
1628                 maxnode = -1;
1629                 minnode = -1;
1630                 for (i=0; i<numnodes; i++) {
1631                         /* only check nodes that can actually serve this ip */
1632                         if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1633                                 /* no it couldnt   so skip to the next node */
1634                                 continue;
1635                         }
1636
1637                         num = node_ip_coverage(ctdb, i, all_ips);
1638                         if (maxnode == -1) {
1639                                 maxnode = i;
1640                                 maxnum  = num;
1641                         } else {
1642                                 if (num > maxnum) {
1643                                         maxnode = i;
1644                                         maxnum  = num;
1645                                 }
1646                         }
1647                         if (minnode == -1) {
1648                                 minnode = i;
1649                                 minnum  = num;
1650                         } else {
1651                                 if (num < minnum) {
1652                                         minnode = i;
1653                                         minnum  = num;
1654                                 }
1655                         }
1656                 }
1657                 if (maxnode == -1) {
1658                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1659                                 ctdb_addr_to_str(&tmp_ip->addr)));
1660
1661                         continue;
1662                 }
1663
1664                 /* if the spread between the smallest and largest coverage by
1665                    a node is >=2 we steal one of the ips from the node with
1666                    most coverage to even things out a bit.
1667                    try to do this a limited number of times since we dont
1668                    want to spend too much time balancing the ip coverage.
1669                 */
1670                 if ( (maxnum > minnum+1)
1671                      && (retries < (num_ips + 5)) ){
1672                         struct ctdb_public_ip_list *tmp;
1673
1674                         /* Reassign one of maxnode's VNNs */
1675                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1676                                 if (tmp->pnn == maxnode) {
1677                                         (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1678                                         retries++;
1679                                         goto try_again;;
1680                                 }
1681                         }
1682                 }
1683         }
1684 }
1685
1686 static void lcp2_init(struct ctdb_context *tmp_ctx,
1687                       struct ctdb_ipflags *ipflags,
1688                       struct ctdb_public_ip_list *all_ips,
1689                       uint32_t *force_rebalance_nodes,
1690                       uint32_t **lcp2_imbalances,
1691                       bool **rebalance_candidates)
1692 {
1693         int i, numnodes;
1694         struct ctdb_public_ip_list *tmp_ip;
1695
1696         numnodes = talloc_array_length(ipflags);
1697
1698         *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1699         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1700         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1701         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1702
1703         for (i=0; i<numnodes; i++) {
1704                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1705                 /* First step: assume all nodes are candidates */
1706                 (*rebalance_candidates)[i] = true;
1707         }
1708
1709         /* 2nd step: if a node has IPs assigned then it must have been
1710          * healthy before, so we remove it from consideration.  This
1711          * is overkill but is all we have because we don't maintain
1712          * state between takeover runs.  An alternative would be to
1713          * keep state and invalidate it every time the recovery master
1714          * changes.
1715          */
1716         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1717                 if (tmp_ip->pnn != -1) {
1718                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1719                 }
1720         }
1721
1722         /* 3rd step: if a node is forced to re-balance then
1723            we allow failback onto the node */
1724         if (force_rebalance_nodes == NULL) {
1725                 return;
1726         }
1727         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1728                 uint32_t pnn = force_rebalance_nodes[i];
1729                 if (pnn >= numnodes) {
1730                         DEBUG(DEBUG_ERR,
1731                               (__location__ "unknown node %u\n", pnn));
1732                         continue;
1733                 }
1734
1735                 DEBUG(DEBUG_NOTICE,
1736                       ("Forcing rebalancing of IPs to node %u\n", pnn));
1737                 (*rebalance_candidates)[pnn] = true;
1738         }
1739 }
1740
1741 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1742  * the IP/node combination that will cost the least.
1743  */
1744 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1745                                      struct ctdb_ipflags *ipflags,
1746                                      struct ctdb_public_ip_list *all_ips,
1747                                      uint32_t *lcp2_imbalances)
1748 {
1749         struct ctdb_public_ip_list *tmp_ip;
1750         int dstnode, numnodes;
1751
1752         int minnode;
1753         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1754         struct ctdb_public_ip_list *minip;
1755
1756         bool should_loop = true;
1757         bool have_unassigned = true;
1758
1759         numnodes = talloc_array_length(ipflags);
1760
1761         while (have_unassigned && should_loop) {
1762                 should_loop = false;
1763
1764                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1765                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1766
1767                 minnode = -1;
1768                 mindsum = 0;
1769                 minip = NULL;
1770
1771                 /* loop over each unassigned ip. */
1772                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1773                         if (tmp_ip->pnn != -1) {
1774                                 continue;
1775                         }
1776
1777                         for (dstnode=0; dstnode<numnodes; dstnode++) {
1778                                 /* only check nodes that can actually takeover this ip */
1779                                 if (!can_node_takeover_ip(ctdb, dstnode,
1780                                                           ipflags[dstnode],
1781                                                           tmp_ip)) {
1782                                         /* no it couldnt   so skip to the next node */
1783                                         continue;
1784                                 }
1785
1786                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1787                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1788                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1789                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1790                                                    dstnode,
1791                                                    dstimbl - lcp2_imbalances[dstnode]));
1792
1793
1794                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1795                                         minnode = dstnode;
1796                                         minimbl = dstimbl;
1797                                         mindsum = dstdsum;
1798                                         minip = tmp_ip;
1799                                         should_loop = true;
1800                                 }
1801                         }
1802                 }
1803
1804                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1805
1806                 /* If we found one then assign it to the given node. */
1807                 if (minnode != -1) {
1808                         minip->pnn = minnode;
1809                         lcp2_imbalances[minnode] = minimbl;
1810                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1811                                           ctdb_addr_to_str(&(minip->addr)),
1812                                           minnode,
1813                                           mindsum));
1814                 }
1815
1816                 /* There might be a better way but at least this is clear. */
1817                 have_unassigned = false;
1818                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1819                         if (tmp_ip->pnn == -1) {
1820                                 have_unassigned = true;
1821                         }
1822                 }
1823         }
1824
1825         /* We know if we have an unassigned addresses so we might as
1826          * well optimise.
1827          */
1828         if (have_unassigned) {
1829                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1830                         if (tmp_ip->pnn == -1) {
1831                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1832                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1833                         }
1834                 }
1835         }
1836 }
1837
1838 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1839  * to move IPs from, determines the best IP/destination node
1840  * combination to move from the source node.
1841  */
1842 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1843                                     struct ctdb_ipflags *ipflags,
1844                                     struct ctdb_public_ip_list *all_ips,
1845                                     int srcnode,
1846                                     uint32_t *lcp2_imbalances,
1847                                     bool *rebalance_candidates)
1848 {
1849         int dstnode, mindstnode, numnodes;
1850         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1851         uint32_t minsrcimbl, mindstimbl;
1852         struct ctdb_public_ip_list *minip;
1853         struct ctdb_public_ip_list *tmp_ip;
1854
1855         /* Find an IP and destination node that best reduces imbalance. */
1856         srcimbl = 0;
1857         minip = NULL;
1858         minsrcimbl = 0;
1859         mindstnode = -1;
1860         mindstimbl = 0;
1861
1862         numnodes = talloc_array_length(ipflags);
1863
1864         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1865         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1866                            srcnode, lcp2_imbalances[srcnode]));
1867
1868         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1869                 /* Only consider addresses on srcnode. */
1870                 if (tmp_ip->pnn != srcnode) {
1871                         continue;
1872                 }
1873
1874                 /* What is this IP address costing the source node? */
1875                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1876                 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1877
1878                 /* Consider this IP address would cost each potential
1879                  * destination node.  Destination nodes are limited to
1880                  * those that are newly healthy, since we don't want
1881                  * to do gratuitous failover of IPs just to make minor
1882                  * balance improvements.
1883                  */
1884                 for (dstnode=0; dstnode<numnodes; dstnode++) {
1885                         if (!rebalance_candidates[dstnode]) {
1886                                 continue;
1887                         }
1888
1889                         /* only check nodes that can actually takeover this ip */
1890                         if (!can_node_takeover_ip(ctdb, dstnode,
1891                                                   ipflags[dstnode], tmp_ip)) {
1892                                 /* no it couldnt   so skip to the next node */
1893                                 continue;
1894                         }
1895
1896                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1897                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1898                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1899                                            srcnode, -srcdsum,
1900                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1901                                            dstnode, dstdsum));
1902
1903                         if ((dstimbl < lcp2_imbalances[srcnode]) &&
1904                             (dstdsum < srcdsum) &&                      \
1905                             ((mindstnode == -1) ||                              \
1906                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1907
1908                                 minip = tmp_ip;
1909                                 minsrcimbl = srcimbl;
1910                                 mindstnode = dstnode;
1911                                 mindstimbl = dstimbl;
1912                         }
1913                 }
1914         }
1915         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1916
1917         if (mindstnode != -1) {
1918                 /* We found a move that makes things better... */
1919                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1920                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1921                                   ctdb_addr_to_str(&(minip->addr)),
1922                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1923
1924
1925                 lcp2_imbalances[srcnode] = minsrcimbl;
1926                 lcp2_imbalances[mindstnode] = mindstimbl;
1927                 minip->pnn = mindstnode;
1928
1929                 return true;
1930         }
1931
1932         return false;
1933         
1934 }
1935
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn.  Orders entries
 * by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	return (rhs->imbalance > lhs->imbalance) -
	       (rhs->imbalance < lhs->imbalance);
}
1954
1955 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1956  * node with the highest LCP2 imbalance, and then determines the best
1957  * IP/destination node combination to move from the source node.
1958  */
1959 static void lcp2_failback(struct ctdb_context *ctdb,
1960                           struct ctdb_ipflags *ipflags,
1961                           struct ctdb_public_ip_list *all_ips,
1962                           uint32_t *lcp2_imbalances,
1963                           bool *rebalance_candidates)
1964 {
1965         int i, numnodes;
1966         struct lcp2_imbalance_pnn * lips;
1967         bool again;
1968
1969         numnodes = talloc_array_length(ipflags);
1970
1971 try_again:
1972         /* Put the imbalances and nodes into an array, sort them and
1973          * iterate through candidates.  Usually the 1st one will be
1974          * used, so this doesn't cost much...
1975          */
1976         DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1977         DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1978         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1979         for (i=0; i<numnodes; i++) {
1980                 lips[i].imbalance = lcp2_imbalances[i];
1981                 lips[i].pnn = i;
1982                 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1983         }
1984         qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1985               lcp2_cmp_imbalance_pnn);
1986
1987         again = false;
1988         for (i=0; i<numnodes; i++) {
1989                 /* This means that all nodes had 0 or 1 addresses, so
1990                  * can't be imbalanced.
1991                  */
1992                 if (lips[i].imbalance == 0) {
1993                         break;
1994                 }
1995
1996                 if (lcp2_failback_candidate(ctdb,
1997                                             ipflags,
1998                                             all_ips,
1999                                             lips[i].pnn,
2000                                             lcp2_imbalances,
2001                                             rebalance_candidates)) {
2002                         again = true;
2003                         break;
2004                 }
2005         }
2006
2007         talloc_free(lips);
2008         if (again) {
2009                 goto try_again;
2010         }
2011 }
2012
2013 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2014                                     struct ctdb_ipflags *ipflags,
2015                                     struct ctdb_public_ip_list *all_ips)
2016 {
2017         struct ctdb_public_ip_list *tmp_ip;
2018
2019         /* verify that the assigned nodes can serve that public ip
2020            and set it to -1 if not
2021         */
2022         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2023                 if (tmp_ip->pnn == -1) {
2024                         continue;
2025                 }
2026                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2027                                       ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2028                         /* this node can not serve this ip. */
2029                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2030                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2031                                            tmp_ip->pnn));
2032                         tmp_ip->pnn = -1;
2033                 }
2034         }
2035 }
2036
2037 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2038                                        struct ctdb_ipflags *ipflags,
2039                                        struct ctdb_public_ip_list *all_ips)
2040 {
2041         struct ctdb_public_ip_list *tmp_ip;
2042         int i, numnodes;
2043
2044         numnodes = talloc_array_length(ipflags);
2045
2046         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2047        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2048         *  always be allocated the same way for a specific set of
2049         *  available/unavailable nodes.
2050         */
2051
2052         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2053                 tmp_ip->pnn = i % numnodes;
2054         }
2055
2056         /* IP failback doesn't make sense with deterministic
2057          * IPs, since the modulo step above implicitly fails
2058          * back IPs to their "home" node.
2059          */
2060         if (1 == ctdb->tunable.no_ip_failback) {
2061                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2062         }
2063
2064         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2065
2066         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2067
2068         /* No failback here! */
2069 }
2070
2071 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2072                                           struct ctdb_ipflags *ipflags,
2073                                           struct ctdb_public_ip_list *all_ips)
2074 {
2075         /* This should be pushed down into basic_failback. */
2076         struct ctdb_public_ip_list *tmp_ip;
2077         int num_ips = 0;
2078         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2079                 num_ips++;
2080         }
2081
2082         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2083
2084         basic_allocate_unassigned(ctdb, ipflags, all_ips);
2085
2086         /* If we don't want IPs to fail back then don't rebalance IPs. */
2087         if (1 == ctdb->tunable.no_ip_failback) {
2088                 return;
2089         }
2090
2091         /* Now, try to make sure the ip adresses are evenly distributed
2092            across the nodes.
2093         */
2094         basic_failback(ctdb, ipflags, all_ips, num_ips);
2095 }
2096
2097 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2098                           struct ctdb_ipflags *ipflags,
2099                           struct ctdb_public_ip_list *all_ips,
2100                           uint32_t *force_rebalance_nodes)
2101 {
2102         uint32_t *lcp2_imbalances;
2103         bool *rebalance_candidates;
2104         int numnodes, num_rebalance_candidates, i;
2105
2106         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2107
2108         unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2109
2110         lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2111                   &lcp2_imbalances, &rebalance_candidates);
2112
2113         lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2114
2115         /* If we don't want IPs to fail back then don't rebalance IPs. */
2116         if (1 == ctdb->tunable.no_ip_failback) {
2117                 goto finished;
2118         }
2119
2120         /* It is only worth continuing if we have suitable target
2121          * nodes to transfer IPs to.  This check is much cheaper than
2122          * continuing on...
2123          */
2124         numnodes = talloc_array_length(ipflags);
2125         num_rebalance_candidates = 0;
2126         for (i=0; i<numnodes; i++) {
2127                 if (rebalance_candidates[i]) {
2128                         num_rebalance_candidates++;
2129                 }
2130         }
2131         if (num_rebalance_candidates == 0) {
2132                 goto finished;
2133         }
2134
2135         /* Now, try to make sure the ip adresses are evenly distributed
2136            across the nodes.
2137         */
2138         lcp2_failback(ctdb, ipflags, all_ips,
2139                       lcp2_imbalances, rebalance_candidates);
2140
2141 finished:
2142         talloc_free(tmp_ctx);
2143 }
2144
2145 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2146 {
2147         int i;
2148
2149         for (i=0;i<nodemap->num;i++) {
2150                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2151                         /* Found one completely healthy node */
2152                         return false;
2153                 }
2154         }
2155
2156         return true;
2157 }
2158
2159 /* The calculation part of the IP allocation algorithm. */
2160 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2161                                    struct ctdb_ipflags *ipflags,
2162                                    struct ctdb_public_ip_list **all_ips_p,
2163                                    uint32_t *force_rebalance_nodes)
2164 {
2165         /* since nodes only know about those public addresses that
2166            can be served by that particular node, no single node has
2167            a full list of all public addresses that exist in the cluster.
2168            Walk over all node structures and create a merged list of
2169            all public addresses that exist in the cluster.
2170
2171            keep the tree of ips around as ctdb->ip_tree
2172         */
2173         *all_ips_p = create_merged_ip_list(ctdb);
2174
2175         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2176                 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2177         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2178                 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2179         } else {
2180                 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2181         }
2182
2183         /* at this point ->pnn is the node which will own each IP
2184            or -1 if there is no node that can cover this ip
2185         */
2186
2187         return;
2188 }
2189
2190 struct get_tunable_callback_data {
2191         const char *tunable;
2192         uint32_t *out;
2193         bool fatal;
2194 };
2195
2196 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2197                                  int32_t res, TDB_DATA outdata,
2198                                  void *callback)
2199 {
2200         struct get_tunable_callback_data *cd =
2201                 (struct get_tunable_callback_data *)callback;
2202         int size;
2203
2204         if (res != 0) {
2205                 /* Already handled in fail callback */
2206                 return;
2207         }
2208
2209         if (outdata.dsize != sizeof(uint32_t)) {
2210                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2211                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2212                                  (int)outdata.dsize));
2213                 cd->fatal = true;
2214                 return;
2215         }
2216
2217         size = talloc_array_length(cd->out);
2218         if (pnn >= size) {
2219                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2220                                  cd->tunable, pnn, size));
2221                 return;
2222         }
2223
2224                 
2225         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2226 }
2227
2228 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2229                                        int32_t res, TDB_DATA outdata,
2230                                        void *callback)
2231 {
2232         struct get_tunable_callback_data *cd =
2233                 (struct get_tunable_callback_data *)callback;
2234
2235         switch (res) {
2236         case -ETIME:
2237                 DEBUG(DEBUG_ERR,
2238                       ("Timed out getting tunable \"%s\" from node %d\n",
2239                        cd->tunable, pnn));
2240                 cd->fatal = true;
2241                 break;
2242         case -EINVAL:
2243         case -1:
2244                 DEBUG(DEBUG_WARNING,
2245                       ("Tunable \"%s\" not implemented on node %d\n",
2246                        cd->tunable, pnn));
2247                 break;
2248         default:
2249                 DEBUG(DEBUG_ERR,
2250                       ("Unexpected error getting tunable \"%s\" from node %d\n",
2251                        cd->tunable, pnn));
2252                 cd->fatal = true;
2253         }
2254 }
2255
2256 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2257                                         TALLOC_CTX *tmp_ctx,
2258                                         struct ctdb_node_map *nodemap,
2259                                         const char *tunable,
2260                                         uint32_t default_value)
2261 {
2262         TDB_DATA data;
2263         struct ctdb_control_get_tunable *t;
2264         uint32_t *nodes;
2265         uint32_t *tvals;
2266         struct get_tunable_callback_data callback_data;
2267         int i;
2268
2269         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2270         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2271         for (i=0; i<nodemap->num; i++) {
2272                 tvals[i] = default_value;
2273         }
2274                 
2275         callback_data.out = tvals;
2276         callback_data.tunable = tunable;
2277         callback_data.fatal = false;
2278
2279         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2280         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2281         t = (struct ctdb_control_get_tunable *)data.dptr;
2282         t->length = strlen(tunable)+1;
2283         memcpy(t->name, tunable, t->length);
2284         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2285         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2286                                       nodes, 0, TAKEOVER_TIMEOUT(),
2287                                       false, data,
2288                                       get_tunable_callback,
2289                                       get_tunable_fail_callback,
2290                                       &callback_data) != 0) {
2291                 if (callback_data.fatal) {
2292                         talloc_free(tvals);
2293                         tvals = NULL;
2294                 }
2295         }
2296         talloc_free(nodes);
2297         talloc_free(data.dptr);
2298
2299         return tvals;
2300 }
2301
2302 struct get_runstate_callback_data {
2303         enum ctdb_runstate *out;
2304         bool fatal;
2305 };
2306
2307 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2308                                   int32_t res, TDB_DATA outdata,
2309                                   void *callback_data)
2310 {
2311         struct get_runstate_callback_data *cd =
2312                 (struct get_runstate_callback_data *)callback_data;
2313         int size;
2314
2315         if (res != 0) {
2316                 /* Already handled in fail callback */
2317                 return;
2318         }
2319
2320         if (outdata.dsize != sizeof(uint32_t)) {
2321                 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2322                                  pnn, (int)sizeof(uint32_t),
2323                                  (int)outdata.dsize));
2324                 cd->fatal = true;
2325                 return;
2326         }
2327
2328         size = talloc_array_length(cd->out);
2329         if (pnn >= size) {
2330                 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2331                                  pnn, size));
2332                 return;
2333         }
2334
2335         cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2336 }
2337
2338 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2339                                        int32_t res, TDB_DATA outdata,
2340                                        void *callback)
2341 {
2342         struct get_runstate_callback_data *cd =
2343                 (struct get_runstate_callback_data *)callback;
2344
2345         switch (res) {
2346         case -ETIME:
2347                 DEBUG(DEBUG_ERR,
2348                       ("Timed out getting runstate from node %d\n", pnn));
2349                 cd->fatal = true;
2350                 break;
2351         default:
2352                 DEBUG(DEBUG_WARNING,
2353                       ("Error getting runstate from node %d - assuming runstates not supported\n",
2354                        pnn));
2355         }
2356 }
2357
2358 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2359                                                     TALLOC_CTX *tmp_ctx,
2360                                                     struct ctdb_node_map *nodemap,
2361                                                     enum ctdb_runstate default_value)
2362 {
2363         uint32_t *nodes;
2364         enum ctdb_runstate *rs;
2365         struct get_runstate_callback_data callback_data;
2366         int i;
2367
2368         rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2369         CTDB_NO_MEMORY_NULL(ctdb, rs);
2370         for (i=0; i<nodemap->num; i++) {
2371                 rs[i] = default_value;
2372         }
2373
2374         callback_data.out = rs;
2375         callback_data.fatal = false;
2376
2377         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2378         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2379                                       nodes, 0, TAKEOVER_TIMEOUT(),
2380                                       true, tdb_null,
2381                                       get_runstate_callback,
2382                                       get_runstate_fail_callback,
2383                                       &callback_data) != 0) {
2384                 if (callback_data.fatal) {
2385                         free(rs);
2386                         rs = NULL;
2387                 }
2388         }
2389         talloc_free(nodes);
2390
2391         return rs;
2392 }
2393
2394 /* Set internal flags for IP allocation:
2395  *   Clear ip flags
2396  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2397  *   Set NOIPHOST ip flag for each INACTIVE node
2398  *   if all nodes are disabled:
2399  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2400  *   else
2401  *     Set NOIPHOST ip flags for disabled nodes
2402  */
2403 static struct ctdb_ipflags *
2404 set_ipflags_internal(struct ctdb_context *ctdb,
2405                      TALLOC_CTX *tmp_ctx,
2406                      struct ctdb_node_map *nodemap,
2407                      uint32_t *tval_noiptakeover,
2408                      uint32_t *tval_noiphostonalldisabled,
2409                      enum ctdb_runstate *runstate)
2410 {
2411         int i;
2412         struct ctdb_ipflags *ipflags;
2413
2414         /* Clear IP flags - implicit due to talloc_zero */
2415         ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2416         CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2417
2418         for (i=0;i<nodemap->num;i++) {
2419                 /* Can not take IPs on node with NoIPTakeover set */
2420                 if (tval_noiptakeover[i] != 0) {
2421                         ipflags[i].noiptakeover = true;
2422                 }
2423
2424                 /* Can not host IPs on node not in RUNNING state */
2425                 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2426                         ipflags[i].noiphost = true;
2427                         continue;
2428                 }
2429                 /* Can not host IPs on INACTIVE node */
2430                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2431                         ipflags[i].noiphost = true;
2432                 }
2433                 /* Remember the runstate */
2434                 ipflags[i].runstate = runstate[i];
2435         }
2436
2437         if (all_nodes_are_disabled(nodemap)) {
2438                 /* If all nodes are disabled, can not host IPs on node
2439                  * with NoIPHostOnAllDisabled set
2440                  */
2441                 for (i=0;i<nodemap->num;i++) {
2442                         if (tval_noiphostonalldisabled[i] != 0) {
2443                                 ipflags[i].noiphost = true;
2444                         }
2445                 }
2446         } else {
2447                 /* If some nodes are not disabled, then can not host
2448                  * IPs on DISABLED node
2449                  */
2450                 for (i=0;i<nodemap->num;i++) {
2451                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2452                                 ipflags[i].noiphost = true;
2453                         }
2454                 }
2455         }
2456
2457         return ipflags;
2458 }
2459
2460 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2461                                         TALLOC_CTX *tmp_ctx,
2462                                         struct ctdb_node_map *nodemap)
2463 {
2464         uint32_t *tval_noiptakeover;
2465         uint32_t *tval_noiphostonalldisabled;
2466         struct ctdb_ipflags *ipflags;
2467         enum ctdb_runstate *runstate;
2468
2469
2470         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2471                                                    "NoIPTakeover", 0);
2472         if (tval_noiptakeover == NULL) {
2473                 return NULL;
2474         }
2475
2476         tval_noiphostonalldisabled =
2477                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2478                                        "NoIPHostOnAllDisabled", 0);
2479         if (tval_noiphostonalldisabled == NULL) {
2480                 /* Caller frees tmp_ctx */
2481                 return NULL;
2482         }
2483
2484         /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2485          * will default to CTDB_RUNSTATE_RUNNING.  This ensures
2486          * reasonable behaviour on a mixed cluster during upgrade.
2487          */
2488         runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2489                                            CTDB_RUNSTATE_RUNNING);
2490         if (runstate == NULL) {
2491                 /* Caller frees tmp_ctx */
2492                 return NULL;
2493         }
2494
2495         ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2496                                        tval_noiptakeover,
2497                                        tval_noiphostonalldisabled,
2498                                        runstate);
2499
2500         talloc_free(tval_noiptakeover);
2501         talloc_free(tval_noiphostonalldisabled);
2502         talloc_free(runstate);
2503
2504         return ipflags;
2505 }
2506
2507 struct iprealloc_callback_data {
2508         bool *retry_nodes;
2509         int retry_count;
2510         client_async_callback fail_callback;
2511         void *fail_callback_data;
2512         struct ctdb_node_map *nodemap;
2513 };
2514
2515 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2516                                         int32_t res, TDB_DATA outdata,
2517                                         void *callback)
2518 {
2519         int numnodes;
2520         struct iprealloc_callback_data *cd =
2521                 (struct iprealloc_callback_data *)callback;
2522
2523         numnodes = talloc_array_length(cd->retry_nodes);
2524         if (pnn > numnodes) {
2525                 DEBUG(DEBUG_ERR,
2526                       ("ipreallocated failure from node %d, "
2527                        "but only %d nodes in nodemap\n",
2528                        pnn, numnodes));
2529                 return;
2530         }
2531
2532         /* Can't run the "ipreallocated" event on a INACTIVE node */
2533         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2534                 DEBUG(DEBUG_WARNING,
2535                       ("ipreallocated failed on inactive node %d, ignoring\n",
2536                        pnn));
2537                 return;
2538         }
2539
2540         switch (res) {
2541         case -ETIME:
2542                 /* If the control timed out then that's a real error,
2543                  * so call the real fail callback
2544                  */
2545                 if (cd->fail_callback) {
2546                         cd->fail_callback(ctdb, pnn, res, outdata,
2547                                           cd->fail_callback_data);
2548                 } else {
2549                         DEBUG(DEBUG_WARNING,
2550                               ("iprealloc timed out but no callback registered\n"));
2551                 }
2552                 break;
2553         default:
2554                 /* If not a timeout then either the ipreallocated
2555                  * eventscript (or some setup) failed.  This might
2556                  * have failed because the IPREALLOCATED control isn't
2557                  * implemented - right now there is no way of knowing
2558                  * because the error codes are all folded down to -1.
2559                  * Consider retrying using EVENTSCRIPT control...
2560                  */
2561                 DEBUG(DEBUG_WARNING,
2562                       ("ipreallocated failure from node %d, flagging retry\n",
2563                        pnn));
2564                 cd->retry_nodes[pnn] = true;
2565                 cd->retry_count++;
2566         }
2567 }
2568
2569 struct takeover_callback_data {
2570         bool *node_failed;
2571         client_async_callback fail_callback;
2572         void *fail_callback_data;
2573         struct ctdb_node_map *nodemap;
2574 };
2575
2576 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2577                                        uint32_t node_pnn, int32_t res,
2578                                        TDB_DATA outdata, void *callback_data)
2579 {
2580         struct takeover_callback_data *cd =
2581                 talloc_get_type_abort(callback_data,
2582                                       struct takeover_callback_data);
2583         int i;
2584
2585         for (i = 0; i < cd->nodemap->num; i++) {
2586                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2587                         break;
2588                 }
2589         }
2590
2591         if (i == cd->nodemap->num) {
2592                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2593                 return;
2594         }
2595
2596         if (!cd->node_failed[i]) {
2597                 cd->node_failed[i] = true;
2598                 cd->fail_callback(ctdb, node_pnn, res, outdata,
2599                                   cd->fail_callback_data);
2600         }
2601 }
2602
2603 /*
2604   make any IP alias changes for public addresses that are necessary 
2605  */
2606 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2607                       uint32_t *force_rebalance_nodes,
2608                       client_async_callback fail_callback, void *callback_data)
2609 {
2610         int i, j, ret;
2611         struct ctdb_public_ip ip;
2612         struct ctdb_public_ipv4 ipv4;
2613         uint32_t *nodes;
2614         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2615         TDB_DATA data;
2616         struct timeval timeout;
2617         struct client_async_data *async_data;
2618         struct ctdb_client_control_state *state;
2619         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2620         struct ctdb_ipflags *ipflags;
2621         struct takeover_callback_data *takeover_data;
2622         struct iprealloc_callback_data iprealloc_data;
2623         bool *retry_data;
2624         bool can_host_ips;
2625
2626         /*
2627          * ip failover is completely disabled, just send out the 
2628          * ipreallocated event.
2629          */
2630         if (ctdb->tunable.disable_ip_failover != 0) {
2631                 goto ipreallocated;
2632         }
2633
2634         ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2635         if (ipflags == NULL) {
2636                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2637                 talloc_free(tmp_ctx);
2638                 return -1;
2639         }
2640
2641         /* Short-circuit IP allocation if no nodes are in the RUNNING
2642          * runstate yet, since no nodes will be able to host IPs */
2643         can_host_ips = false;
2644         for (i=0; i<nodemap->num; i++) {
2645                 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2646                         can_host_ips = true;
2647                 }
2648         }
2649         if (!can_host_ips) {
2650                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2651                 return 0;
2652         }
2653
2654         /* Do the IP reassignment calculations */
2655         ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2656
2657         /* Now tell all nodes to release any public IPs should not
2658          * host.  This will be a NOOP on nodes that don't currently
2659          * hold the given IP.
2660          */
2661         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2662         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2663
2664         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2665                                                        bool, nodemap->num);
2666         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2667         takeover_data->fail_callback = fail_callback;
2668         takeover_data->fail_callback_data = callback_data;
2669         takeover_data->nodemap = nodemap;
2670
2671         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2672         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2673
2674         async_data->fail_callback = takeover_run_fail_callback;
2675         async_data->callback_data = takeover_data;
2676
2677         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2678
2679         /* Send a RELEASE_IP to all nodes that should not be hosting
2680          * each IP.  For each IP, all but one of these will be
2681          * redundant.  However, the redundant ones are used to tell
2682          * nodes which node should be hosting the IP so that commands
2683          * like "ctdb ip" can display a particular nodes idea of who
2684          * is hosting what. */
2685         for (i=0;i<nodemap->num;i++) {
2686                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2687                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2688                         continue;
2689                 }
2690
2691                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2692                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2693                                 /* This node should be serving this
2694                                    vnn so dont tell it to release the ip
2695                                 */
2696                                 continue;
2697                         }
2698                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2699                                 ipv4.pnn = tmp_ip->pnn;
2700                                 ipv4.sin = tmp_ip->addr.ip;
2701
2702                                 timeout = TAKEOVER_TIMEOUT();
2703                                 data.dsize = sizeof(ipv4);
2704                                 data.dptr  = (uint8_t *)&ipv4;
2705                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2706                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2707                                                 data, async_data,
2708                                                 &timeout, NULL);
2709                         } else {
2710                                 ip.pnn  = tmp_ip->pnn;
2711                                 ip.addr = tmp_ip->addr;
2712
2713                                 timeout = TAKEOVER_TIMEOUT();
2714                                 data.dsize = sizeof(ip);
2715                                 data.dptr  = (uint8_t *)&ip;
2716                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2717                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2718                                                 data, async_data,
2719                                                 &timeout, NULL);
2720                         }
2721
2722                         if (state == NULL) {
2723                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2724                                 talloc_free(tmp_ctx);
2725                                 return -1;
2726                         }
2727                 
2728                         ctdb_client_async_add(async_data, state);
2729                 }
2730         }
2731         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2732                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2733                 talloc_free(tmp_ctx);
2734                 return -1;
2735         }
2736         talloc_free(async_data);
2737
2738
2739         /* For each IP, send a TAKOVER_IP to the node that should be
2740          * hosting it.  Many of these will often be redundant (since
2741          * the allocation won't have changed) but they can be useful
2742          * to recover from inconsistencies. */
2743         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2744         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2745
2746         async_data->fail_callback = fail_callback;
2747         async_data->callback_data = callback_data;
2748
2749         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2750                 if (tmp_ip->pnn == -1) {
2751                         /* this IP won't be taken over */
2752                         continue;
2753                 }
2754
2755                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2756                         ipv4.pnn = tmp_ip->pnn;
2757                         ipv4.sin = tmp_ip->addr.ip;
2758
2759                         timeout = TAKEOVER_TIMEOUT();
2760                         data.dsize = sizeof(ipv4);
2761                         data.dptr  = (uint8_t *)&ipv4;
2762                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2763                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2764                                         data, async_data,
2765                                         &timeout, NULL);
2766                 } else {
2767                         ip.pnn  = tmp_ip->pnn;
2768                         ip.addr = tmp_ip->addr;
2769
2770                         timeout = TAKEOVER_TIMEOUT();
2771                         data.dsize = sizeof(ip);
2772                         data.dptr  = (uint8_t *)&ip;
2773                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2774                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2775                                         data, async_data,
2776                                         &timeout, NULL);
2777                 }
2778                 if (state == NULL) {
2779                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2780                         talloc_free(tmp_ctx);
2781                         return -1;
2782                 }
2783                 
2784                 ctdb_client_async_add(async_data, state);
2785         }
2786         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2787                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2788                 talloc_free(tmp_ctx);
2789                 return -1;
2790         }
2791
2792 ipreallocated:
2793         /*
2794          * Tell all nodes to run eventscripts to process the
2795          * "ipreallocated" event.  This can do a lot of things,
2796          * including restarting services to reconfigure them if public
2797          * IPs have moved.  Once upon a time this event only used to
2798          * update natgw.
2799          */
2800         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2801         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2802         iprealloc_data.retry_nodes = retry_data;
2803         iprealloc_data.retry_count = 0;
2804         iprealloc_data.fail_callback = fail_callback;
2805         iprealloc_data.fail_callback_data = callback_data;
2806         iprealloc_data.nodemap = nodemap;
2807
2808         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2809         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2810                                         nodes, 0, TAKEOVER_TIMEOUT(),
2811                                         false, tdb_null,
2812                                         NULL, iprealloc_fail_callback,
2813                                         &iprealloc_data);
2814         if (ret != 0) {
2815                 /* If the control failed then we should retry to any
2816                  * nodes flagged by iprealloc_fail_callback using the
2817                  * EVENTSCRIPT control.  This is a best-effort at
2818                  * backward compatiblity when running a mixed cluster
2819                  * where some nodes have not yet been upgraded to
2820                  * support the IPREALLOCATED control.
2821                  */
2822                 DEBUG(DEBUG_WARNING,
2823                       ("Retry ipreallocated to some nodes using eventscript control\n"));
2824
2825                 nodes = talloc_array(tmp_ctx, uint32_t,
2826                                      iprealloc_data.retry_count);
2827                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2828
2829                 j = 0;
2830                 for (i=0; i<nodemap->num; i++) {
2831                         if (iprealloc_data.retry_nodes[i]) {
2832                                 nodes[j] = i;
2833                                 j++;
2834                         }
2835                 }
2836
2837                 data.dptr  = discard_const("ipreallocated");
2838                 data.dsize = strlen((char *)data.dptr) + 1; 
2839                 ret = ctdb_client_async_control(ctdb,
2840                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2841                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2842                                                 false, data,
2843                                                 NULL, fail_callback,
2844                                                 callback_data);
2845                 if (ret != 0) {
2846                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2847                 }
2848         }
2849
2850         talloc_free(tmp_ctx);
2851         return ret;
2852 }
2853
2854
2855 /*
2856   destroy a ctdb_client_ip structure
2857  */
2858 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2859 {
2860         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2861                 ctdb_addr_to_str(&ip->addr),
2862                 ntohs(ip->addr.ip.sin_port),
2863                 ip->client_id));
2864
2865         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2866         return 0;
2867 }
2868
2869 /*
2870   called by a client to inform us of a TCP connection that it is managing
2871   that should tickled with an ACK when IP takeover is done
2872  */
2873 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2874                                 TDB_DATA indata)
2875 {
2876         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2877         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2878         struct ctdb_tcp_list *tcp;
2879         struct ctdb_tcp_connection t;
2880         int ret;
2881         TDB_DATA data;
2882         struct ctdb_client_ip *ip;
2883         struct ctdb_vnn *vnn;
2884         ctdb_sock_addr addr;
2885
2886         /* If we don't have public IPs, tickles are useless */
2887         if (ctdb->vnn == NULL) {
2888                 return 0;
2889         }
2890
2891         tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2892
2893         addr = tcp_sock->src;
2894         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2895         addr = tcp_sock->dest;
2896         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2897
2898         ZERO_STRUCT(addr);
2899         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2900         vnn = find_public_ip_vnn(ctdb, &addr);
2901         if (vnn == NULL) {
2902                 switch (addr.sa.sa_family) {
2903                 case AF_INET:
2904                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2905                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2906                                         ctdb_addr_to_str(&addr)));
2907                         }
2908                         break;
2909                 case AF_INET6:
2910                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2911                                 ctdb_addr_to_str(&addr)));
2912                         break;
2913                 default:
2914                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2915                 }
2916
2917                 return 0;
2918         }
2919
2920         if (vnn->pnn != ctdb->pnn) {
2921                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2922                         ctdb_addr_to_str(&addr),
2923                         client_id, client->pid));
2924                 /* failing this call will tell smbd to die */
2925                 return -1;
2926         }
2927
2928         ip = talloc(client, struct ctdb_client_ip);
2929         CTDB_NO_MEMORY(ctdb, ip);
2930
2931         ip->ctdb      = ctdb;
2932         ip->addr      = addr;
2933         ip->client_id = client_id;
2934         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2935         DLIST_ADD(ctdb->client_ip_list, ip);
2936
2937         tcp = talloc(client, struct ctdb_tcp_list);
2938         CTDB_NO_MEMORY(ctdb, tcp);
2939
2940         tcp->connection.src_addr = tcp_sock->src;
2941         tcp->connection.dst_addr = tcp_sock->dest;
2942
2943         DLIST_ADD(client->tcp_list, tcp);
2944
2945         t.src_addr = tcp_sock->src;
2946         t.dst_addr = tcp_sock->dest;
2947
2948         data.dptr = (uint8_t *)&t;
2949         data.dsize = sizeof(t);
2950
2951         switch (addr.sa.sa_family) {
2952         case AF_INET:
2953                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2954                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2955                         ctdb_addr_to_str(&tcp_sock->src),
2956                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2957                 break;
2958         case AF_INET6:
2959                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2960                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2961                         ctdb_addr_to_str(&tcp_sock->src),
2962                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2963                 break;
2964         default:
2965                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2966         }
2967
2968
2969         /* tell all nodes about this tcp connection */
2970         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2971                                        CTDB_CONTROL_TCP_ADD,
2972                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2973         if (ret != 0) {
2974                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2975                 return -1;
2976         }
2977
2978         return 0;
2979 }
2980
2981 /*
2982   find a tcp address on a list
2983  */
2984 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2985                                            struct ctdb_tcp_connection *tcp)
2986 {
2987         int i;
2988
2989         if (array == NULL) {
2990                 return NULL;
2991         }
2992
2993         for (i=0;i<array->num;i++) {
2994                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2995                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2996                         return &array->connections[i];
2997                 }
2998         }
2999         return NULL;
3000 }
3001
3002
3003
3004 /*
3005   called by a daemon to inform us of a TCP connection that one of its
3006   clients managing that should tickled with an ACK when IP takeover is
3007   done
3008  */
3009 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3010 {
3011         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3012         struct ctdb_tcp_array *tcparray;
3013         struct ctdb_tcp_connection tcp;
3014         struct ctdb_vnn *vnn;
3015
3016         /* If we don't have public IPs, tickles are useless */
3017         if (ctdb->vnn == NULL) {
3018                 return 0;
3019         }
3020
3021         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3022         if (vnn == NULL) {
3023                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3024                         ctdb_addr_to_str(&p->dst_addr)));
3025
3026                 return -1;
3027         }
3028
3029
3030         tcparray = vnn->tcp_array;
3031
3032         /* If this is the first tickle */
3033         if (tcparray == NULL) {
3034                 tcparray = talloc(vnn, struct ctdb_tcp_array);
3035                 CTDB_NO_MEMORY(ctdb, tcparray);
3036                 vnn->tcp_array = tcparray;
3037
3038                 tcparray->num = 0;
3039                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3040                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3041
3042                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3043                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3044                 tcparray->num++;
3045
3046                 if (tcp_update_needed) {
3047                         vnn->tcp_update_needed = true;
3048                 }
3049                 return 0;
3050         }
3051
3052
3053         /* Do we already have this tickle ?*/
3054         tcp.src_addr = p->src_addr;
3055         tcp.dst_addr = p->dst_addr;
3056         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3057                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3058                         ctdb_addr_to_str(&tcp.dst_addr),
3059                         ntohs(tcp.dst_addr.ip.sin_port),
3060                         vnn->pnn));
3061                 return 0;
3062         }
3063
3064         /* A new tickle, we must add it to the array */
3065         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3066                                         struct ctdb_tcp_connection,
3067                                         tcparray->num+1);
3068         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3069
3070         tcparray->connections[tcparray->num].src_addr = p->src_addr;
3071         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3072         tcparray->num++;
3073
3074         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3075                 ctdb_addr_to_str(&tcp.dst_addr),
3076                 ntohs(tcp.dst_addr.ip.sin_port),
3077                 vnn->pnn));
3078
3079         if (tcp_update_needed) {
3080                 vnn->tcp_update_needed = true;
3081         }
3082
3083         return 0;
3084 }
3085
3086
/*
  remove a TCP connection from the tickle list of the public address
  that is its destination
 */
3092 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3093 {
3094         struct ctdb_tcp_connection *tcpp;
3095         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3096
3097         if (vnn == NULL) {
3098                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3099                         ctdb_addr_to_str(&conn->dst_addr)));
3100                 return;
3101         }
3102
3103         /* if the array is empty we cant remove it
3104            and we dont need to do anything
3105          */
3106         if (vnn->tcp_array == NULL) {
3107                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3108                         ctdb_addr_to_str(&conn->dst_addr),
3109                         ntohs(conn->dst_addr.ip.sin_port)));
3110                 return;
3111         }
3112
3113
3114         /* See if we know this connection
3115            if we dont know this connection  then we dont need to do anything
3116          */
3117         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3118         if (tcpp == NULL) {
3119                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3120                         ctdb_addr_to_str(&conn->dst_addr),
3121                         ntohs(conn->dst_addr.ip.sin_port)));
3122                 return;
3123         }
3124
3125
3126         /* We need to remove this entry from the array.
3127            Instead of allocating a new array and copying data to it
3128            we cheat and just copy the last entry in the existing array
3129            to the entry that is to be removed and just shring the 
3130            ->num field
3131          */
3132         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3133         vnn->tcp_array->num--;
3134
3135         /* If we deleted the last entry we also need to remove the entire array
3136          */
3137         if (vnn->tcp_array->num == 0) {
3138                 talloc_free(vnn->tcp_array);
3139                 vnn->tcp_array = NULL;
3140         }               
3141
3142         vnn->tcp_update_needed = true;
3143
3144         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3145                 ctdb_addr_to_str(&conn->src_addr),
3146                 ntohs(conn->src_addr.ip.sin_port)));
3147 }
3148
3149
3150 /*
3151   called by a daemon to inform us of a TCP connection that one of its
3152   clients used are no longer needed in the tickle database
3153  */
3154 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3155 {
3156         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3157
3158         /* If we don't have public IPs, tickles are useless */
3159         if (ctdb->vnn == NULL) {
3160                 return 0;
3161         }
3162
3163         ctdb_remove_tcp_connection(ctdb, conn);
3164
3165         return 0;
3166 }
3167
3168
3169 /*
3170   Called when another daemon starts - causes all tickles for all
3171   public addresses we are serving to be sent to the new node on the
3172   next check.  This actually causes the next scheduled call to
3173   tdb_update_tcp_tickles() to update all nodes.  This is simple and
3174   doesn't require careful error handling.
3175  */
3176 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3177 {
3178         struct ctdb_vnn *vnn;
3179
3180         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3181                            (unsigned long) pnn));
3182
3183         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3184                 vnn->tcp_update_needed = true;
3185         }
3186
3187         return 0;
3188 }
3189
3190
3191 /*
3192   called when a client structure goes away - hook to remove
3193   elements from the tcp_list in all daemons
3194  */
3195 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3196 {
3197         while (client->tcp_list) {
3198                 struct ctdb_tcp_list *tcp = client->tcp_list;
3199                 DLIST_REMOVE(client->tcp_list, tcp);
3200                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3201         }
3202 }
3203
3204
3205 /*
3206   release all IPs on shutdown
3207  */
3208 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3209 {
3210         struct ctdb_vnn *vnn;
3211         int count = 0;
3212
3213         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3214                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3215                         ctdb_vnn_unassign_iface(ctdb, vnn);
3216                         continue;
3217                 }
3218                 if (!vnn->iface) {
3219                         continue;
3220                 }
3221
3222                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3223                                     ctdb_addr_to_str(&vnn->public_address),
3224                                     vnn->public_netmask_bits,
3225                                     ctdb_vnn_iface_string(vnn)));
3226
3227                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3228                                   ctdb_vnn_iface_string(vnn),
3229                                   ctdb_addr_to_str(&vnn->public_address),
3230                                   vnn->public_netmask_bits);
3231                 release_kill_clients(ctdb, &vnn->public_address);
3232                 ctdb_vnn_unassign_iface(ctdb, vnn);
3233                 count++;
3234         }
3235
3236         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3237 }
3238
3239
3240 /*
3241   get list of public IPs
3242  */
3243 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
3244                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3245 {
3246         int i, num, len;
3247         struct ctdb_all_public_ips *ips;
3248         struct ctdb_vnn *vnn;
3249         bool only_available = false;
3250
3251         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3252                 only_available = true;
3253         }
3254
3255         /* count how many public ip structures we have */
3256         num = 0;
3257         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3258                 num++;
3259         }
3260
3261         len = offsetof(struct ctdb_all_public_ips, ips) + 
3262                 num*sizeof(struct ctdb_public_ip);
3263         ips = talloc_zero_size(outdata, len);
3264         CTDB_NO_MEMORY(ctdb, ips);
3265
3266         i = 0;
3267         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3268                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3269                         continue;
3270                 }
3271                 ips->ips[i].pnn  = vnn->pnn;
3272                 ips->ips[i].addr = vnn->public_address;
3273                 i++;
3274         }
3275         ips->num = i;
3276         len = offsetof(struct ctdb_all_public_ips, ips) +
3277                 i*sizeof(struct ctdb_public_ip);
3278
3279         outdata->dsize = len;
3280         outdata->dptr  = (uint8_t *)ips;
3281
3282         return 0;
3283 }
3284
3285
3286 /*
3287   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
3288  */
3289 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
3290                                     struct ctdb_req_control *c, TDB_DATA *outdata)
3291 {
3292         int i, num, len;
3293         struct ctdb_all_public_ipsv4 *ips;
3294         struct ctdb_vnn *vnn;
3295
3296         /* count how many public ip structures we have */
3297         num = 0;
3298         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3299                 if (vnn->public_address.sa.sa_family != AF_INET) {
3300                         continue;
3301                 }
3302                 num++;
3303         }
3304
3305         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3306                 num*sizeof(struct ctdb_public_ipv4);
3307         ips = talloc_zero_size(outdata, len);
3308         CTDB_NO_MEMORY(ctdb, ips);
3309
3310         outdata->dsize = len;
3311         outdata->dptr  = (uint8_t *)ips;
3312
3313         ips->num = num;
3314         i = 0;
3315         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3316                 if (vnn->public_address.sa.sa_family != AF_INET) {
3317                         continue;
3318                 }
3319                 ips->ips[i].pnn = vnn->pnn;
3320                 ips->ips[i].sin = vnn->public_address.ip;
3321                 i++;
3322         }
3323
3324         return 0;
3325 }
3326
3327 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3328                                         struct ctdb_req_control *c,
3329                                         TDB_DATA indata,
3330                                         TDB_DATA *outdata)
3331 {
3332         int i, num, len;
3333         ctdb_sock_addr *addr;
3334         struct ctdb_control_public_ip_info *info;
3335         struct ctdb_vnn *vnn;
3336
3337         addr = (ctdb_sock_addr *)indata.dptr;
3338
3339         vnn = find_public_ip_vnn(ctdb, addr);
3340         if (vnn == NULL) {
3341                 /* if it is not a public ip   it could be our 'single ip' */
3342                 if (ctdb->single_ip_vnn) {
3343                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3344                                 vnn = ctdb->single_ip_vnn;
3345                         }
3346                 }
3347         }
3348         if (vnn == NULL) {
3349                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3350                                  "'%s'not a public address\n",
3351                                  ctdb_addr_to_str(addr)));
3352                 return -1;
3353         }
3354
3355         /* count how many public ip structures we have */
3356         num = 0;
3357         for (;vnn->ifaces[num];) {
3358                 num++;
3359         }
3360
3361         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3362                 num*sizeof(struct ctdb_control_iface_info);
3363         info = talloc_zero_size(outdata, len);
3364         CTDB_NO_MEMORY(ctdb, info);
3365
3366         info->ip.addr = vnn->public_address;
3367         info->ip.pnn = vnn->pnn;
3368         info->active_idx = 0xFFFFFFFF;
3369
3370         for (i=0; vnn->ifaces[i]; i++) {
3371                 struct ctdb_iface *cur;
3372
3373                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3374                 if (cur == NULL) {
3375                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3376                                            vnn->ifaces[i]));
3377                         return -1;
3378                 }
3379                 if (vnn->iface == cur) {
3380                         info->active_idx = i;
3381                 }
3382                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3383                 info->ifaces[i].link_state = cur->link_up;
3384                 info->ifaces[i].references = cur->references;
3385         }
3386         info->num = i;
3387         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3388                 i*sizeof(struct ctdb_control_iface_info);
3389
3390         outdata->dsize = len;
3391         outdata->dptr  = (uint8_t *)info;
3392
3393         return 0;
3394 }
3395
3396 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3397                                 struct ctdb_req_control *c,
3398                                 TDB_DATA *outdata)
3399 {
3400         int i, num, len;
3401         struct ctdb_control_get_ifaces *ifaces;
3402         struct ctdb_iface *cur;
3403
3404         /* count how many public ip structures we have */
3405         num = 0;
3406         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3407                 num++;
3408         }
3409
3410         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3411                 num*sizeof(struct ctdb_control_iface_info);
3412         ifaces = talloc_zero_size(outdata, len);
3413         CTDB_NO_MEMORY(ctdb, ifaces);
3414
3415         i = 0;
3416         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3417                 strcpy(ifaces->ifaces[i].name, cur->name);
3418                 ifaces->ifaces[i].link_state = cur->link_up;
3419                 ifaces->ifaces[i].references = cur->references;
3420                 i++;
3421         }
3422         ifaces->num = i;
3423         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3424                 i*sizeof(struct ctdb_control_iface_info);
3425
3426         outdata->dsize = len;
3427         outdata->dptr  = (uint8_t *)ifaces;
3428
3429         return 0;
3430 }
3431
3432 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3433                                     struct ctdb_req_control *c,
3434                                     TDB_DATA indata)
3435 {
3436         struct ctdb_control_iface_info *info;
3437         struct ctdb_iface *iface;
3438         bool link_up = false;
3439
3440         info = (struct ctdb_control_iface_info *)indata.dptr;
3441
3442         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3443                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3444                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3445                                   len, len, info->name));
3446                 return -1;
3447         }
3448
3449         switch (info->link_state) {
3450         case 0:
3451                 link_up = false;
3452                 break;
3453         case 1:
3454                 link_up = true;
3455                 break;
3456         default:
3457                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3458                                   (unsigned int)info->link_state));
3459                 return -1;
3460         }
3461
3462         if (info->references != 0) {
3463                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3464                                   (unsigned int)info->references));
3465                 return -1;
3466         }
3467
3468         iface = ctdb_find_iface(ctdb, info->name);
3469         if (iface == NULL) {
3470                 return -1;
3471         }
3472
3473         if (link_up == iface->link_up) {
3474                 return 0;
3475         }
3476
3477         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3478               ("iface[%s] has changed it's link status %s => %s\n",
3479                iface->name,
3480                iface->link_up?"up":"down",
3481                link_up?"up":"down"));
3482
3483         iface->link_up = link_up;
3484         return 0;
3485 }
3486
3487
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill

   One of these hangs off a vnn (vnn->killtcp) while there are
   connections on that public address waiting to be reset.
*/
struct ctdb_kill_tcp {
        /* the public address this structure belongs to */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(), -1 while not open */
        int capture_fd;
        /* read event on capture_fd, handled by capture_tcp_handler() */
        struct fd_event *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* filled in by ctdb_sys_open_capture_socket() and handed back
           to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
3500
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of tickles sent from the periodic timer; the
           connection is given up on once this reaches 5 */
        int count;
        /* the structure this connection is stored in */
        struct ctdb_kill_tcp *killtcp;
};
3510
3511 /* this function is used to create a key to represent this socketpair
3512    in the killtcp tree.
3513    this key is used to insert and lookup matching socketpairs that are
3514    to be tickled and RST
3515 */
3516 #define KILLTCP_KEYLEN  10
3517 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3518 {
3519         static uint32_t key[KILLTCP_KEYLEN];
3520
3521         bzero(key, sizeof(key));
3522
3523         if (src->sa.sa_family != dst->sa.sa_family) {
3524                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3525                 return key;
3526         }
3527         
3528         switch (src->sa.sa_family) {
3529         case AF_INET:
3530                 key[0]  = dst->ip.sin_addr.s_addr;
3531                 key[1]  = src->ip.sin_addr.s_addr;
3532                 key[2]  = dst->ip.sin_port;
3533                 key[3]  = src->ip.sin_port;
3534                 break;
3535         case AF_INET6: {
3536                 uint32_t *dst6_addr32 =
3537                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3538                 uint32_t *src6_addr32 =
3539                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3540                 key[0]  = dst6_addr32[3];
3541                 key[1]  = src6_addr32[3];
3542                 key[2]  = dst6_addr32[2];
3543                 key[3]  = src6_addr32[2];
3544                 key[4]  = dst6_addr32[1];
3545                 key[5]  = src6_addr32[1];
3546                 key[6]  = dst6_addr32[0];
3547                 key[7]  = src6_addr32[0];
3548                 key[8]  = dst->ip6.sin6_port;
3549                 key[9]  = src->ip6.sin6_port;
3550                 break;
3551         }
3552         default:
3553                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3554                 return key;
3555         }
3556
3557         return key;
3558 }
3559
3560 /*
3561   called when we get a read event on the raw socket
3562  */
3563 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3564                                 uint16_t flags, void *private_data)
3565 {
3566         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3567         struct ctdb_killtcp_con *con;
3568         ctdb_sock_addr src, dst;
3569         uint32_t ack_seq, seq;
3570
3571         if (!(flags & EVENT_FD_READ)) {
3572                 return;
3573         }
3574
3575         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3576                                 killtcp->private_data,
3577                                 &src, &dst,
3578                                 &ack_seq, &seq) != 0) {
3579                 /* probably a non-tcp ACK packet */
3580                 return;
3581         }
3582
3583         /* check if we have this guy in our list of connections
3584            to kill
3585         */
3586         con = trbt_lookuparray32(killtcp->connections, 
3587                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3588         if (con == NULL) {
3589                 /* no this was some other packet we can just ignore */
3590                 return;
3591         }
3592
3593         /* This one has been tickled !
3594            now reset him and remove him from the list.
3595          */
3596         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3597                 ntohs(con->dst_addr.ip.sin_port),
3598                 ctdb_addr_to_str(&con->src_addr),
3599                 ntohs(con->src_addr.ip.sin_port)));
3600
3601         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3602         talloc_free(con);
3603 }
3604
3605
3606 /* when traversing the list of all tcp connections to send tickle acks to
3607    (so that we can capture the ack coming back and kill the connection
3608     by a RST)
3609    this callback is called for each connection we are currently trying to kill
3610 */
3611 static int tickle_connection_traverse(void *param, void *data)
3612 {
3613         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3614
3615         /* have tried too many times, just give up */
3616         if (con->count >= 5) {
3617                 /* can't delete in traverse: reparent to delete_cons */
3618                 talloc_steal(param, con);
3619                 return 0;
3620         }
3621
3622         /* othervise, try tickling it again */
3623         con->count++;
3624         ctdb_sys_send_tcp(
3625                 (ctdb_sock_addr *)&con->dst_addr,
3626                 (ctdb_sock_addr *)&con->src_addr,
3627                 0, 0, 0);
3628         return 0;
3629 }
3630
3631
3632 /* 
3633    called every second until all sentenced connections have been reset
3634  */
3635 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3636                                               struct timeval t, void *private_data)
3637 {
3638         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3639         void *delete_cons = talloc_new(NULL);
3640
3641         /* loop over all connections sending tickle ACKs */
3642         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3643
3644         /* now we've finished traverse, it's safe to do deletion. */
3645         talloc_free(delete_cons);
3646
3647         /* If there are no more connections to kill we can remove the
3648            entire killtcp structure
3649          */
3650         if ( (killtcp->connections == NULL) || 
3651              (killtcp->connections->root == NULL) ) {
3652                 talloc_free(killtcp);
3653                 return;
3654         }
3655
3656         /* try tickling them again in a seconds time
3657          */
3658         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3659                         ctdb_tickle_sentenced_connections, killtcp);
3660 }
3661
3662 /*
3663   destroy the killtcp structure
3664  */
3665 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3666 {
3667         struct ctdb_vnn *tmpvnn;
3668
3669         /* verify that this vnn is still active */
3670         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3671                 if (tmpvnn == killtcp->vnn) {
3672                         break;
3673                 }
3674         }
3675
3676         if (tmpvnn == NULL) {
3677                 return 0;
3678         }
3679
3680         if (killtcp->vnn->killtcp != killtcp) {
3681                 return 0;
3682         }
3683
3684         killtcp->vnn->killtcp = NULL;
3685
3686         return 0;
3687 }
3688
3689
/* insert callback for the killtcp tree: the new connection structure
   (parm) unconditionally takes the place of whatever was stored
   before (data).

   The old structure is deliberately not freed here; that one is
   talloc_stolen by the same node in the tree anyway and will be
   deleted when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;

        return replacement;
}
3701
3702 /*
3703   add a tcp socket to the list of connections we want to RST
3704  */
3705 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3706                                        ctdb_sock_addr *s,
3707                                        ctdb_sock_addr *d)
3708 {
3709         ctdb_sock_addr src, dst;
3710         struct ctdb_kill_tcp *killtcp;
3711         struct ctdb_killtcp_con *con;
3712         struct ctdb_vnn *vnn;
3713
3714         ctdb_canonicalize_ip(s, &src);
3715         ctdb_canonicalize_ip(d, &dst);
3716
3717         vnn = find_public_ip_vnn(ctdb, &dst);
3718         if (vnn == NULL) {
3719                 vnn = find_public_ip_vnn(ctdb, &src);
3720         }
3721         if (vnn == NULL) {
3722                 /* if it is not a public ip   it could be our 'single ip' */
3723                 if (ctdb->single_ip_vnn) {
3724                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3725                                 vnn = ctdb->single_ip_vnn;
3726                         }
3727                 }
3728         }
3729         if (vnn == NULL) {
3730                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3731                 return -1;
3732         }
3733
3734         killtcp = vnn->killtcp;
3735         
3736         /* If this is the first connection to kill we must allocate
3737            a new structure
3738          */
3739         if (killtcp == NULL) {
3740                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3741                 CTDB_NO_MEMORY(ctdb, killtcp);
3742
3743                 killtcp->vnn         = vnn;
3744                 killtcp->ctdb        = ctdb;
3745                 killtcp->capture_fd  = -1;
3746                 killtcp->connections = trbt_create(killtcp, 0);
3747
3748                 vnn->killtcp         = killtcp;
3749                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3750         }
3751
3752
3753
3754         /* create a structure that describes this connection we want to
3755            RST and store it in killtcp->connections
3756         */
3757         con = talloc(killtcp, struct ctdb_killtcp_con);
3758         CTDB_NO_MEMORY(ctdb, con);
3759         con->src_addr = src;
3760         con->dst_addr = dst;
3761         con->count    = 0;
3762         con->killtcp  = killtcp;
3763
3764
3765         trbt_insertarray32_callback(killtcp->connections,
3766                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3767                         add_killtcp_callback, con);
3768
3769         /* 
3770            If we dont have a socket to listen on yet we must create it
3771          */
3772         if (killtcp->capture_fd == -1) {
3773                 const char *iface = ctdb_vnn_iface_string(vnn);
3774                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3775                 if (killtcp->capture_fd == -1) {
3776                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3777                                           "socket on iface '%s' for killtcp (%s)\n",
3778                                           iface, strerror(errno)));
3779                         goto failed;
3780                 }
3781         }
3782
3783
3784         if (killtcp->fde == NULL) {
3785                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3786                                             EVENT_FD_READ,
3787                                             capture_tcp_handler, killtcp);
3788                 tevent_fd_set_auto_close(killtcp->fde);
3789
3790                 /* We also need to set up some events to tickle all these connections
3791                    until they are all reset
3792                 */
3793                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3794                                 ctdb_tickle_sentenced_connections, killtcp);
3795         }
3796
3797         /* tickle him once now */
3798         ctdb_sys_send_tcp(
3799                 &con->dst_addr,
3800                 &con->src_addr,
3801                 0, 0, 0);
3802
3803         return 0;
3804
3805 failed:
3806         talloc_free(vnn->killtcp);
3807         vnn->killtcp = NULL;
3808         return -1;
3809 }
3810
3811 /*
3812   kill a TCP connection.
3813  */
3814 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3815 {
3816         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3817
3818         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3819 }
3820
3821 /*
3822   called by a daemon to inform us of the entire list of TCP tickles for
3823   a particular public address.
3824   this control should only be sent by the node that is currently serving
3825   that public address.
3826  */
3827 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3828 {
3829         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3830         struct ctdb_tcp_array *tcparray;
3831         struct ctdb_vnn *vnn;
3832
3833         /* We must at least have tickles.num or else we cant verify the size
3834            of the received data blob
3835          */
3836         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3837                                         tickles.connections)) {
3838                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3839                 return -1;
3840         }
3841
3842         /* verify that the size of data matches what we expect */
3843         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3844                                 tickles.connections)
3845                          + sizeof(struct ctdb_tcp_connection)
3846                                  * list->tickles.num) {
3847                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3848                 return -1;
3849         }
3850
3851         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3852                            ctdb_addr_to_str(&list->addr)));
3853
3854         vnn = find_public_ip_vnn(ctdb, &list->addr);
3855         if (vnn == NULL) {
3856                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3857                         ctdb_addr_to_str(&list->addr)));
3858
3859                 return 1;
3860         }
3861
3862         /* remove any old ticklelist we might have */
3863         talloc_free(vnn->tcp_array);
3864         vnn->tcp_array = NULL;
3865
3866         tcparray = talloc(vnn, struct ctdb_tcp_array);
3867         CTDB_NO_MEMORY(ctdb, tcparray);
3868
3869         tcparray->num = list->tickles.num;
3870
3871         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3872         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3873
3874         memcpy(tcparray->connections, &list->tickles.connections[0],
3875                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3876
3877         /* We now have a new fresh tickle list array for this vnn */
3878         vnn->tcp_array = tcparray;
3879
3880         return 0;
3881 }
3882
3883 /*
3884   called to return the full list of tickles for the puclic address associated 
3885   with the provided vnn
3886  */
3887 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3888 {
3889         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3890         struct ctdb_control_tcp_tickle_list *list;
3891         struct ctdb_tcp_array *tcparray;
3892         int num;
3893         struct ctdb_vnn *vnn;
3894
3895         vnn = find_public_ip_vnn(ctdb, addr);
3896         if (vnn == NULL) {
3897                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3898                         ctdb_addr_to_str(addr)));
3899
3900                 return 1;
3901         }
3902
3903         tcparray = vnn->tcp_array;
3904         if (tcparray) {
3905                 num = tcparray->num;
3906         } else {
3907                 num = 0;
3908         }
3909
3910         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3911                                 tickles.connections)
3912                         + sizeof(struct ctdb_tcp_connection) * num;
3913
3914         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3915         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3916         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3917
3918         list->addr = *addr;
3919         list->tickles.num = num;
3920         if (num) {
3921                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3922                         sizeof(struct ctdb_tcp_connection) * num);
3923         }
3924
3925         return 0;
3926 }
3927
3928
3929 /*
3930   set the list of all tcp tickles for a public address
3931  */
3932 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3933                                             ctdb_sock_addr *addr,
3934                                             struct ctdb_tcp_array *tcparray)
3935 {
3936         int ret, num;
3937         TDB_DATA data;
3938         struct ctdb_control_tcp_tickle_list *list;
3939
3940         if (tcparray) {
3941                 num = tcparray->num;
3942         } else {
3943                 num = 0;
3944         }
3945
3946         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3947                                 tickles.connections) +
3948                         sizeof(struct ctdb_tcp_connection) * num;
3949         data.dptr = talloc_size(ctdb, data.dsize);
3950         CTDB_NO_MEMORY(ctdb, data.dptr);
3951
3952         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3953         list->addr = *addr;
3954         list->tickles.num = num;
3955         if (tcparray) {
3956                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3957         }
3958
3959         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3960                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3961                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3962         if (ret != 0) {
3963                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3964                 return -1;
3965         }
3966
3967         talloc_free(data.dptr);
3968
3969         return ret;
3970 }
3971
3972
3973 /*
3974   perform tickle updates if required
3975  */
3976 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3977                                 struct timed_event *te, 
3978                                 struct timeval t, void *private_data)
3979 {
3980         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3981         int ret;
3982         struct ctdb_vnn *vnn;
3983
3984         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3985                 /* we only send out updates for public addresses that 
3986                    we have taken over
3987                  */
3988                 if (ctdb->pnn != vnn->pnn) {
3989                         continue;
3990                 }
3991                 /* We only send out the updates if we need to */
3992                 if (!vnn->tcp_update_needed) {
3993                         continue;
3994                 }
3995                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3996                                                        &vnn->public_address,
3997                                                        vnn->tcp_array);
3998                 if (ret != 0) {
3999                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4000                                 ctdb_addr_to_str(&vnn->public_address)));
4001                 } else {
4002                         DEBUG(DEBUG_INFO,
4003                               ("Sent tickle update for public address %s\n",
4004                                ctdb_addr_to_str(&vnn->public_address)));
4005                         vnn->tcp_update_needed = false;
4006                 }
4007         }
4008
4009         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4010                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4011                              ctdb_update_tcp_tickles, ctdb);
4012 }               
4013         
4014
4015 /*
4016   start periodic update of tcp tickles
4017  */
4018 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4019 {
4020         ctdb->tickle_update_context = talloc_new(ctdb);
4021
4022         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4023                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
4024                              ctdb_update_tcp_tickles, ctdb);
4025 }
4026
4027
4028
4029
4030 struct control_gratious_arp {
4031         struct ctdb_context *ctdb;
4032         ctdb_sock_addr addr;
4033         const char *iface;
4034         int count;
4035 };
4036
4037 /*
4038   send a control_gratuitous arp
4039  */
4040 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
4041                                   struct timeval t, void *private_data)
4042 {
4043         int ret;
4044         struct control_gratious_arp *arp = talloc_get_type(private_data, 
4045                                                         struct control_gratious_arp);
4046
4047         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4048         if (ret != 0) {
4049                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4050                                  arp->iface, strerror(errno)));
4051         }
4052
4053
4054         arp->count++;
4055         if (arp->count == CTDB_ARP_REPEAT) {
4056                 talloc_free(arp);
4057                 return;
4058         }
4059
4060         event_add_timed(arp->ctdb->ev, arp, 
4061                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
4062                         send_gratious_arp, arp);
4063 }
4064
4065
4066 /*
4067   send a gratious arp 
4068  */
4069 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4070 {
4071         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4072         struct control_gratious_arp *arp;
4073
4074         /* verify the size of indata */
4075         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4076                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
4077                                  (unsigned)indata.dsize, 
4078                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4079                 return -1;
4080         }
4081         if (indata.dsize != 
4082                 ( offsetof(struct ctdb_control_gratious_arp, iface)
4083                 + gratious_arp->len ) ){
4084
4085                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4086                         "but should be %u bytes\n", 
4087                          (unsigned)indata.dsize, 
4088                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4089                 return -1;
4090         }
4091
4092
4093         arp = talloc(ctdb, struct control_gratious_arp);
4094         CTDB_NO_MEMORY(ctdb, arp);
4095
4096         arp->ctdb  = ctdb;
4097         arp->addr   = gratious_arp->addr;
4098         arp->iface = talloc_strdup(arp, gratious_arp->iface);
4099         CTDB_NO_MEMORY(ctdb, arp->iface);
4100         arp->count = 0;
4101         
4102         event_add_timed(arp->ctdb->ev, arp, 
4103                         timeval_zero(), send_gratious_arp, arp);
4104
4105         return 0;
4106 }
4107
4108 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4109 {
4110         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4111         int ret;
4112
4113         /* verify the size of indata */
4114         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4115                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4116                 return -1;
4117         }
4118         if (indata.dsize != 
4119                 ( offsetof(struct ctdb_control_ip_iface, iface)
4120                 + pub->len ) ){
4121
4122                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4123                         "but should be %u bytes\n", 
4124                          (unsigned)indata.dsize, 
4125                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4126                 return -1;
4127         }
4128
4129         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4130
4131         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4132
4133         if (ret != 0) {
4134                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4135                 return -1;
4136         }
4137
4138         return 0;
4139 }
4140
/* context handed to delete_ip_callback() */
struct delete_ip_callback_state {
        struct ctdb_req_control *c;     /* control request that still needs a reply */
};
4144
4145 /*
4146   called when releaseip event finishes for del_public_address
4147  */
4148 static void delete_ip_callback(struct ctdb_context *ctdb,
4149                                int32_t status, TDB_DATA data,
4150                                const char *errormsg,
4151                                void *private_data)
4152 {
4153         struct delete_ip_callback_state *state =
4154                 talloc_get_type(private_data, struct delete_ip_callback_state);
4155
4156         /* If release failed then fail. */
4157         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4158         talloc_free(private_data);
4159 }
4160
4161 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4162                                         struct ctdb_req_control *c,
4163                                         TDB_DATA indata, bool *async_reply)
4164 {
4165         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4166         struct ctdb_vnn *vnn;
4167
4168         /* verify the size of indata */
4169         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4170                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4171                 return -1;
4172         }
4173         if (indata.dsize != 
4174                 ( offsetof(struct ctdb_control_ip_iface, iface)
4175                 + pub->len ) ){
4176
4177                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4178                         "but should be %u bytes\n", 
4179                          (unsigned)indata.dsize, 
4180                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4181                 return -1;
4182         }
4183
4184         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4185
4186         /* walk over all public addresses until we find a match */
4187         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4188                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4189                         if (vnn->pnn == ctdb->pnn) {
4190                                 struct delete_ip_callback_state *state;
4191                                 struct ctdb_public_ip *ip;
4192                                 TDB_DATA data;
4193                                 int ret;
4194
4195                                 vnn->delete_pending = true;
4196
4197                                 state = talloc(ctdb,
4198                                                struct delete_ip_callback_state);
4199                                 CTDB_NO_MEMORY(ctdb, state);
4200                                 state->c = c;
4201
4202                                 ip = talloc(state, struct ctdb_public_ip);
4203                                 if (ip == NULL) {
4204                                         DEBUG(DEBUG_ERR,
4205                                               (__location__ " Out of memory\n"));
4206                                         talloc_free(state);
4207                                         return -1;
4208                                 }
4209                                 ip->pnn = -1;
4210                                 ip->addr = pub->addr;
4211
4212                                 data.dsize = sizeof(struct ctdb_public_ip);
4213                                 data.dptr = (unsigned char *)ip;
4214
4215                                 ret = ctdb_daemon_send_control(ctdb,
4216                                                                ctdb_get_pnn(ctdb),
4217                                                                0,
4218                                                                CTDB_CONTROL_RELEASE_IP,
4219                                                                0, 0,
4220                                                                data,
4221                                                                delete_ip_callback,
4222                                                                state);
4223                                 if (ret == -1) {
4224                                         DEBUG(DEBUG_ERR,
4225                                               (__location__ "Unable to send "
4226                                                "CTDB_CONTROL_RELEASE_IP\n"));
4227                                         talloc_free(state);
4228                                         return -1;
4229                                 }
4230
4231                                 state->c = talloc_steal(state, c);
4232                                 *async_reply = true;
4233                         } else {
4234                                 /* This IP is not hosted on the
4235                                  * current node so just delete it
4236                                  * now. */
4237                                 do_delete_ip(ctdb, vnn);
4238                         }
4239
4240                         return 0;
4241                 }
4242         }
4243
4244         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4245                          ctdb_addr_to_str(&pub->addr)));
4246         return -1;
4247 }
4248
4249
/* context handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
        struct ctdb_req_control *c;     /* control request that still needs a reply */
};
4253
4254 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4255                                         int status, void *p)
4256 {
4257         struct ipreallocated_callback_state *state =
4258                 talloc_get_type(p, struct ipreallocated_callback_state);
4259
4260         if (status != 0) {
4261                 DEBUG(DEBUG_ERR,
4262                       (" \"ipreallocated\" event script failed (status %d)\n",
4263                        status));
4264                 if (status == -ETIME) {
4265                         ctdb_ban_self(ctdb);
4266                 }
4267         }
4268
4269         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4270         talloc_free(state);
4271 }
4272
4273 /* A control to run the ipreallocated event */
4274 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4275                                    struct ctdb_req_control *c,
4276                                    bool *async_reply)
4277 {
4278         int ret;
4279         struct ipreallocated_callback_state *state;
4280
4281         state = talloc(ctdb, struct ipreallocated_callback_state);
4282         CTDB_NO_MEMORY(ctdb, state);
4283
4284         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4285
4286         ret = ctdb_event_script_callback(ctdb, state,
4287                                          ctdb_ipreallocated_callback, state,
4288                                          CTDB_EVENT_IPREALLOCATED,
4289                                          "%s", "");
4290
4291         if (ret != 0) {
4292                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4293                 talloc_free(state);
4294                 return -1;
4295         }
4296
4297         /* tell the control that we will be reply asynchronously */
4298         state->c    = talloc_steal(state, c);
4299         *async_reply = true;
4300
4301         return 0;
4302 }
4303
4304
4305 /* This function is called from the recovery daemon to verify that a remote
4306    node has the expected ip allocation.
4307    This is verified against ctdb->ip_tree
4308 */
4309 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4310                                 struct ctdb_all_public_ips *ips,
4311                                 uint32_t pnn)
4312 {
4313         struct ctdb_public_ip_list *tmp_ip; 
4314         int i;
4315
4316         if (ctdb->ip_tree == NULL) {
4317                 /* dont know the expected allocation yet, assume remote node
4318                    is correct. */
4319                 return 0;
4320         }
4321
4322         if (ips == NULL) {
4323                 return 0;
4324         }
4325
4326         for (i=0; i<ips->num; i++) {
4327                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4328                 if (tmp_ip == NULL) {
4329                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4330                         return -1;
4331                 }
4332
4333                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4334                         continue;
4335                 }
4336
4337                 if (tmp_ip->pnn != ips->ips[i].pnn) {
4338                         DEBUG(DEBUG_ERR,
4339                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4340                                pnn,
4341                                ctdb_addr_to_str(&ips->ips[i].addr),
4342                                ips->ips[i].pnn, tmp_ip->pnn));
4343                         return -1;
4344                 }
4345         }
4346
4347         return 0;
4348 }
4349
4350 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4351 {
4352         struct ctdb_public_ip_list *tmp_ip; 
4353
4354         if (ctdb->ip_tree == NULL) {
4355                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4356                 return -1;
4357         }
4358
4359         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4360         if (tmp_ip == NULL) {
4361                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4362                 return -1;
4363         }
4364
4365         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4366         tmp_ip->pnn = ip->pnn;
4367
4368         return 0;
4369 }
4370
4371
/* bookkeeping for one in-flight "reload public ips" request */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;      /* owning daemon context */
        struct ctdb_req_control *c;     /* deferred control, replied to by the destructor */
        int status;                     /* status sent in that reply */
        int fd[2];                      /* pipe: the child writes its result to fd[1] */
        pid_t child;                    /* pid of the forked reloadips child */
        struct fd_event *fde;           /* read event registered on fd[0] */
};
4380
4381 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4382 {
4383         if (h == h->ctdb->reload_ips) {
4384                 h->ctdb->reload_ips = NULL;
4385         }
4386         if (h->c != NULL) {
4387                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4388                 h->c = NULL;
4389         }
4390         ctdb_kill(h->ctdb, h->child, SIGKILL);
4391         return 0;
4392 }
4393
4394 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4395                                 struct timed_event *te,
4396                                 struct timeval t, void *private_data)
4397 {
4398         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4399
4400         talloc_free(h);
4401 }       
4402
4403 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4404                              uint16_t flags, void *private_data)
4405 {
4406         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4407
4408         char res;
4409         int ret;
4410
4411         ret = sys_read(h->fd[0], &res, 1);
4412         if (ret < 1 || res != 0) {
4413                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4414                 res = 1;
4415         }
4416         h->status = res;
4417
4418         talloc_free(h);
4419 }
4420
4421 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4422 {
4423         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4424         struct ctdb_all_public_ips *ips;
4425         struct ctdb_vnn *vnn;
4426         struct client_async_data *async_data;
4427         struct timeval timeout;
4428         TDB_DATA data;
4429         struct ctdb_client_control_state *state;
4430         bool first_add;
4431         int i, ret;
4432
4433         CTDB_NO_MEMORY(ctdb, mem_ctx);
4434
4435         /* Read IPs from local node */
4436         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4437                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
4438         if (ret != 0) {
4439                 DEBUG(DEBUG_ERR,
4440                       ("Unable to fetch public IPs from local node\n"));
4441                 talloc_free(mem_ctx);
4442                 return -1;
4443         }
4444
4445         /* Read IPs file - this is safe since this is a child process */
4446         ctdb->vnn = NULL;
4447         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4448                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4449                 talloc_free(mem_ctx);
4450                 return -1;
4451         }
4452
4453         async_data = talloc_zero(mem_ctx, struct client_async_data);
4454         CTDB_NO_MEMORY(ctdb, async_data);
4455
4456         /* Compare IPs between node and file for IPs to be deleted */
4457         for (i = 0; i < ips->num; i++) {
4458                 /* */
4459                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4460                         if (ctdb_same_ip(&vnn->public_address,
4461                                          &ips->ips[i].addr)) {
4462                                 /* IP is still in file */
4463                                 break;
4464                         }
4465                 }
4466
4467                 if (vnn == NULL) {
4468                         /* Delete IP ips->ips[i] */
4469                         struct ctdb_control_ip_iface *pub;
4470
4471                         DEBUG(DEBUG_NOTICE,
4472                               ("IP %s no longer configured, deleting it\n",
4473                                ctdb_addr_to_str(&ips->ips[i].addr)));
4474
4475                         pub = talloc_zero(mem_ctx,
4476                                           struct ctdb_control_ip_iface);
4477                         CTDB_NO_MEMORY(ctdb, pub);
4478
4479                         pub->addr  = ips->ips[i].addr;
4480                         pub->mask  = 0;
4481                         pub->len   = 0;
4482
4483                         timeout = TAKEOVER_TIMEOUT();
4484
4485                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4486                                               iface) + pub->len;
4487                         data.dptr = (uint8_t *)pub;
4488
4489                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4490                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
4491                                                   0, data, async_data,
4492                                                   &timeout, NULL);
4493                         if (state == NULL) {
4494                                 DEBUG(DEBUG_ERR,
4495                                       (__location__
4496                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4497                                 goto failed;
4498                         }
4499
4500                         ctdb_client_async_add(async_data, state);
4501                 }
4502         }
4503
4504         /* Compare IPs between node and file for IPs to be added */
4505         first_add = true;
4506         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4507                 for (i = 0; i < ips->num; i++) {
4508                         if (ctdb_same_ip(&vnn->public_address,
4509                                          &ips->ips[i].addr)) {
4510                                 /* IP already on node */
4511                                 break;
4512                         }
4513                 }
4514                 if (i == ips->num) {
4515                         /* Add IP ips->ips[i] */
4516                         struct ctdb_control_ip_iface *pub;
4517                         const char *ifaces = NULL;
4518                         uint32_t len;
4519                         int iface = 0;
4520
4521                         DEBUG(DEBUG_NOTICE,
4522                               ("New IP %s configured, adding it\n",
4523                                ctdb_addr_to_str(&vnn->public_address)));
4524                         if (first_add) {
4525                                 uint32_t pnn = ctdb_get_pnn(ctdb);
4526
4527                                 data.dsize = sizeof(pnn);
4528                                 data.dptr  = (uint8_t *)&pnn;
4529
4530                                 ret = ctdb_client_send_message(
4531                                         ctdb,
4532                                         CTDB_BROADCAST_CONNECTED,
4533                                         CTDB_SRVID_REBALANCE_NODE,
4534                                         data);
4535                                 if (ret != 0) {
4536                                         DEBUG(DEBUG_WARNING,
4537                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4538                                 }
4539
4540                                 first_add = false;
4541                         }
4542
4543                         ifaces = vnn->ifaces[0];
4544                         iface = 1;
4545                         while (vnn->ifaces[iface] != NULL) {
4546                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4547                                                          vnn->ifaces[iface]);
4548                                 iface++;
4549                         }
4550
4551                         len   = strlen(ifaces) + 1;
4552                         pub = talloc_zero_size(mem_ctx,
4553                                                offsetof(struct ctdb_control_ip_iface, iface) + len);
4554                         CTDB_NO_MEMORY(ctdb, pub);
4555
4556                         pub->addr  = vnn->public_address;
4557                         pub->mask  = vnn->public_netmask_bits;
4558                         pub->len   = len;
4559                         memcpy(&pub->iface[0], ifaces, pub->len);
4560
4561                         timeout = TAKEOVER_TIMEOUT();
4562
4563                         data.dsize = offsetof(struct ctdb_control_ip_iface,
4564                                               iface) + pub->len;
4565                         data.dptr = (uint8_t *)pub;
4566
4567                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4568                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
4569                                                   0, data, async_data,
4570                                                   &timeout, NULL);
4571                         if (state == NULL) {
4572                                 DEBUG(DEBUG_ERR,
4573                                       (__location__
4574                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4575                                 goto failed;
4576                         }
4577
4578                         ctdb_client_async_add(async_data, state);
4579                 }
4580         }
4581
4582         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4583                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4584                 goto failed;
4585         }
4586
4587         talloc_free(mem_ctx);
4588         return 0;
4589
4590 failed:
4591         talloc_free(mem_ctx);
4592         return -1;
4593 }
4594
4595 /* This control is sent to force the node to re-read the public addresses file
4596    and drop any addresses we should nnot longer host, and add new addresses
4597    that we are now able to host
4598 */
4599 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4600 {
4601         struct ctdb_reloadips_handle *h;
4602         pid_t parent = getpid();
4603
4604         if (ctdb->reload_ips != NULL) {
4605                 talloc_free(ctdb->reload_ips);
4606                 ctdb->reload_ips = NULL;
4607         }
4608
4609         h = talloc(ctdb, struct ctdb_reloadips_handle);
4610         CTDB_NO_MEMORY(ctdb, h);
4611         h->ctdb     = ctdb;
4612         h->c        = NULL;
4613         h->status   = -1;
4614         
4615         if (pipe(h->fd) == -1) {
4616                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4617                 talloc_free(h);
4618                 return -1;
4619         }
4620
4621         h->child = ctdb_fork(ctdb);
4622         if (h->child == (pid_t)-1) {
4623                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4624                 close(h->fd[0]);
4625                 close(h->fd[1]);
4626                 talloc_free(h);
4627                 return -1;
4628         }
4629
4630         /* child process */
4631         if (h->child == 0) {
4632                 signed char res = 0;
4633
4634                 close(h->fd[0]);
4635                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4636
4637                 ctdb_set_process_name("ctdb_reloadips");
4638                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4639                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4640                         res = -1;
4641                 } else {
4642                         res = ctdb_reloadips_child(ctdb);
4643                         if (res != 0) {
4644                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4645                         }
4646                 }
4647
4648                 sys_write(h->fd[1], &res, 1);
4649                 /* make sure we die when our parent dies */
4650                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4651                         sleep(5);
4652                 }
4653                 _exit(0);
4654         }
4655
4656         h->c             = talloc_steal(h, c);
4657
4658         close(h->fd[1]);
4659         set_close_on_exec(h->fd[0]);
4660
4661         talloc_set_destructor(h, ctdb_reloadips_destructor);
4662
4663
4664         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4665                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4666                         (void *)h);
4667         tevent_fd_set_auto_close(h->fde);
4668
4669         event_add_timed(ctdb->ev, h,
4670                         timeval_current_ofs(120, 0),
4671                         ctdb_reloadips_timeout_event, h);
4672
4673         /* we reply later */
4674         *async_reply = true;
4675         return 0;
4676 }