ctdb-ipalloc: Split IP allocation into its own build subsystem
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44 #include "server/ipalloc.h"
45
/*
  timeout value built from the takeover_timeout tunable; expands to an
  expression that needs a variable named "ctdb" in scope
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
  gratuitous ARP schedule: an announcement is repeated until it has been
  sent CTDB_ARP_REPEAT times, with CTDB_ARP_INTERVAL as the first
  argument of the timer offset between rounds
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
50
/*
  a local network interface that public addresses can be hosted on;
  entries are linked into the ctdb->ifaces list
 */
struct ctdb_interface {
	struct ctdb_interface *prev;
	struct ctdb_interface *next;
	const char *name;	/* interface name, used as the lookup key */
	bool link_up;		/* interfaces that are down are never picked */
	uint32_t references;	/* number of vnns currently assigned here */
};
57
58 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
59 {
60         if (vnn->iface) {
61                 return vnn->iface->name;
62         }
63
64         return "__none__";
65 }
66
67 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
68 {
69         struct ctdb_interface *i;
70
71         /* Verify that we don't have an entry for this ip yet */
72         for (i=ctdb->ifaces;i;i=i->next) {
73                 if (strcmp(i->name, iface) == 0) {
74                         return 0;
75                 }
76         }
77
78         /* create a new structure for this interface */
79         i = talloc_zero(ctdb, struct ctdb_interface);
80         CTDB_NO_MEMORY_FATAL(ctdb, i);
81         i->name = talloc_strdup(i, iface);
82         CTDB_NO_MEMORY(ctdb, i->name);
83
84         i->link_up = true;
85
86         DLIST_ADD(ctdb->ifaces, i);
87
88         return 0;
89 }
90
91 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
92                                         const char *name)
93 {
94         int n;
95
96         for (n = 0; vnn->ifaces[n] != NULL; n++) {
97                 if (strcmp(name, vnn->ifaces[n]) == 0) {
98                         return true;
99                 }
100         }
101
102         return false;
103 }
104
105 /* If any interfaces now have no possible IPs then delete them.  This
106  * implementation is naive (i.e. simple) rather than clever
107  * (i.e. complex).  Given that this is run on delip and that operation
108  * is rare, this doesn't need to be efficient - it needs to be
109  * foolproof.  One alternative is reference counting, where the logic
110  * is distributed and can, therefore, be broken in multiple places.
111  * Another alternative is to build a red-black tree of interfaces that
112  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
113  * once) and then walking ctdb->ifaces once and deleting those not in
114  * the tree.  Let's go to one of those if the naive implementation
115  * causes problems...  :-)
116  */
117 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
118                                         struct ctdb_vnn *vnn)
119 {
120         struct ctdb_interface *i, *next;
121
122         /* For each interface, check if there's an IP using it. */
123         for (i = ctdb->ifaces; i != NULL; i = next) {
124                 struct ctdb_vnn *tv;
125                 bool found;
126                 next = i->next;
127
128                 /* Only consider interfaces named in the given VNN. */
129                 if (!vnn_has_interface_with_name(vnn, i->name)) {
130                         continue;
131                 }
132
133                 /* Is the "single IP" on this interface? */
134                 if ((ctdb->single_ip_vnn != NULL) &&
135                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
136                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
137                         /* Found, next interface please... */
138                         continue;
139                 }
140                 /* Search for a vnn with this interface. */
141                 found = false;
142                 for (tv=ctdb->vnn; tv; tv=tv->next) {
143                         if (vnn_has_interface_with_name(tv, i->name)) {
144                                 found = true;
145                                 break;
146                         }
147                 }
148
149                 if (!found) {
150                         /* None of the VNNs are using this interface. */
151                         DLIST_REMOVE(ctdb->ifaces, i);
152                         talloc_free(i);
153                 }
154         }
155 }
156
157
158 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
159                                               const char *iface)
160 {
161         struct ctdb_interface *i;
162
163         for (i=ctdb->ifaces;i;i=i->next) {
164                 if (strcmp(i->name, iface) == 0) {
165                         return i;
166                 }
167         }
168
169         return NULL;
170 }
171
172 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
173                                                   struct ctdb_vnn *vnn)
174 {
175         int i;
176         struct ctdb_interface *cur = NULL;
177         struct ctdb_interface *best = NULL;
178
179         for (i=0; vnn->ifaces[i]; i++) {
180
181                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
182                 if (cur == NULL) {
183                         continue;
184                 }
185
186                 if (!cur->link_up) {
187                         continue;
188                 }
189
190                 if (best == NULL) {
191                         best = cur;
192                         continue;
193                 }
194
195                 if (cur->references < best->references) {
196                         best = cur;
197                         continue;
198                 }
199         }
200
201         return best;
202 }
203
204 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
205                                      struct ctdb_vnn *vnn)
206 {
207         struct ctdb_interface *best = NULL;
208
209         if (vnn->iface) {
210                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
211                                    "still assigned to iface '%s'\n",
212                                    ctdb_addr_to_str(&vnn->public_address),
213                                    ctdb_vnn_iface_string(vnn)));
214                 return 0;
215         }
216
217         best = ctdb_vnn_best_iface(ctdb, vnn);
218         if (best == NULL) {
219                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
220                                   "cannot assign to iface any iface\n",
221                                   ctdb_addr_to_str(&vnn->public_address)));
222                 return -1;
223         }
224
225         vnn->iface = best;
226         best->references++;
227         vnn->pnn = ctdb->pnn;
228
229         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
230                            "now assigned to iface '%s' refs[%d]\n",
231                            ctdb_addr_to_str(&vnn->public_address),
232                            ctdb_vnn_iface_string(vnn),
233                            best->references));
234         return 0;
235 }
236
237 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
238                                     struct ctdb_vnn *vnn)
239 {
240         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
241                            "now unassigned (old iface '%s' refs[%d])\n",
242                            ctdb_addr_to_str(&vnn->public_address),
243                            ctdb_vnn_iface_string(vnn),
244                            vnn->iface?vnn->iface->references:0));
245         if (vnn->iface) {
246                 vnn->iface->references--;
247         }
248         vnn->iface = NULL;
249         if (vnn->pnn == ctdb->pnn) {
250                 vnn->pnn = -1;
251         }
252 }
253
254 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
255                                struct ctdb_vnn *vnn)
256 {
257         int i;
258
259         /* Nodes that are not RUNNING can not host IPs */
260         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
261                 return false;
262         }
263
264         if (vnn->delete_pending) {
265                 return false;
266         }
267
268         if (vnn->iface && vnn->iface->link_up) {
269                 return true;
270         }
271
272         for (i=0; vnn->ifaces[i]; i++) {
273                 struct ctdb_interface *cur;
274
275                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
276                 if (cur == NULL) {
277                         continue;
278                 }
279
280                 if (cur->link_up) {
281                         return true;
282                 }
283         }
284
285         return false;
286 }
287
288 struct ctdb_takeover_arp {
289         struct ctdb_context *ctdb;
290         uint32_t count;
291         ctdb_sock_addr addr;
292         struct ctdb_tcp_array *tcparray;
293         struct ctdb_vnn *vnn;
294 };
295
296
297 /*
298   lists of tcp endpoints
299  */
300 struct ctdb_tcp_list {
301         struct ctdb_tcp_list *prev, *next;
302         struct ctdb_connection connection;
303 };
304
305 /*
306   list of clients to kill on IP release
307  */
308 struct ctdb_client_ip {
309         struct ctdb_client_ip *prev, *next;
310         struct ctdb_context *ctdb;
311         ctdb_sock_addr addr;
312         uint32_t client_id;
313 };
314
315
316 /*
317   send a gratuitous arp
318  */
319 static void ctdb_control_send_arp(struct tevent_context *ev,
320                                   struct tevent_timer *te,
321                                   struct timeval t, void *private_data)
322 {
323         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
324                                                         struct ctdb_takeover_arp);
325         int i, ret;
326         struct ctdb_tcp_array *tcparray;
327         const char *iface = ctdb_vnn_iface_string(arp->vnn);
328
329         ret = ctdb_sys_send_arp(&arp->addr, iface);
330         if (ret != 0) {
331                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
332                                   iface, strerror(errno)));
333         }
334
335         tcparray = arp->tcparray;
336         if (tcparray) {
337                 for (i=0;i<tcparray->num;i++) {
338                         struct ctdb_connection *tcon;
339
340                         tcon = &tcparray->connections[i];
341                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
342                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
343                                 ctdb_addr_to_str(&tcon->src),
344                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
345                         ret = ctdb_sys_send_tcp(
346                                 &tcon->src,
347                                 &tcon->dst,
348                                 0, 0, 0);
349                         if (ret != 0) {
350                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
351                                         ctdb_addr_to_str(&tcon->src)));
352                         }
353                 }
354         }
355
356         arp->count++;
357
358         if (arp->count == CTDB_ARP_REPEAT) {
359                 talloc_free(arp);
360                 return;
361         }
362
363         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
364                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
365                          ctdb_control_send_arp, arp);
366 }
367
368 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
369                                        struct ctdb_vnn *vnn)
370 {
371         struct ctdb_takeover_arp *arp;
372         struct ctdb_tcp_array *tcparray;
373
374         if (!vnn->takeover_ctx) {
375                 vnn->takeover_ctx = talloc_new(vnn);
376                 if (!vnn->takeover_ctx) {
377                         return -1;
378                 }
379         }
380
381         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
382         if (!arp) {
383                 return -1;
384         }
385
386         arp->ctdb = ctdb;
387         arp->addr = vnn->public_address;
388         arp->vnn  = vnn;
389
390         tcparray = vnn->tcp_array;
391         if (tcparray) {
392                 /* add all of the known tcp connections for this IP to the
393                    list of tcp connections to send tickle acks for */
394                 arp->tcparray = talloc_steal(arp, tcparray);
395
396                 vnn->tcp_array = NULL;
397                 vnn->tcp_update_needed = true;
398         }
399
400         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
401                          timeval_zero(), ctdb_control_send_arp, arp);
402
403         return 0;
404 }
405
406 struct takeover_callback_state {
407         struct ctdb_req_control_old *c;
408         ctdb_sock_addr *addr;
409         struct ctdb_vnn *vnn;
410 };
411
/*
  state of an in-flight takeip event
 */
struct ctdb_do_takeip_state {
	struct ctdb_req_control_old *c;	/* control to reply to when done */
	struct ctdb_vnn *vnn;		/* vnn being taken over */
};
416
417 /*
418   called when takeip event finishes
419  */
420 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
421                                     void *private_data)
422 {
423         struct ctdb_do_takeip_state *state =
424                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
425         int32_t ret;
426         TDB_DATA data;
427
428         if (status != 0) {
429                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
430         
431                 if (status == -ETIME) {
432                         ctdb_ban_self(ctdb);
433                 }
434                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
435                                  ctdb_addr_to_str(&state->vnn->public_address),
436                                  ctdb_vnn_iface_string(state->vnn)));
437                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
438
439                 node->flags |= NODE_FLAGS_UNHEALTHY;
440                 talloc_free(state);
441                 return;
442         }
443
444         if (ctdb->do_checkpublicip) {
445
446         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
447         if (ret != 0) {
448                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
449                 talloc_free(state);
450                 return;
451         }
452
453         }
454
455         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
456         data.dsize = strlen((char *)data.dptr) + 1;
457         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
458
459         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
460
461
462         /* the control succeeded */
463         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
464         talloc_free(state);
465         return;
466 }
467
468 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
469 {
470         state->vnn->update_in_flight = false;
471         return 0;
472 }
473
474 /*
475   take over an ip address
476  */
477 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
478                               struct ctdb_req_control_old *c,
479                               struct ctdb_vnn *vnn)
480 {
481         int ret;
482         struct ctdb_do_takeip_state *state;
483
484         if (vnn->update_in_flight) {
485                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
486                                     "update for this IP already in flight\n",
487                                     ctdb_addr_to_str(&vnn->public_address),
488                                     vnn->public_netmask_bits));
489                 return -1;
490         }
491
492         ret = ctdb_vnn_assign_iface(ctdb, vnn);
493         if (ret != 0) {
494                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
495                                  "assign a usable interface\n",
496                                  ctdb_addr_to_str(&vnn->public_address),
497                                  vnn->public_netmask_bits));
498                 return -1;
499         }
500
501         state = talloc(vnn, struct ctdb_do_takeip_state);
502         CTDB_NO_MEMORY(ctdb, state);
503
504         state->c = talloc_steal(ctdb, c);
505         state->vnn   = vnn;
506
507         vnn->update_in_flight = true;
508         talloc_set_destructor(state, ctdb_takeip_destructor);
509
510         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
511                             ctdb_addr_to_str(&vnn->public_address),
512                             vnn->public_netmask_bits,
513                             ctdb_vnn_iface_string(vnn)));
514
515         ret = ctdb_event_script_callback(ctdb,
516                                          state,
517                                          ctdb_do_takeip_callback,
518                                          state,
519                                          CTDB_EVENT_TAKE_IP,
520                                          "%s %s %u",
521                                          ctdb_vnn_iface_string(vnn),
522                                          ctdb_addr_to_str(&vnn->public_address),
523                                          vnn->public_netmask_bits);
524
525         if (ret != 0) {
526                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
527                         ctdb_addr_to_str(&vnn->public_address),
528                         ctdb_vnn_iface_string(vnn)));
529                 talloc_free(state);
530                 return -1;
531         }
532
533         return 0;
534 }
535
/*
  state of an in-flight updateip event
 */
struct ctdb_do_updateip_state {
	struct ctdb_req_control_old *c;	/* control to reply to when done */
	struct ctdb_interface *old;	/* interface the address moves from */
	struct ctdb_vnn *vnn;		/* vnn being moved */
};
541
542 /*
543   called when updateip event finishes
544  */
545 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
546                                       void *private_data)
547 {
548         struct ctdb_do_updateip_state *state =
549                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
550         int32_t ret;
551
552         if (status != 0) {
553                 if (status == -ETIME) {
554                         ctdb_ban_self(ctdb);
555                 }
556                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
557                         ctdb_addr_to_str(&state->vnn->public_address),
558                         state->old->name,
559                         ctdb_vnn_iface_string(state->vnn)));
560
561                 /*
562                  * All we can do is reset the old interface
563                  * and let the next run fix it
564                  */
565                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
566                 state->vnn->iface = state->old;
567                 state->vnn->iface->references++;
568
569                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
570                 talloc_free(state);
571                 return;
572         }
573
574         if (ctdb->do_checkpublicip) {
575
576         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
577         if (ret != 0) {
578                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
579                 talloc_free(state);
580                 return;
581         }
582
583         }
584
585         /* the control succeeded */
586         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
587         talloc_free(state);
588         return;
589 }
590
591 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
592 {
593         state->vnn->update_in_flight = false;
594         return 0;
595 }
596
597 /*
598   update (move) an ip address
599  */
600 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
601                                 struct ctdb_req_control_old *c,
602                                 struct ctdb_vnn *vnn)
603 {
604         int ret;
605         struct ctdb_do_updateip_state *state;
606         struct ctdb_interface *old = vnn->iface;
607         const char *new_name;
608
609         if (vnn->update_in_flight) {
610                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
611                                     "update for this IP already in flight\n",
612                                     ctdb_addr_to_str(&vnn->public_address),
613                                     vnn->public_netmask_bits));
614                 return -1;
615         }
616
617         ctdb_vnn_unassign_iface(ctdb, vnn);
618         ret = ctdb_vnn_assign_iface(ctdb, vnn);
619         if (ret != 0) {
620                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
621                                  "assin a usable interface (old iface '%s')\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  vnn->public_netmask_bits,
624                                  old->name));
625                 return -1;
626         }
627
628         new_name = ctdb_vnn_iface_string(vnn);
629         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
630                 /* A benign update from one interface onto itself.
631                  * no need to run the eventscripts in this case, just return
632                  * success.
633                  */
634                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
635                 return 0;
636         }
637
638         state = talloc(vnn, struct ctdb_do_updateip_state);
639         CTDB_NO_MEMORY(ctdb, state);
640
641         state->c = talloc_steal(ctdb, c);
642         state->old = old;
643         state->vnn = vnn;
644
645         vnn->update_in_flight = true;
646         talloc_set_destructor(state, ctdb_updateip_destructor);
647
648         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
649                             "interface %s to %s\n",
650                             ctdb_addr_to_str(&vnn->public_address),
651                             vnn->public_netmask_bits,
652                             old->name,
653                             new_name));
654
655         ret = ctdb_event_script_callback(ctdb,
656                                          state,
657                                          ctdb_do_updateip_callback,
658                                          state,
659                                          CTDB_EVENT_UPDATE_IP,
660                                          "%s %s %s %u",
661                                          state->old->name,
662                                          new_name,
663                                          ctdb_addr_to_str(&vnn->public_address),
664                                          vnn->public_netmask_bits);
665         if (ret != 0) {
666                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
667                                  ctdb_addr_to_str(&vnn->public_address),
668                                  old->name, new_name));
669                 talloc_free(state);
670                 return -1;
671         }
672
673         return 0;
674 }
675
676 /*
677   Find the vnn of the node that has a public ip address
678   returns -1 if the address is not known as a public address
679  */
680 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
681 {
682         struct ctdb_vnn *vnn;
683
684         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
685                 if (ctdb_same_ip(&vnn->public_address, addr)) {
686                         return vnn;
687                 }
688         }
689
690         return NULL;
691 }
692
693 /*
694   take over an ip address
695  */
696 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
697                                  struct ctdb_req_control_old *c,
698                                  TDB_DATA indata,
699                                  bool *async_reply)
700 {
701         int ret;
702         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
703         struct ctdb_vnn *vnn;
704         bool have_ip = false;
705         bool do_updateip = false;
706         bool do_takeip = false;
707         struct ctdb_interface *best_iface = NULL;
708
709         if (pip->pnn != ctdb->pnn) {
710                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
711                                  "with pnn %d, but we're node %d\n",
712                                  ctdb_addr_to_str(&pip->addr),
713                                  pip->pnn, ctdb->pnn));
714                 return -1;
715         }
716
717         /* update out vnn list */
718         vnn = find_public_ip_vnn(ctdb, &pip->addr);
719         if (vnn == NULL) {
720                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
721                         ctdb_addr_to_str(&pip->addr)));
722                 return 0;
723         }
724
725         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
726                 have_ip = ctdb_sys_have_ip(&pip->addr);
727         }
728         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
729         if (best_iface == NULL) {
730                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
731                                  "a usable interface (old %s, have_ip %d)\n",
732                                  ctdb_addr_to_str(&vnn->public_address),
733                                  vnn->public_netmask_bits,
734                                  ctdb_vnn_iface_string(vnn),
735                                  have_ip));
736                 return -1;
737         }
738
739         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
740                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
741                 have_ip = false;
742         }
743
744
745         if (vnn->iface == NULL && have_ip) {
746                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
747                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
748                                  ctdb_addr_to_str(&vnn->public_address)));
749                 return 0;
750         }
751
752         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
753                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
754                                   "and we have it on iface[%s], but it was assigned to node %d"
755                                   "and we are node %d, banning ourself\n",
756                                  ctdb_addr_to_str(&vnn->public_address),
757                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
758                 ctdb_ban_self(ctdb);
759                 return -1;
760         }
761
762         if (vnn->pnn == -1 && have_ip) {
763                 vnn->pnn = ctdb->pnn;
764                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
765                                   "and we already have it on iface[%s], update local daemon\n",
766                                  ctdb_addr_to_str(&vnn->public_address),
767                                   ctdb_vnn_iface_string(vnn)));
768                 return 0;
769         }
770
771         if (vnn->iface) {
772                 if (vnn->iface != best_iface) {
773                         if (!vnn->iface->link_up) {
774                                 do_updateip = true;
775                         } else if (vnn->iface->references > (best_iface->references + 1)) {
776                                 /* only move when the rebalance gains something */
777                                         do_updateip = true;
778                         }
779                 }
780         }
781
782         if (!have_ip) {
783                 if (do_updateip) {
784                         ctdb_vnn_unassign_iface(ctdb, vnn);
785                         do_updateip = false;
786                 }
787                 do_takeip = true;
788         }
789
790         if (do_takeip) {
791                 ret = ctdb_do_takeip(ctdb, c, vnn);
792                 if (ret != 0) {
793                         return -1;
794                 }
795         } else if (do_updateip) {
796                 ret = ctdb_do_updateip(ctdb, c, vnn);
797                 if (ret != 0) {
798                         return -1;
799                 }
800         } else {
801                 /*
802                  * The interface is up and the kernel known the ip
803                  * => do nothing
804                  */
805                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
806                         ctdb_addr_to_str(&pip->addr),
807                         vnn->public_netmask_bits,
808                         ctdb_vnn_iface_string(vnn)));
809                 return 0;
810         }
811
812         /* tell ctdb_control.c that we will be replying asynchronously */
813         *async_reply = true;
814
815         return 0;
816 }
817
818 /*
819   kill any clients that are registered with a IP that is being released
820  */
821 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
822 {
823         struct ctdb_client_ip *ip;
824
825         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
826                 ctdb_addr_to_str(addr)));
827
828         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
829                 ctdb_sock_addr tmp_addr;
830
831                 tmp_addr = ip->addr;
832                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
833                         ip->client_id,
834                         ctdb_addr_to_str(&ip->addr)));
835
836                 if (ctdb_same_ip(&tmp_addr, addr)) {
837                         struct ctdb_client *client = reqid_find(ctdb->idr,
838                                                                 ip->client_id,
839                                                                 struct ctdb_client);
840                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
841                                 ip->client_id,
842                                 ctdb_addr_to_str(&ip->addr),
843                                 client->pid));
844
845                         if (client->pid != 0) {
846                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
847                                         (unsigned)client->pid,
848                                         ctdb_addr_to_str(addr),
849                                         ip->client_id));
850                                 kill(client->pid, SIGKILL);
851                         }
852                 }
853         }
854 }
855
856 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
857 {
858         DLIST_REMOVE(ctdb->vnn, vnn);
859         ctdb_vnn_unassign_iface(ctdb, vnn);
860         ctdb_remove_orphaned_ifaces(ctdb, vnn);
861         talloc_free(vnn);
862 }
863
864 /*
865   called when releaseip event finishes
866  */
867 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
868                                 void *private_data)
869 {
870         struct takeover_callback_state *state = 
871                 talloc_get_type(private_data, struct takeover_callback_state);
872         TDB_DATA data;
873
874         if (status == -ETIME) {
875                 ctdb_ban_self(ctdb);
876         }
877
878         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
879                 if  (ctdb_sys_have_ip(state->addr)) {
880                         DEBUG(DEBUG_ERR,
881                               ("IP %s still hosted during release IP callback, failing\n",
882                                ctdb_addr_to_str(state->addr)));
883                         ctdb_request_control_reply(ctdb, state->c,
884                                                    NULL, -1, NULL);
885                         talloc_free(state);
886                         return;
887                 }
888         }
889
890         /* send a message to all clients of this node telling them
891            that the cluster has been reconfigured and they should
892            release any sockets on this IP */
893         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
894         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
895         data.dsize = strlen((char *)data.dptr)+1;
896
897         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
898
899         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
900
901         /* kill clients that have registered with this IP */
902         release_kill_clients(ctdb, state->addr);
903
904         ctdb_vnn_unassign_iface(ctdb, state->vnn);
905
906         /* Process the IP if it has been marked for deletion */
907         if (state->vnn->delete_pending) {
908                 do_delete_ip(ctdb, state->vnn);
909                 state->vnn = NULL;
910         }
911
912         /* the control succeeded */
913         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
914         talloc_free(state);
915 }
916
917 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
918 {
919         if (state->vnn != NULL) {
920                 state->vnn->update_in_flight = false;
921         }
922         return 0;
923 }
924
925 /*
926   release an ip address
927  */
928 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
929                                 struct ctdb_req_control_old *c,
930                                 TDB_DATA indata, 
931                                 bool *async_reply)
932 {
933         int ret;
934         struct takeover_callback_state *state;
935         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
936         struct ctdb_vnn *vnn;
937         char *iface;
938
939         /* update our vnn list */
940         vnn = find_public_ip_vnn(ctdb, &pip->addr);
941         if (vnn == NULL) {
942                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
943                         ctdb_addr_to_str(&pip->addr)));
944                 return 0;
945         }
946         vnn->pnn = pip->pnn;
947
948         /* stop any previous arps */
949         talloc_free(vnn->takeover_ctx);
950         vnn->takeover_ctx = NULL;
951
952         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
953          * lazy multicast to drop an IP from any node that isn't the
954          * intended new node.  The following causes makes ctdbd ignore
955          * a release for any address it doesn't host.
956          */
957         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
958                 if (!ctdb_sys_have_ip(&pip->addr)) {
959                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
960                                 ctdb_addr_to_str(&pip->addr),
961                                 vnn->public_netmask_bits,
962                                 ctdb_vnn_iface_string(vnn)));
963                         ctdb_vnn_unassign_iface(ctdb, vnn);
964                         return 0;
965                 }
966         } else {
967                 if (vnn->iface == NULL) {
968                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
969                                            ctdb_addr_to_str(&pip->addr),
970                                            vnn->public_netmask_bits));
971                         return 0;
972                 }
973         }
974
975         /* There is a potential race between take_ip and us because we
976          * update the VNN via a callback that run when the
977          * eventscripts have been run.  Avoid the race by allowing one
978          * update to be in flight at a time.
979          */
980         if (vnn->update_in_flight) {
981                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
982                                     "update for this IP already in flight\n",
983                                     ctdb_addr_to_str(&vnn->public_address),
984                                     vnn->public_netmask_bits));
985                 return -1;
986         }
987
988         iface = strdup(ctdb_vnn_iface_string(vnn));
989
990         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
991                 ctdb_addr_to_str(&pip->addr),
992                 vnn->public_netmask_bits,
993                 iface,
994                 pip->pnn));
995
996         state = talloc(ctdb, struct takeover_callback_state);
997         if (state == NULL) {
998                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
999                                __FILE__, __LINE__);
1000                 free(iface);
1001                 return -1;
1002         }
1003
1004         state->c = talloc_steal(state, c);
1005         state->addr = talloc(state, ctdb_sock_addr);       
1006         if (state->addr == NULL) {
1007                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1008                                __FILE__, __LINE__);
1009                 free(iface);
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013         *state->addr = pip->addr;
1014         state->vnn   = vnn;
1015
1016         vnn->update_in_flight = true;
1017         talloc_set_destructor(state, ctdb_releaseip_destructor);
1018
1019         ret = ctdb_event_script_callback(ctdb, 
1020                                          state, release_ip_callback, state,
1021                                          CTDB_EVENT_RELEASE_IP,
1022                                          "%s %s %u",
1023                                          iface,
1024                                          ctdb_addr_to_str(&pip->addr),
1025                                          vnn->public_netmask_bits);
1026         free(iface);
1027         if (ret != 0) {
1028                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1029                         ctdb_addr_to_str(&pip->addr),
1030                         ctdb_vnn_iface_string(vnn)));
1031                 talloc_free(state);
1032                 return -1;
1033         }
1034
1035         /* tell the control that we will be reply asynchronously */
1036         *async_reply = true;
1037         return 0;
1038 }
1039
1040 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1041                                    ctdb_sock_addr *addr,
1042                                    unsigned mask, const char *ifaces,
1043                                    bool check_address)
1044 {
1045         struct ctdb_vnn      *vnn;
1046         uint32_t num = 0;
1047         char *tmp;
1048         const char *iface;
1049         int i;
1050         int ret;
1051
1052         tmp = strdup(ifaces);
1053         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1054                 if (!ctdb_sys_check_iface_exists(iface)) {
1055                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1056                         free(tmp);
1057                         return -1;
1058                 }
1059         }
1060         free(tmp);
1061
1062         /* Verify that we don't have an entry for this ip yet */
1063         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1064                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1065                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1066                                 ctdb_addr_to_str(addr)));
1067                         return -1;
1068                 }               
1069         }
1070
1071         /* create a new vnn structure for this ip address */
1072         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1073         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1074         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1075         tmp = talloc_strdup(vnn, ifaces);
1076         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1077         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1078                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1080                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1081                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1082                 num++;
1083         }
1084         talloc_free(tmp);
1085         vnn->ifaces[num] = NULL;
1086         vnn->public_address      = *addr;
1087         vnn->public_netmask_bits = mask;
1088         vnn->pnn                 = -1;
1089         if (check_address) {
1090                 if (ctdb_sys_have_ip(addr)) {
1091                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1092                         vnn->pnn = ctdb->pnn;
1093                 }
1094         }
1095
1096         for (i=0; vnn->ifaces[i]; i++) {
1097                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1098                 if (ret != 0) {
1099                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1100                                            "for public_address[%s]\n",
1101                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1102                         talloc_free(vnn);
1103                         return -1;
1104                 }
1105         }
1106
1107         DLIST_ADD(ctdb->vnn, vnn);
1108
1109         return 0;
1110 }
1111
1112 /*
1113   setup the public address lists from a file
1114 */
1115 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1116 {
1117         char **lines;
1118         int nlines;
1119         int i;
1120
1121         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1122         if (lines == NULL) {
1123                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1124                 return -1;
1125         }
1126         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1127                 nlines--;
1128         }
1129
1130         for (i=0;i<nlines;i++) {
1131                 unsigned mask;
1132                 ctdb_sock_addr addr;
1133                 const char *addrstr;
1134                 const char *ifaces;
1135                 char *tok, *line;
1136
1137                 line = lines[i];
1138                 while ((*line == ' ') || (*line == '\t')) {
1139                         line++;
1140                 }
1141                 if (*line == '#') {
1142                         continue;
1143                 }
1144                 if (strcmp(line, "") == 0) {
1145                         continue;
1146                 }
1147                 tok = strtok(line, " \t");
1148                 addrstr = tok;
1149                 tok = strtok(NULL, " \t");
1150                 if (tok == NULL) {
1151                         if (NULL == ctdb->default_public_interface) {
1152                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1153                                          i+1));
1154                                 talloc_free(lines);
1155                                 return -1;
1156                         }
1157                         ifaces = ctdb->default_public_interface;
1158                 } else {
1159                         ifaces = tok;
1160                 }
1161
1162                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1163                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1164                         talloc_free(lines);
1165                         return -1;
1166                 }
1167                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1168                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1169                         talloc_free(lines);
1170                         return -1;
1171                 }
1172         }
1173
1174
1175         talloc_free(lines);
1176         return 0;
1177 }
1178
1179 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1180                               const char *iface,
1181                               const char *ip)
1182 {
1183         struct ctdb_vnn *svnn;
1184         struct ctdb_interface *cur = NULL;
1185         bool ok;
1186         int ret;
1187
1188         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1189         CTDB_NO_MEMORY(ctdb, svnn);
1190
1191         svnn->ifaces = talloc_array(svnn, const char *, 2);
1192         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1193         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1194         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1195         svnn->ifaces[1] = NULL;
1196
1197         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1198         if (!ok) {
1199                 talloc_free(svnn);
1200                 return -1;
1201         }
1202
1203         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1204         if (ret != 0) {
1205                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1206                                    "for single_ip[%s]\n",
1207                                    svnn->ifaces[0],
1208                                    ctdb_addr_to_str(&svnn->public_address)));
1209                 talloc_free(svnn);
1210                 return -1;
1211         }
1212
1213         /* assume the single public ip interface is initially "good" */
1214         cur = ctdb_find_iface(ctdb, iface);
1215         if (cur == NULL) {
1216                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1217                 return -1;
1218         }
1219         cur->link_up = true;
1220
1221         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1222         if (ret != 0) {
1223                 talloc_free(svnn);
1224                 return -1;
1225         }
1226
1227         ctdb->single_ip_vnn = svnn;
1228         return 0;
1229 }
1230
1231 static void *add_ip_callback(void *parm, void *data)
1232 {
1233         struct public_ip_list *this_ip = parm;
1234         struct public_ip_list *prev_ip = data;
1235
1236         if (prev_ip == NULL) {
1237                 return parm;
1238         }
1239         if (this_ip->pnn == -1) {
1240                 this_ip->pnn = prev_ip->pnn;
1241         }
1242
1243         return parm;
1244 }
1245
1246 static int getips_count_callback(void *param, void *data)
1247 {
1248         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1249         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1250
1251         new_ip->next = *ip_list;
1252         *ip_list     = new_ip;
1253         return 0;
1254 }
1255
1256 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1257                                        struct ctdb_public_ip_list_old *ips,
1258                                        uint32_t pnn);
1259
1260 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1261                                          struct ipalloc_state *ipalloc_state,
1262                                          struct ctdb_node_map_old *nodemap)
1263 {
1264         int j;
1265         int ret;
1266
1267         if (ipalloc_state->num != nodemap->num) {
1268                 DEBUG(DEBUG_ERR,
1269                       (__location__
1270                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1271                        ipalloc_state->num, nodemap->num));
1272                 return -1;
1273         }
1274
1275         for (j=0; j<nodemap->num; j++) {
1276                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1277                         continue;
1278                 }
1279
1280                 /* Retrieve the list of known public IPs from the node */
1281                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1282                                         TAKEOVER_TIMEOUT(),
1283                                         j,
1284                                         ipalloc_state->known_public_ips,
1285                                         0,
1286                                         &ipalloc_state->known_public_ips[j]);
1287                 if (ret != 0) {
1288                         DEBUG(DEBUG_ERR,
1289                               ("Failed to read known public IPs from node: %u\n",
1290                                j));
1291                         return -1;
1292                 }
1293
1294                 if (ctdb->do_checkpublicip) {
1295                         verify_remote_ip_allocation(ctdb,
1296                                                     ipalloc_state->known_public_ips[j],
1297                                                     j);
1298                 }
1299
1300                 /* Retrieve the list of available public IPs from the node */
1301                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1302                                         TAKEOVER_TIMEOUT(),
1303                                         j,
1304                                         ipalloc_state->available_public_ips,
1305                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1306                                         &ipalloc_state->available_public_ips[j]);
1307                 if (ret != 0) {
1308                         DEBUG(DEBUG_ERR,
1309                               ("Failed to read available public IPs from node: %u\n",
1310                                j));
1311                         return -1;
1312                 }
1313         }
1314
1315         return 0;
1316 }
1317
1318 static struct public_ip_list *
1319 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1320 {
1321         int i, j;
1322         struct public_ip_list *ip_list;
1323         struct ctdb_public_ip_list_old *public_ips;
1324
1325         TALLOC_FREE(ctdb->ip_tree);
1326         ctdb->ip_tree = trbt_create(ctdb, 0);
1327
1328         for (i=0; i < ctdb->num_nodes; i++) {
1329                 public_ips = ipalloc_state->known_public_ips[i];
1330
1331                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1332                         continue;
1333                 }
1334
1335                 /* there were no public ips for this node */
1336                 if (public_ips == NULL) {
1337                         continue;
1338                 }
1339
1340                 for (j=0; j < public_ips->num; j++) {
1341                         struct public_ip_list *tmp_ip;
1342
1343                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1344                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1345                         /* Do not use information about IP addresses hosted
1346                          * on other nodes, it may not be accurate */
1347                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1348                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1349                         } else {
1350                                 tmp_ip->pnn = -1;
1351                         }
1352                         tmp_ip->addr = public_ips->ips[j].addr;
1353                         tmp_ip->next = NULL;
1354
1355                         trbt_insertarray32_callback(ctdb->ip_tree,
1356                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1357                                 add_ip_callback,
1358                                 tmp_ip);
1359                 }
1360         }
1361
1362         ip_list = NULL;
1363         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1364
1365         return ip_list;
1366 }
1367
1368 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
1369 {
1370         int i;
1371
1372         for (i=0;i<nodemap->num;i++) {
1373                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1374                         /* Found one completely healthy node */
1375                         return false;
1376                 }
1377         }
1378
1379         return true;
1380 }
1381
/*
  State shared between get_tunable_from_nodes() and its per-node
  reply/failure callbacks.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being queried; used in log messages */
        const char *tunable;
        /* Result array, written at index pnn by the reply callback */
        uint32_t *out;
        /* Set by the callbacks when a failure must abort the query */
        bool fatal;
};
1387
1388 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
1389                                  int32_t res, TDB_DATA outdata,
1390                                  void *callback)
1391 {
1392         struct get_tunable_callback_data *cd =
1393                 (struct get_tunable_callback_data *)callback;
1394         int size;
1395
1396         if (res != 0) {
1397                 /* Already handled in fail callback */
1398                 return;
1399         }
1400
1401         if (outdata.dsize != sizeof(uint32_t)) {
1402                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
1403                                  cd->tunable, pnn, (int)sizeof(uint32_t),
1404                                  (int)outdata.dsize));
1405                 cd->fatal = true;
1406                 return;
1407         }
1408
1409         size = talloc_array_length(cd->out);
1410         if (pnn >= size) {
1411                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
1412                                  cd->tunable, pnn, size));
1413                 return;
1414         }
1415
1416                 
1417         cd->out[pnn] = *(uint32_t *)outdata.dptr;
1418 }
1419
1420 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1421                                        int32_t res, TDB_DATA outdata,
1422                                        void *callback)
1423 {
1424         struct get_tunable_callback_data *cd =
1425                 (struct get_tunable_callback_data *)callback;
1426
1427         switch (res) {
1428         case -ETIME:
1429                 DEBUG(DEBUG_ERR,
1430                       ("Timed out getting tunable \"%s\" from node %d\n",
1431                        cd->tunable, pnn));
1432                 cd->fatal = true;
1433                 break;
1434         case -EINVAL:
1435         case -1:
1436                 DEBUG(DEBUG_WARNING,
1437                       ("Tunable \"%s\" not implemented on node %d\n",
1438                        cd->tunable, pnn));
1439                 break;
1440         default:
1441                 DEBUG(DEBUG_ERR,
1442                       ("Unexpected error getting tunable \"%s\" from node %d\n",
1443                        cd->tunable, pnn));
1444                 cd->fatal = true;
1445         }
1446 }
1447
1448 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
1449                                         TALLOC_CTX *tmp_ctx,
1450                                         struct ctdb_node_map_old *nodemap,
1451                                         const char *tunable,
1452                                         uint32_t default_value)
1453 {
1454         TDB_DATA data;
1455         struct ctdb_control_get_tunable *t;
1456         uint32_t *nodes;
1457         uint32_t *tvals;
1458         struct get_tunable_callback_data callback_data;
1459         int i;
1460
1461         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1462         CTDB_NO_MEMORY_NULL(ctdb, tvals);
1463         for (i=0; i<nodemap->num; i++) {
1464                 tvals[i] = default_value;
1465         }
1466                 
1467         callback_data.out = tvals;
1468         callback_data.tunable = tunable;
1469         callback_data.fatal = false;
1470
1471         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
1472         data.dptr  = talloc_size(tmp_ctx, data.dsize);
1473         t = (struct ctdb_control_get_tunable *)data.dptr;
1474         t->length = strlen(tunable)+1;
1475         memcpy(t->name, tunable, t->length);
1476         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1477         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
1478                                       nodes, 0, TAKEOVER_TIMEOUT(),
1479                                       false, data,
1480                                       get_tunable_callback,
1481                                       get_tunable_fail_callback,
1482                                       &callback_data) != 0) {
1483                 if (callback_data.fatal) {
1484                         talloc_free(tvals);
1485                         tvals = NULL;
1486                 }
1487         }
1488         talloc_free(nodes);
1489         talloc_free(data.dptr);
1490
1491         return tvals;
1492 }
1493
1494 /* Set internal flags for IP allocation:
1495  *   Clear ip flags
1496  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
1497  *   Set NOIPHOST ip flag for each INACTIVE node
1498  *   if all nodes are disabled:
1499  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
1500  *   else
1501  *     Set NOIPHOST ip flags for disabled nodes
1502  */
1503 static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
1504                                  struct ctdb_node_map_old *nodemap,
1505                                  uint32_t *tval_noiptakeover,
1506                                  uint32_t *tval_noiphostonalldisabled)
1507 {
1508         int i;
1509
1510         for (i=0;i<nodemap->num;i++) {
1511                 /* Can not take IPs on node with NoIPTakeover set */
1512                 if (tval_noiptakeover[i] != 0) {
1513                         ipalloc_state->noiptakeover[i] = true;
1514                 }
1515
1516                 /* Can not host IPs on INACTIVE node */
1517                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1518                         ipalloc_state->noiphost[i] = true;
1519                 }
1520         }
1521
1522         if (all_nodes_are_disabled(nodemap)) {
1523                 /* If all nodes are disabled, can not host IPs on node
1524                  * with NoIPHostOnAllDisabled set
1525                  */
1526                 for (i=0;i<nodemap->num;i++) {
1527                         if (tval_noiphostonalldisabled[i] != 0) {
1528                                 ipalloc_state->noiphost[i] = true;
1529                         }
1530                 }
1531         } else {
1532                 /* If some nodes are not disabled, then can not host
1533                  * IPs on DISABLED node
1534                  */
1535                 for (i=0;i<nodemap->num;i++) {
1536                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
1537                                 ipalloc_state->noiphost[i] = true;
1538                         }
1539                 }
1540         }
1541 }
1542
1543 static bool set_ipflags(struct ctdb_context *ctdb,
1544                         struct ipalloc_state *ipalloc_state,
1545                         struct ctdb_node_map_old *nodemap)
1546 {
1547         uint32_t *tval_noiptakeover;
1548         uint32_t *tval_noiphostonalldisabled;
1549
1550         tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1551                                                    "NoIPTakeover", 0);
1552         if (tval_noiptakeover == NULL) {
1553                 return false;
1554         }
1555
1556         tval_noiphostonalldisabled =
1557                 get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1558                                        "NoIPHostOnAllDisabled", 0);
1559         if (tval_noiphostonalldisabled == NULL) {
1560                 /* Caller frees tmp_ctx */
1561                 return false;
1562         }
1563
1564         set_ipflags_internal(ipalloc_state, nodemap,
1565                              tval_noiptakeover,
1566                              tval_noiphostonalldisabled);
1567
1568         talloc_free(tval_noiptakeover);
1569         talloc_free(tval_noiphostonalldisabled);
1570
1571         return true;
1572 }
1573
1574 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
1575                                                  TALLOC_CTX *mem_ctx)
1576 {
1577         struct ipalloc_state *ipalloc_state =
1578                 talloc_zero(mem_ctx, struct ipalloc_state);
1579         if (ipalloc_state == NULL) {
1580                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1581                 return NULL;
1582         }
1583
1584         ipalloc_state->num = ctdb->num_nodes;
1585         ipalloc_state->known_public_ips =
1586                 talloc_zero_array(ipalloc_state,
1587                                   struct ctdb_public_ip_list_old *,
1588                                   ipalloc_state->num);
1589         if (ipalloc_state->known_public_ips == NULL) {
1590                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1591                 talloc_free(ipalloc_state);
1592                 return NULL;
1593         }
1594         ipalloc_state->available_public_ips =
1595                 talloc_zero_array(ipalloc_state,
1596                                   struct ctdb_public_ip_list_old *,
1597                                   ipalloc_state->num);
1598         if (ipalloc_state->available_public_ips == NULL) {
1599                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1600                 talloc_free(ipalloc_state);
1601                 return NULL;
1602         }
1603         ipalloc_state->noiptakeover =
1604                 talloc_zero_array(ipalloc_state,
1605                                   bool,
1606                                   ipalloc_state->num);
1607         if (ipalloc_state->noiptakeover == NULL) {
1608                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1609                 talloc_free(ipalloc_state);
1610                 return NULL;
1611         }
1612         ipalloc_state->noiphost =
1613                 talloc_zero_array(ipalloc_state,
1614                                   bool,
1615                                   ipalloc_state->num);
1616         if (ipalloc_state->noiphost == NULL) {
1617                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1618                 talloc_free(ipalloc_state);
1619                 return NULL;
1620         }
1621
1622         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1623                 ipalloc_state->algorithm = IPALLOC_LCP2;
1624         } else if (1 == ctdb->tunable.deterministic_public_ips) {
1625                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
1626         } else {
1627                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
1628         }
1629
1630         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
1631
1632         return ipalloc_state;
1633 }
1634
1635 struct iprealloc_callback_data {
1636         bool *retry_nodes;
1637         int retry_count;
1638         client_async_callback fail_callback;
1639         void *fail_callback_data;
1640         struct ctdb_node_map_old *nodemap;
1641 };
1642
1643 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1644                                         int32_t res, TDB_DATA outdata,
1645                                         void *callback)
1646 {
1647         int numnodes;
1648         struct iprealloc_callback_data *cd =
1649                 (struct iprealloc_callback_data *)callback;
1650
1651         numnodes = talloc_array_length(cd->retry_nodes);
1652         if (pnn > numnodes) {
1653                 DEBUG(DEBUG_ERR,
1654                       ("ipreallocated failure from node %d, "
1655                        "but only %d nodes in nodemap\n",
1656                        pnn, numnodes));
1657                 return;
1658         }
1659
1660         /* Can't run the "ipreallocated" event on a INACTIVE node */
1661         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
1662                 DEBUG(DEBUG_WARNING,
1663                       ("ipreallocated failed on inactive node %d, ignoring\n",
1664                        pnn));
1665                 return;
1666         }
1667
1668         switch (res) {
1669         case -ETIME:
1670                 /* If the control timed out then that's a real error,
1671                  * so call the real fail callback
1672                  */
1673                 if (cd->fail_callback) {
1674                         cd->fail_callback(ctdb, pnn, res, outdata,
1675                                           cd->fail_callback_data);
1676                 } else {
1677                         DEBUG(DEBUG_WARNING,
1678                               ("iprealloc timed out but no callback registered\n"));
1679                 }
1680                 break;
1681         default:
1682                 /* If not a timeout then either the ipreallocated
1683                  * eventscript (or some setup) failed.  This might
1684                  * have failed because the IPREALLOCATED control isn't
1685                  * implemented - right now there is no way of knowing
1686                  * because the error codes are all folded down to -1.
1687                  * Consider retrying using EVENTSCRIPT control...
1688                  */
1689                 DEBUG(DEBUG_WARNING,
1690                       ("ipreallocated failure from node %d, flagging retry\n",
1691                        pnn));
1692                 cd->retry_nodes[pnn] = true;
1693                 cd->retry_count++;
1694         }
1695 }
1696
1697 struct takeover_callback_data {
1698         bool *node_failed;
1699         client_async_callback fail_callback;
1700         void *fail_callback_data;
1701         struct ctdb_node_map_old *nodemap;
1702 };
1703
1704 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
1705                                        uint32_t node_pnn, int32_t res,
1706                                        TDB_DATA outdata, void *callback_data)
1707 {
1708         struct takeover_callback_data *cd =
1709                 talloc_get_type_abort(callback_data,
1710                                       struct takeover_callback_data);
1711         int i;
1712
1713         for (i = 0; i < cd->nodemap->num; i++) {
1714                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
1715                         break;
1716                 }
1717         }
1718
1719         if (i == cd->nodemap->num) {
1720                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
1721                 return;
1722         }
1723
1724         if (!cd->node_failed[i]) {
1725                 cd->node_failed[i] = true;
1726                 cd->fail_callback(ctdb, node_pnn, res, outdata,
1727                                   cd->fail_callback_data);
1728         }
1729 }
1730
1731 /*
1732   make any IP alias changes for public addresses that are necessary 
1733  */
1734 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
1735                       uint32_t *force_rebalance_nodes,
1736                       client_async_callback fail_callback, void *callback_data)
1737 {
1738         int i, j, ret;
1739         struct ctdb_public_ip ip;
1740         uint32_t *nodes;
1741         struct public_ip_list *all_ips, *tmp_ip;
1742         TDB_DATA data;
1743         struct timeval timeout;
1744         struct client_async_data *async_data;
1745         struct ctdb_client_control_state *state;
1746         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1747         struct ipalloc_state *ipalloc_state;
1748         struct takeover_callback_data *takeover_data;
1749         struct iprealloc_callback_data iprealloc_data;
1750         bool *retry_data;
1751         bool can_host_ips;
1752
1753         /*
1754          * ip failover is completely disabled, just send out the 
1755          * ipreallocated event.
1756          */
1757         if (ctdb->tunable.disable_ip_failover != 0) {
1758                 goto ipreallocated;
1759         }
1760
1761         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
1762         if (ipalloc_state == NULL) {
1763                 talloc_free(tmp_ctx);
1764                 return -1;
1765         }
1766
1767         if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
1768                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
1769                 talloc_free(tmp_ctx);
1770                 return -1;
1771         }
1772
1773         /* Fetch known/available public IPs from each active node */
1774         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
1775         if (ret != 0) {
1776                 talloc_free(tmp_ctx);
1777                 return -1;
1778         }
1779
1780         /* Short-circuit IP allocation if no node has available IPs */
1781         can_host_ips = false;
1782         for (i=0; i < ipalloc_state->num; i++) {
1783                 if (ipalloc_state->available_public_ips[i] != NULL) {
1784                         can_host_ips = true;
1785                 }
1786         }
1787         if (!can_host_ips) {
1788                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
1789                 return 0;
1790         }
1791
1792         /* since nodes only know about those public addresses that
1793            can be served by that particular node, no single node has
1794            a full list of all public addresses that exist in the cluster.
1795            Walk over all node structures and create a merged list of
1796            all public addresses that exist in the cluster.
1797
1798            keep the tree of ips around as ctdb->ip_tree
1799         */
1800         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
1801         ipalloc_state->all_ips = all_ips;
1802
1803         ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
1804
1805         /* Do the IP reassignment calculations */
1806         ipalloc(ipalloc_state);
1807
1808         /* Now tell all nodes to release any public IPs should not
1809          * host.  This will be a NOOP on nodes that don't currently
1810          * hold the given IP.
1811          */
1812         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
1813         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
1814
1815         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
1816                                                        bool, nodemap->num);
1817         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
1818         takeover_data->fail_callback = fail_callback;
1819         takeover_data->fail_callback_data = callback_data;
1820         takeover_data->nodemap = nodemap;
1821
1822         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1823         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1824
1825         async_data->fail_callback = takeover_run_fail_callback;
1826         async_data->callback_data = takeover_data;
1827
1828         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
1829
1830         /* Send a RELEASE_IP to all nodes that should not be hosting
1831          * each IP.  For each IP, all but one of these will be
1832          * redundant.  However, the redundant ones are used to tell
1833          * nodes which node should be hosting the IP so that commands
1834          * like "ctdb ip" can display a particular nodes idea of who
1835          * is hosting what. */
1836         for (i=0;i<nodemap->num;i++) {
1837                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1838                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1839                         continue;
1840                 }
1841
1842                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1843                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1844                                 /* This node should be serving this
1845                                    vnn so don't tell it to release the ip
1846                                 */
1847                                 continue;
1848                         }
1849                         ip.pnn  = tmp_ip->pnn;
1850                         ip.addr = tmp_ip->addr;
1851
1852                         timeout = TAKEOVER_TIMEOUT();
1853                         data.dsize = sizeof(ip);
1854                         data.dptr  = (uint8_t *)&ip;
1855                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1856                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
1857                                                   data, async_data,
1858                                                   &timeout, NULL);
1859                         if (state == NULL) {
1860                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1861                                 talloc_free(tmp_ctx);
1862                                 return -1;
1863                         }
1864
1865                         ctdb_client_async_add(async_data, state);
1866                 }
1867         }
1868         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1869                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1870                 talloc_free(tmp_ctx);
1871                 return -1;
1872         }
1873         talloc_free(async_data);
1874
1875
1876         /* For each IP, send a TAKOVER_IP to the node that should be
1877          * hosting it.  Many of these will often be redundant (since
1878          * the allocation won't have changed) but they can be useful
1879          * to recover from inconsistencies. */
1880         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1881         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1882
1883         async_data->fail_callback = fail_callback;
1884         async_data->callback_data = callback_data;
1885
1886         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1887                 if (tmp_ip->pnn == -1) {
1888                         /* this IP won't be taken over */
1889                         continue;
1890                 }
1891
1892                 ip.pnn  = tmp_ip->pnn;
1893                 ip.addr = tmp_ip->addr;
1894
1895                 timeout = TAKEOVER_TIMEOUT();
1896                 data.dsize = sizeof(ip);
1897                 data.dptr  = (uint8_t *)&ip;
1898                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1899                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
1900                                           data, async_data, &timeout, NULL);
1901                 if (state == NULL) {
1902                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1903                         talloc_free(tmp_ctx);
1904                         return -1;
1905                 }
1906
1907                 ctdb_client_async_add(async_data, state);
1908         }
1909         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1910                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1911                 talloc_free(tmp_ctx);
1912                 return -1;
1913         }
1914
1915 ipreallocated:
1916         /*
1917          * Tell all nodes to run eventscripts to process the
1918          * "ipreallocated" event.  This can do a lot of things,
1919          * including restarting services to reconfigure them if public
1920          * IPs have moved.  Once upon a time this event only used to
1921          * update natgw.
1922          */
1923         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
1924         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
1925         iprealloc_data.retry_nodes = retry_data;
1926         iprealloc_data.retry_count = 0;
1927         iprealloc_data.fail_callback = fail_callback;
1928         iprealloc_data.fail_callback_data = callback_data;
1929         iprealloc_data.nodemap = nodemap;
1930
1931         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1932         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
1933                                         nodes, 0, TAKEOVER_TIMEOUT(),
1934                                         false, tdb_null,
1935                                         NULL, iprealloc_fail_callback,
1936                                         &iprealloc_data);
1937         if (ret != 0) {
1938                 /* If the control failed then we should retry to any
1939                  * nodes flagged by iprealloc_fail_callback using the
1940                  * EVENTSCRIPT control.  This is a best-effort at
1941                  * backward compatiblity when running a mixed cluster
1942                  * where some nodes have not yet been upgraded to
1943                  * support the IPREALLOCATED control.
1944                  */
1945                 DEBUG(DEBUG_WARNING,
1946                       ("Retry ipreallocated to some nodes using eventscript control\n"));
1947
1948                 nodes = talloc_array(tmp_ctx, uint32_t,
1949                                      iprealloc_data.retry_count);
1950                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
1951
1952                 j = 0;
1953                 for (i=0; i<nodemap->num; i++) {
1954                         if (iprealloc_data.retry_nodes[i]) {
1955                                 nodes[j] = i;
1956                                 j++;
1957                         }
1958                 }
1959
1960                 data.dptr  = discard_const("ipreallocated");
1961                 data.dsize = strlen((char *)data.dptr) + 1; 
1962                 ret = ctdb_client_async_control(ctdb,
1963                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
1964                                                 nodes, 0, TAKEOVER_TIMEOUT(),
1965                                                 false, data,
1966                                                 NULL, fail_callback,
1967                                                 callback_data);
1968                 if (ret != 0) {
1969                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
1970                 }
1971         }
1972
1973         talloc_free(tmp_ctx);
1974         return ret;
1975 }
1976
1977
1978 /*
1979   destroy a ctdb_client_ip structure
1980  */
1981 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1982 {
1983         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1984                 ctdb_addr_to_str(&ip->addr),
1985                 ntohs(ip->addr.ip.sin_port),
1986                 ip->client_id));
1987
1988         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1989         return 0;
1990 }
1991
1992 /*
1993   called by a client to inform us of a TCP connection that it is managing
1994   that should tickled with an ACK when IP takeover is done
1995  */
1996 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1997                                 TDB_DATA indata)
1998 {
1999         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2000         struct ctdb_connection *tcp_sock = NULL;
2001         struct ctdb_tcp_list *tcp;
2002         struct ctdb_connection t;
2003         int ret;
2004         TDB_DATA data;
2005         struct ctdb_client_ip *ip;
2006         struct ctdb_vnn *vnn;
2007         ctdb_sock_addr addr;
2008
2009         /* If we don't have public IPs, tickles are useless */
2010         if (ctdb->vnn == NULL) {
2011                 return 0;
2012         }
2013
2014         tcp_sock = (struct ctdb_connection *)indata.dptr;
2015
2016         addr = tcp_sock->src;
2017         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2018         addr = tcp_sock->dst;
2019         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2020
2021         ZERO_STRUCT(addr);
2022         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2023         vnn = find_public_ip_vnn(ctdb, &addr);
2024         if (vnn == NULL) {
2025                 switch (addr.sa.sa_family) {
2026                 case AF_INET:
2027                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2028                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2029                                         ctdb_addr_to_str(&addr)));
2030                         }
2031                         break;
2032                 case AF_INET6:
2033                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2034                                 ctdb_addr_to_str(&addr)));
2035                         break;
2036                 default:
2037                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2038                 }
2039
2040                 return 0;
2041         }
2042
2043         if (vnn->pnn != ctdb->pnn) {
2044                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2045                         ctdb_addr_to_str(&addr),
2046                         client_id, client->pid));
2047                 /* failing this call will tell smbd to die */
2048                 return -1;
2049         }
2050
2051         ip = talloc(client, struct ctdb_client_ip);
2052         CTDB_NO_MEMORY(ctdb, ip);
2053
2054         ip->ctdb      = ctdb;
2055         ip->addr      = addr;
2056         ip->client_id = client_id;
2057         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2058         DLIST_ADD(ctdb->client_ip_list, ip);
2059
2060         tcp = talloc(client, struct ctdb_tcp_list);
2061         CTDB_NO_MEMORY(ctdb, tcp);
2062
2063         tcp->connection.src = tcp_sock->src;
2064         tcp->connection.dst = tcp_sock->dst;
2065
2066         DLIST_ADD(client->tcp_list, tcp);
2067
2068         t.src = tcp_sock->src;
2069         t.dst = tcp_sock->dst;
2070
2071         data.dptr = (uint8_t *)&t;
2072         data.dsize = sizeof(t);
2073
2074         switch (addr.sa.sa_family) {
2075         case AF_INET:
2076                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2077                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2078                         ctdb_addr_to_str(&tcp_sock->src),
2079                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2080                 break;
2081         case AF_INET6:
2082                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2083                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2084                         ctdb_addr_to_str(&tcp_sock->src),
2085                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2086                 break;
2087         default:
2088                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2089         }
2090
2091
2092         /* tell all nodes about this tcp connection */
2093         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2094                                        CTDB_CONTROL_TCP_ADD,
2095                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2096         if (ret != 0) {
2097                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2098                 return -1;
2099         }
2100
2101         return 0;
2102 }
2103
2104 /*
2105   find a tcp address on a list
2106  */
2107 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2108                                            struct ctdb_connection *tcp)
2109 {
2110         int i;
2111
2112         if (array == NULL) {
2113                 return NULL;
2114         }
2115
2116         for (i=0;i<array->num;i++) {
2117                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2118                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2119                         return &array->connections[i];
2120                 }
2121         }
2122         return NULL;
2123 }
2124
2125
2126
2127 /*
2128   called by a daemon to inform us of a TCP connection that one of its
2129   clients managing that should tickled with an ACK when IP takeover is
2130   done
2131  */
2132 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2133 {
2134         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2135         struct ctdb_tcp_array *tcparray;
2136         struct ctdb_connection tcp;
2137         struct ctdb_vnn *vnn;
2138
2139         /* If we don't have public IPs, tickles are useless */
2140         if (ctdb->vnn == NULL) {
2141                 return 0;
2142         }
2143
2144         vnn = find_public_ip_vnn(ctdb, &p->dst);
2145         if (vnn == NULL) {
2146                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2147                         ctdb_addr_to_str(&p->dst)));
2148
2149                 return -1;
2150         }
2151
2152
2153         tcparray = vnn->tcp_array;
2154
2155         /* If this is the first tickle */
2156         if (tcparray == NULL) {
2157                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2158                 CTDB_NO_MEMORY(ctdb, tcparray);
2159                 vnn->tcp_array = tcparray;
2160
2161                 tcparray->num = 0;
2162                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2163                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2164
2165                 tcparray->connections[tcparray->num].src = p->src;
2166                 tcparray->connections[tcparray->num].dst = p->dst;
2167                 tcparray->num++;
2168
2169                 if (tcp_update_needed) {
2170                         vnn->tcp_update_needed = true;
2171                 }
2172                 return 0;
2173         }
2174
2175
2176         /* Do we already have this tickle ?*/
2177         tcp.src = p->src;
2178         tcp.dst = p->dst;
2179         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2180                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2181                         ctdb_addr_to_str(&tcp.dst),
2182                         ntohs(tcp.dst.ip.sin_port),
2183                         vnn->pnn));
2184                 return 0;
2185         }
2186
2187         /* A new tickle, we must add it to the array */
2188         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2189                                         struct ctdb_connection,
2190                                         tcparray->num+1);
2191         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2192
2193         tcparray->connections[tcparray->num].src = p->src;
2194         tcparray->connections[tcparray->num].dst = p->dst;
2195         tcparray->num++;
2196
2197         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2198                 ctdb_addr_to_str(&tcp.dst),
2199                 ntohs(tcp.dst.ip.sin_port),
2200                 vnn->pnn));
2201
2202         if (tcp_update_needed) {
2203                 vnn->tcp_update_needed = true;
2204         }
2205
2206         return 0;
2207 }
2208
2209
2210 static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
2211 {
2212         struct ctdb_connection *tcpp;
2213
2214         if (vnn == NULL) {
2215                 return;
2216         }
2217
2218         /* if the array is empty we cant remove it
2219            and we don't need to do anything
2220          */
2221         if (vnn->tcp_array == NULL) {
2222                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2223                         ctdb_addr_to_str(&conn->dst),
2224                         ntohs(conn->dst.ip.sin_port)));
2225                 return;
2226         }
2227
2228
2229         /* See if we know this connection
2230            if we don't know this connection  then we dont need to do anything
2231          */
2232         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2233         if (tcpp == NULL) {
2234                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2235                         ctdb_addr_to_str(&conn->dst),
2236                         ntohs(conn->dst.ip.sin_port)));
2237                 return;
2238         }
2239
2240
2241         /* We need to remove this entry from the array.
2242            Instead of allocating a new array and copying data to it
2243            we cheat and just copy the last entry in the existing array
2244            to the entry that is to be removed and just shring the 
2245            ->num field
2246          */
2247         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2248         vnn->tcp_array->num--;
2249
2250         /* If we deleted the last entry we also need to remove the entire array
2251          */
2252         if (vnn->tcp_array->num == 0) {
2253                 talloc_free(vnn->tcp_array);
2254                 vnn->tcp_array = NULL;
2255         }               
2256
2257         vnn->tcp_update_needed = true;
2258
2259         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2260                 ctdb_addr_to_str(&conn->src),
2261                 ntohs(conn->src.ip.sin_port)));
2262 }
2263
2264
2265 /*
2266   called by a daemon to inform us of a TCP connection that one of its
2267   clients used are no longer needed in the tickle database
2268  */
2269 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2270 {
2271         struct ctdb_vnn *vnn;
2272         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
2273
2274         /* If we don't have public IPs, tickles are useless */
2275         if (ctdb->vnn == NULL) {
2276                 return 0;
2277         }
2278
2279         vnn = find_public_ip_vnn(ctdb, &conn->dst);
2280         if (vnn == NULL) {
2281                 DEBUG(DEBUG_ERR,
2282                       (__location__ " unable to find public address %s\n",
2283                        ctdb_addr_to_str(&conn->dst)));
2284                 return 0;
2285         }
2286
2287         ctdb_remove_connection(vnn, conn);
2288
2289         return 0;
2290 }
2291
2292
2293 /*
2294   Called when another daemon starts - causes all tickles for all
2295   public addresses we are serving to be sent to the new node on the
2296   next check.  This actually causes the next scheduled call to
2297   tdb_update_tcp_tickles() to update all nodes.  This is simple and
2298   doesn't require careful error handling.
2299  */
2300 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
2301 {
2302         struct ctdb_vnn *vnn;
2303
2304         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
2305                            (unsigned long) pnn));
2306
2307         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
2308                 vnn->tcp_update_needed = true;
2309         }
2310
2311         return 0;
2312 }
2313
2314
2315 /*
2316   called when a client structure goes away - hook to remove
2317   elements from the tcp_list in all daemons
2318  */
2319 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2320 {
2321         while (client->tcp_list) {
2322                 struct ctdb_vnn *vnn;
2323                 struct ctdb_tcp_list *tcp = client->tcp_list;
2324                 struct ctdb_connection *conn = &tcp->connection;
2325
2326                 DLIST_REMOVE(client->tcp_list, tcp);
2327
2328                 vnn = find_public_ip_vnn(client->ctdb,
2329                                          &conn->dst);
2330                 if (vnn == NULL) {
2331                         DEBUG(DEBUG_ERR,
2332                               (__location__ " unable to find public address %s\n",
2333                                ctdb_addr_to_str(&conn->dst)));
2334                         continue;
2335                 }
2336
2337                 /* If the IP address is hosted on this node then
2338                  * remove the connection. */
2339                 if (vnn->pnn == client->ctdb->pnn) {
2340                         ctdb_remove_connection(vnn, conn);
2341                 }
2342
2343                 /* Otherwise this function has been called because the
2344                  * server IP address has been released to another node
2345                  * and the client has exited.  This means that we
2346                  * should not delete the connection information.  The
2347                  * takeover node processes connections too. */
2348         }
2349 }
2350
2351
2352 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2353 {
2354         struct ctdb_vnn *vnn;
2355         int count = 0;
2356
2357         if (ctdb->tunable.disable_ip_failover == 1) {
2358                 return;
2359         }
2360
2361         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2362                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2363                         ctdb_vnn_unassign_iface(ctdb, vnn);
2364                         continue;
2365                 }
2366                 if (!vnn->iface) {
2367                         continue;
2368                 }
2369
2370                 /* Don't allow multiple releases at once.  Some code,
2371                  * particularly ctdb_tickle_sentenced_connections() is
2372                  * not re-entrant */
2373                 if (vnn->update_in_flight) {
2374                         DEBUG(DEBUG_WARNING,
2375                               (__location__
2376                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
2377                                     ctdb_addr_to_str(&vnn->public_address),
2378                                     vnn->public_netmask_bits,
2379                                     ctdb_vnn_iface_string(vnn)));
2380                         continue;
2381                 }
2382                 vnn->update_in_flight = true;
2383
2384                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
2385                                     ctdb_addr_to_str(&vnn->public_address),
2386                                     vnn->public_netmask_bits,
2387                                     ctdb_vnn_iface_string(vnn)));
2388
2389                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2390                                   ctdb_vnn_iface_string(vnn),
2391                                   ctdb_addr_to_str(&vnn->public_address),
2392                                   vnn->public_netmask_bits);
2393                 release_kill_clients(ctdb, &vnn->public_address);
2394                 ctdb_vnn_unassign_iface(ctdb, vnn);
2395                 vnn->update_in_flight = false;
2396                 count++;
2397         }
2398
2399         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
2400 }
2401
2402
2403 /*
2404   get list of public IPs
2405  */
2406 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2407                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
2408 {
2409         int i, num, len;
2410         struct ctdb_public_ip_list_old *ips;
2411         struct ctdb_vnn *vnn;
2412         bool only_available = false;
2413
2414         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2415                 only_available = true;
2416         }
2417
2418         /* count how many public ip structures we have */
2419         num = 0;
2420         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2421                 num++;
2422         }
2423
2424         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2425                 num*sizeof(struct ctdb_public_ip);
2426         ips = talloc_zero_size(outdata, len);
2427         CTDB_NO_MEMORY(ctdb, ips);
2428
2429         i = 0;
2430         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2431                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2432                         continue;
2433                 }
2434                 ips->ips[i].pnn  = vnn->pnn;
2435                 ips->ips[i].addr = vnn->public_address;
2436                 i++;
2437         }
2438         ips->num = i;
2439         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2440                 i*sizeof(struct ctdb_public_ip);
2441
2442         outdata->dsize = len;
2443         outdata->dptr  = (uint8_t *)ips;
2444
2445         return 0;
2446 }
2447
2448
2449 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2450                                         struct ctdb_req_control_old *c,
2451                                         TDB_DATA indata,
2452                                         TDB_DATA *outdata)
2453 {
2454         int i, num, len;
2455         ctdb_sock_addr *addr;
2456         struct ctdb_public_ip_info_old *info;
2457         struct ctdb_vnn *vnn;
2458
2459         addr = (ctdb_sock_addr *)indata.dptr;
2460
2461         vnn = find_public_ip_vnn(ctdb, addr);
2462         if (vnn == NULL) {
2463                 /* if it is not a public ip   it could be our 'single ip' */
2464                 if (ctdb->single_ip_vnn) {
2465                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2466                                 vnn = ctdb->single_ip_vnn;
2467                         }
2468                 }
2469         }
2470         if (vnn == NULL) {
2471                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2472                                  "'%s'not a public address\n",
2473                                  ctdb_addr_to_str(addr)));
2474                 return -1;
2475         }
2476
2477         /* count how many public ip structures we have */
2478         num = 0;
2479         for (;vnn->ifaces[num];) {
2480                 num++;
2481         }
2482
2483         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2484                 num*sizeof(struct ctdb_iface);
2485         info = talloc_zero_size(outdata, len);
2486         CTDB_NO_MEMORY(ctdb, info);
2487
2488         info->ip.addr = vnn->public_address;
2489         info->ip.pnn = vnn->pnn;
2490         info->active_idx = 0xFFFFFFFF;
2491
2492         for (i=0; vnn->ifaces[i]; i++) {
2493                 struct ctdb_interface *cur;
2494
2495                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2496                 if (cur == NULL) {
2497                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2498                                            vnn->ifaces[i]));
2499                         return -1;
2500                 }
2501                 if (vnn->iface == cur) {
2502                         info->active_idx = i;
2503                 }
2504                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
2505                 info->ifaces[i].link_state = cur->link_up;
2506                 info->ifaces[i].references = cur->references;
2507         }
2508         info->num = i;
2509         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2510                 i*sizeof(struct ctdb_iface);
2511
2512         outdata->dsize = len;
2513         outdata->dptr  = (uint8_t *)info;
2514
2515         return 0;
2516 }
2517
2518 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2519                                 struct ctdb_req_control_old *c,
2520                                 TDB_DATA *outdata)
2521 {
2522         int i, num, len;
2523         struct ctdb_iface_list_old *ifaces;
2524         struct ctdb_interface *cur;
2525
2526         /* count how many public ip structures we have */
2527         num = 0;
2528         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2529                 num++;
2530         }
2531
2532         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2533                 num*sizeof(struct ctdb_iface);
2534         ifaces = talloc_zero_size(outdata, len);
2535         CTDB_NO_MEMORY(ctdb, ifaces);
2536
2537         i = 0;
2538         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2539                 strcpy(ifaces->ifaces[i].name, cur->name);
2540                 ifaces->ifaces[i].link_state = cur->link_up;
2541                 ifaces->ifaces[i].references = cur->references;
2542                 i++;
2543         }
2544         ifaces->num = i;
2545         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2546                 i*sizeof(struct ctdb_iface);
2547
2548         outdata->dsize = len;
2549         outdata->dptr  = (uint8_t *)ifaces;
2550
2551         return 0;
2552 }
2553
2554 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2555                                     struct ctdb_req_control_old *c,
2556                                     TDB_DATA indata)
2557 {
2558         struct ctdb_iface *info;
2559         struct ctdb_interface *iface;
2560         bool link_up = false;
2561
2562         info = (struct ctdb_iface *)indata.dptr;
2563
2564         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2565                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2566                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2567                                   len, len, info->name));
2568                 return -1;
2569         }
2570
2571         switch (info->link_state) {
2572         case 0:
2573                 link_up = false;
2574                 break;
2575         case 1:
2576                 link_up = true;
2577                 break;
2578         default:
2579                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2580                                   (unsigned int)info->link_state));
2581                 return -1;
2582         }
2583
2584         if (info->references != 0) {
2585                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2586                                   (unsigned int)info->references));
2587                 return -1;
2588         }
2589
2590         iface = ctdb_find_iface(ctdb, info->name);
2591         if (iface == NULL) {
2592                 return -1;
2593         }
2594
2595         if (link_up == iface->link_up) {
2596                 return 0;
2597         }
2598
2599         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2600               ("iface[%s] has changed it's link status %s => %s\n",
2601                iface->name,
2602                iface->link_up?"up":"down",
2603                link_up?"up":"down"));
2604
2605         iface->link_up = link_up;
2606         return 0;
2607 }
2608
2609
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill

   One instance hangs off a vnn (vnn->killtcp).  It is created by
   ctdb_killtcp_add_connection() for the first connection to kill and
   freed by ctdb_tickle_sentenced_connections() once the connection
   tree is empty.
*/
struct ctdb_kill_tcp {
        /* the vnn this structure was created for (also its talloc parent) */
        struct ctdb_vnn *vnn;
        /* daemon context; its event context is used for the fd and timer events */
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(), -1 until opened */
        int capture_fd;
        /* read event on capture_fd, handled by capture_tcp_handler() */
        struct tevent_fd *fde;
        /* struct ctdb_killtcp_con entries, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque pointer filled in by ctdb_sys_open_capture_socket() and
         * handed back to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
2622
/*
  a tcp connection that is to be killed

  Stored in ctdb_kill_tcp.connections.
 */
struct ctdb_killtcp_con {
        /* canonicalized source address given to ctdb_killtcp_add_connection() */
        ctdb_sock_addr src_addr;
        /* canonicalized destination address given to ctdb_killtcp_add_connection() */
        ctdb_sock_addr dst_addr;
        /* number of periodic tickles sent so far; tickle_connection_traverse()
         * gives up once this reaches 5 */
        int count;
        /* the kill structure this connection belongs to */
        struct ctdb_kill_tcp *killtcp;
};
2632
2633 /* this function is used to create a key to represent this socketpair
2634    in the killtcp tree.
2635    this key is used to insert and lookup matching socketpairs that are
2636    to be tickled and RST
2637 */
2638 #define KILLTCP_KEYLEN  10
2639 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2640 {
2641         static uint32_t key[KILLTCP_KEYLEN];
2642
2643         bzero(key, sizeof(key));
2644
2645         if (src->sa.sa_family != dst->sa.sa_family) {
2646                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2647                 return key;
2648         }
2649         
2650         switch (src->sa.sa_family) {
2651         case AF_INET:
2652                 key[0]  = dst->ip.sin_addr.s_addr;
2653                 key[1]  = src->ip.sin_addr.s_addr;
2654                 key[2]  = dst->ip.sin_port;
2655                 key[3]  = src->ip.sin_port;
2656                 break;
2657         case AF_INET6: {
2658                 uint32_t *dst6_addr32 =
2659                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2660                 uint32_t *src6_addr32 =
2661                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2662                 key[0]  = dst6_addr32[3];
2663                 key[1]  = src6_addr32[3];
2664                 key[2]  = dst6_addr32[2];
2665                 key[3]  = src6_addr32[2];
2666                 key[4]  = dst6_addr32[1];
2667                 key[5]  = src6_addr32[1];
2668                 key[6]  = dst6_addr32[0];
2669                 key[7]  = src6_addr32[0];
2670                 key[8]  = dst->ip6.sin6_port;
2671                 key[9]  = src->ip6.sin6_port;
2672                 break;
2673         }
2674         default:
2675                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2676                 return key;
2677         }
2678
2679         return key;
2680 }
2681
2682 /*
2683   called when we get a read event on the raw socket
2684  */
2685 static void capture_tcp_handler(struct tevent_context *ev,
2686                                 struct tevent_fd *fde,
2687                                 uint16_t flags, void *private_data)
2688 {
2689         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2690         struct ctdb_killtcp_con *con;
2691         ctdb_sock_addr src, dst;
2692         uint32_t ack_seq, seq;
2693
2694         if (!(flags & TEVENT_FD_READ)) {
2695                 return;
2696         }
2697
2698         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2699                                 killtcp->private_data,
2700                                 &src, &dst,
2701                                 &ack_seq, &seq) != 0) {
2702                 /* probably a non-tcp ACK packet */
2703                 return;
2704         }
2705
2706         /* check if we have this guy in our list of connections
2707            to kill
2708         */
2709         con = trbt_lookuparray32(killtcp->connections, 
2710                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2711         if (con == NULL) {
2712                 /* no this was some other packet we can just ignore */
2713                 return;
2714         }
2715
2716         /* This one has been tickled !
2717            now reset him and remove him from the list.
2718          */
2719         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2720                 ntohs(con->dst_addr.ip.sin_port),
2721                 ctdb_addr_to_str(&con->src_addr),
2722                 ntohs(con->src_addr.ip.sin_port)));
2723
2724         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2725         talloc_free(con);
2726 }
2727
2728
2729 /* when traversing the list of all tcp connections to send tickle acks to
2730    (so that we can capture the ack coming back and kill the connection
2731     by a RST)
2732    this callback is called for each connection we are currently trying to kill
2733 */
2734 static int tickle_connection_traverse(void *param, void *data)
2735 {
2736         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2737
2738         /* have tried too many times, just give up */
2739         if (con->count >= 5) {
2740                 /* can't delete in traverse: reparent to delete_cons */
2741                 talloc_steal(param, con);
2742                 return 0;
2743         }
2744
2745         /* othervise, try tickling it again */
2746         con->count++;
2747         ctdb_sys_send_tcp(
2748                 (ctdb_sock_addr *)&con->dst_addr,
2749                 (ctdb_sock_addr *)&con->src_addr,
2750                 0, 0, 0);
2751         return 0;
2752 }
2753
2754
2755 /* 
2756    called every second until all sentenced connections have been reset
2757  */
2758 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
2759                                               struct tevent_timer *te,
2760                                               struct timeval t, void *private_data)
2761 {
2762         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2763         void *delete_cons = talloc_new(NULL);
2764
2765         /* loop over all connections sending tickle ACKs */
2766         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2767
2768         /* now we've finished traverse, it's safe to do deletion. */
2769         talloc_free(delete_cons);
2770
2771         /* If there are no more connections to kill we can remove the
2772            entire killtcp structure
2773          */
2774         if ( (killtcp->connections == NULL) || 
2775              (killtcp->connections->root == NULL) ) {
2776                 talloc_free(killtcp);
2777                 return;
2778         }
2779
2780         /* try tickling them again in a seconds time
2781          */
2782         tevent_add_timer(killtcp->ctdb->ev, killtcp,
2783                          timeval_current_ofs(1, 0),
2784                          ctdb_tickle_sentenced_connections, killtcp);
2785 }
2786
2787 /*
2788   destroy the killtcp structure
2789  */
2790 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2791 {
2792         struct ctdb_vnn *tmpvnn;
2793
2794         /* verify that this vnn is still active */
2795         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
2796                 if (tmpvnn == killtcp->vnn) {
2797                         break;
2798                 }
2799         }
2800
2801         if (tmpvnn == NULL) {
2802                 return 0;
2803         }
2804
2805         if (killtcp->vnn->killtcp != killtcp) {
2806                 return 0;
2807         }
2808
2809         killtcp->vnn->killtcp = NULL;
2810
2811         return 0;
2812 }
2813
2814
/* Insert callback for the killtcp tree: the new entry ('parm')
   unconditionally replaces whatever was stored under the key ('data').

   The old entry is deliberately not freed here: it is talloc_stolen by
   the same node in the tree anyway and will be deleted when the new
   data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;	/* the previous entry is intentionally ignored */

	return parm;
}
2826
2827 /*
2828   add a tcp socket to the list of connections we want to RST
2829  */
2830 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2831                                        ctdb_sock_addr *s,
2832                                        ctdb_sock_addr *d)
2833 {
2834         ctdb_sock_addr src, dst;
2835         struct ctdb_kill_tcp *killtcp;
2836         struct ctdb_killtcp_con *con;
2837         struct ctdb_vnn *vnn;
2838
2839         ctdb_canonicalize_ip(s, &src);
2840         ctdb_canonicalize_ip(d, &dst);
2841
2842         vnn = find_public_ip_vnn(ctdb, &dst);
2843         if (vnn == NULL) {
2844                 vnn = find_public_ip_vnn(ctdb, &src);
2845         }
2846         if (vnn == NULL) {
2847                 /* if it is not a public ip   it could be our 'single ip' */
2848                 if (ctdb->single_ip_vnn) {
2849                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2850                                 vnn = ctdb->single_ip_vnn;
2851                         }
2852                 }
2853         }
2854         if (vnn == NULL) {
2855                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2856                 return -1;
2857         }
2858
2859         killtcp = vnn->killtcp;
2860         
2861         /* If this is the first connection to kill we must allocate
2862            a new structure
2863          */
2864         if (killtcp == NULL) {
2865                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
2866                 CTDB_NO_MEMORY(ctdb, killtcp);
2867
2868                 killtcp->vnn         = vnn;
2869                 killtcp->ctdb        = ctdb;
2870                 killtcp->capture_fd  = -1;
2871                 killtcp->connections = trbt_create(killtcp, 0);
2872
2873                 vnn->killtcp         = killtcp;
2874                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2875         }
2876
2877
2878
2879         /* create a structure that describes this connection we want to
2880            RST and store it in killtcp->connections
2881         */
2882         con = talloc(killtcp, struct ctdb_killtcp_con);
2883         CTDB_NO_MEMORY(ctdb, con);
2884         con->src_addr = src;
2885         con->dst_addr = dst;
2886         con->count    = 0;
2887         con->killtcp  = killtcp;
2888
2889
2890         trbt_insertarray32_callback(killtcp->connections,
2891                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2892                         add_killtcp_callback, con);
2893
2894         /* 
2895            If we don't have a socket to listen on yet we must create it
2896          */
2897         if (killtcp->capture_fd == -1) {
2898                 const char *iface = ctdb_vnn_iface_string(vnn);
2899                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2900                 if (killtcp->capture_fd == -1) {
2901                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2902                                           "socket on iface '%s' for killtcp (%s)\n",
2903                                           iface, strerror(errno)));
2904                         goto failed;
2905                 }
2906         }
2907
2908
2909         if (killtcp->fde == NULL) {
2910                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
2911                                              killtcp->capture_fd,
2912                                              TEVENT_FD_READ,
2913                                              capture_tcp_handler, killtcp);
2914                 tevent_fd_set_auto_close(killtcp->fde);
2915
2916                 /* We also need to set up some events to tickle all these connections
2917                    until they are all reset
2918                 */
2919                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
2920                                  ctdb_tickle_sentenced_connections, killtcp);
2921         }
2922
2923         /* tickle him once now */
2924         ctdb_sys_send_tcp(
2925                 &con->dst_addr,
2926                 &con->src_addr,
2927                 0, 0, 0);
2928
2929         return 0;
2930
2931 failed:
2932         talloc_free(vnn->killtcp);
2933         vnn->killtcp = NULL;
2934         return -1;
2935 }
2936
2937 /*
2938   kill a TCP connection.
2939  */
2940 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2941 {
2942         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
2943
2944         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
2945 }
2946
2947 /*
2948   called by a daemon to inform us of the entire list of TCP tickles for
2949   a particular public address.
2950   this control should only be sent by the node that is currently serving
2951   that public address.
2952  */
2953 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2954 {
2955         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
2956         struct ctdb_tcp_array *tcparray;
2957         struct ctdb_vnn *vnn;
2958
2959         /* We must at least have tickles.num or else we cant verify the size
2960            of the received data blob
2961          */
2962         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
2963                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
2964                 return -1;
2965         }
2966
2967         /* verify that the size of data matches what we expect */
2968         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
2969                          + sizeof(struct ctdb_connection) * list->num) {
2970                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
2971                 return -1;
2972         }
2973
2974         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
2975                            ctdb_addr_to_str(&list->addr)));
2976
2977         vnn = find_public_ip_vnn(ctdb, &list->addr);
2978         if (vnn == NULL) {
2979                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
2980                         ctdb_addr_to_str(&list->addr)));
2981
2982                 return 1;
2983         }
2984
2985         if (vnn->pnn == ctdb->pnn) {
2986                 DEBUG(DEBUG_INFO,
2987                       ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
2988                        ctdb_addr_to_str(&list->addr)));
2989                 return 0;
2990         }
2991
2992         /* remove any old ticklelist we might have */
2993         talloc_free(vnn->tcp_array);
2994         vnn->tcp_array = NULL;
2995
2996         tcparray = talloc(vnn, struct ctdb_tcp_array);
2997         CTDB_NO_MEMORY(ctdb, tcparray);
2998
2999         tcparray->num = list->num;
3000
3001         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3002         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3003
3004         memcpy(tcparray->connections, &list->connections[0],
3005                sizeof(struct ctdb_connection)*tcparray->num);
3006
3007         /* We now have a new fresh tickle list array for this vnn */
3008         vnn->tcp_array = tcparray;
3009
3010         return 0;
3011 }
3012
3013 /*
3014   called to return the full list of tickles for the puclic address associated 
3015   with the provided vnn
3016  */
3017 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3018 {
3019         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3020         struct ctdb_tickle_list_old *list;
3021         struct ctdb_tcp_array *tcparray;
3022         int num;
3023         struct ctdb_vnn *vnn;
3024
3025         vnn = find_public_ip_vnn(ctdb, addr);
3026         if (vnn == NULL) {
3027                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3028                         ctdb_addr_to_str(addr)));
3029
3030                 return 1;
3031         }
3032
3033         tcparray = vnn->tcp_array;
3034         if (tcparray) {
3035                 num = tcparray->num;
3036         } else {
3037                 num = 0;
3038         }
3039
3040         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3041                         + sizeof(struct ctdb_connection) * num;
3042
3043         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3044         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3045         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3046
3047         list->addr = *addr;
3048         list->num = num;
3049         if (num) {
3050                 memcpy(&list->connections[0], tcparray->connections,
3051                         sizeof(struct ctdb_connection) * num);
3052         }
3053
3054         return 0;
3055 }
3056
3057
3058 /*
3059   set the list of all tcp tickles for a public address
3060  */
3061 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3062                                             ctdb_sock_addr *addr,
3063                                             struct ctdb_tcp_array *tcparray)
3064 {
3065         int ret, num;
3066         TDB_DATA data;
3067         struct ctdb_tickle_list_old *list;
3068
3069         if (tcparray) {
3070                 num = tcparray->num;
3071         } else {
3072                 num = 0;
3073         }
3074
3075         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3076                         sizeof(struct ctdb_connection) * num;
3077         data.dptr = talloc_size(ctdb, data.dsize);
3078         CTDB_NO_MEMORY(ctdb, data.dptr);
3079
3080         list = (struct ctdb_tickle_list_old *)data.dptr;
3081         list->addr = *addr;
3082         list->num = num;
3083         if (tcparray) {
3084                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3085         }
3086
3087         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3088                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3089                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3090         if (ret != 0) {
3091                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3092                 return -1;
3093         }
3094
3095         talloc_free(data.dptr);
3096
3097         return ret;
3098 }
3099
3100
3101 /*
3102   perform tickle updates if required
3103  */
3104 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3105                                     struct tevent_timer *te,
3106                                     struct timeval t, void *private_data)
3107 {
3108         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3109         int ret;
3110         struct ctdb_vnn *vnn;
3111
3112         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3113                 /* we only send out updates for public addresses that 
3114                    we have taken over
3115                  */
3116                 if (ctdb->pnn != vnn->pnn) {
3117                         continue;
3118                 }
3119                 /* We only send out the updates if we need to */
3120                 if (!vnn->tcp_update_needed) {
3121                         continue;
3122                 }
3123                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3124                                                        &vnn->public_address,
3125                                                        vnn->tcp_array);
3126                 if (ret != 0) {
3127                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3128                                 ctdb_addr_to_str(&vnn->public_address)));
3129                 } else {
3130                         DEBUG(DEBUG_INFO,
3131                               ("Sent tickle update for public address %s\n",
3132                                ctdb_addr_to_str(&vnn->public_address)));
3133                         vnn->tcp_update_needed = false;
3134                 }
3135         }
3136
3137         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3138                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3139                          ctdb_update_tcp_tickles, ctdb);
3140 }
3141
3142 /*
3143   start periodic update of tcp tickles
3144  */
3145 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3146 {
3147         ctdb->tickle_update_context = talloc_new(ctdb);
3148
3149         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3150                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3151                          ctdb_update_tcp_tickles, ctdb);
3152 }
3153
3154
3155
3156
/*
  State for one series of gratuitous ARPs.  Allocated by
  ctdb_control_send_gratious_arp() and freed by send_gratious_arp()
  after the last repetition.
 */
struct control_gratious_arp {
        /* daemon context; its event context drives the repeat timer */
        struct ctdb_context *ctdb;
        /* address handed to ctdb_sys_send_arp() */
        ctdb_sock_addr addr;
        /* name of the interface to send on (talloc child of this struct) */
        const char *iface;
        /* number of send attempts so far; stops at CTDB_ARP_REPEAT */
        int count;
};
3163
3164 /*
3165   send a control_gratuitous arp
3166  */
3167 static void send_gratious_arp(struct tevent_context *ev,
3168                               struct tevent_timer *te,
3169                               struct timeval t, void *private_data)
3170 {
3171         int ret;
3172         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3173                                                         struct control_gratious_arp);
3174
3175         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3176         if (ret != 0) {
3177                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3178                                  arp->iface, strerror(errno)));
3179         }
3180
3181
3182         arp->count++;
3183         if (arp->count == CTDB_ARP_REPEAT) {
3184                 talloc_free(arp);
3185                 return;
3186         }
3187
3188         tevent_add_timer(arp->ctdb->ev, arp,
3189                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3190                          send_gratious_arp, arp);
3191 }
3192
3193
3194 /*
3195   send a gratious arp 
3196  */
3197 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3198 {
3199         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3200         struct control_gratious_arp *arp;
3201
3202         /* verify the size of indata */
3203         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3204                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3205                                  (unsigned)indata.dsize, 
3206                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3207                 return -1;
3208         }
3209         if (indata.dsize != 
3210                 ( offsetof(struct ctdb_addr_info_old, iface)
3211                 + gratious_arp->len ) ){
3212
3213                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3214                         "but should be %u bytes\n", 
3215                          (unsigned)indata.dsize, 
3216                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
3217                 return -1;
3218         }
3219
3220
3221         arp = talloc(ctdb, struct control_gratious_arp);
3222         CTDB_NO_MEMORY(ctdb, arp);
3223
3224         arp->ctdb  = ctdb;
3225         arp->addr   = gratious_arp->addr;
3226         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3227         CTDB_NO_MEMORY(ctdb, arp->iface);
3228         arp->count = 0;
3229
3230         tevent_add_timer(arp->ctdb->ev, arp,
3231                          timeval_zero(), send_gratious_arp, arp);
3232
3233         return 0;
3234 }
3235
3236 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3237 {
3238         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3239         int ret;
3240
3241         /* verify the size of indata */
3242         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3243                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3244                 return -1;
3245         }
3246         if (indata.dsize != 
3247                 ( offsetof(struct ctdb_addr_info_old, iface)
3248                 + pub->len ) ){
3249
3250                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3251                         "but should be %u bytes\n", 
3252                          (unsigned)indata.dsize, 
3253                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3254                 return -1;
3255         }
3256
3257         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
3258
3259         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3260
3261         if (ret != 0) {
3262                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3263                 return -1;
3264         }
3265
3266         return 0;
3267 }
3268
/* Callback state for the RELEASE_IP control sent by
 * ctdb_control_del_public_address. */
struct delete_ip_callback_state {
        /* the request that delete_ip_callback sends the reply for */
        struct ctdb_req_control_old *c;
};
3272
3273 /*
3274   called when releaseip event finishes for del_public_address
3275  */
3276 static void delete_ip_callback(struct ctdb_context *ctdb,
3277                                int32_t status, TDB_DATA data,
3278                                const char *errormsg,
3279                                void *private_data)
3280 {
3281         struct delete_ip_callback_state *state =
3282                 talloc_get_type(private_data, struct delete_ip_callback_state);
3283
3284         /* If release failed then fail. */
3285         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
3286         talloc_free(private_data);
3287 }
3288
3289 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
3290                                         struct ctdb_req_control_old *c,
3291                                         TDB_DATA indata, bool *async_reply)
3292 {
3293         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3294         struct ctdb_vnn *vnn;
3295
3296         /* verify the size of indata */
3297         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3298                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3299                 return -1;
3300         }
3301         if (indata.dsize != 
3302                 ( offsetof(struct ctdb_addr_info_old, iface)
3303                 + pub->len ) ){
3304
3305                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3306                         "but should be %u bytes\n", 
3307                          (unsigned)indata.dsize, 
3308                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3309                 return -1;
3310         }
3311
3312         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
3313
3314         /* walk over all public addresses until we find a match */
3315         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3316                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3317                         if (vnn->pnn == ctdb->pnn) {
3318                                 struct delete_ip_callback_state *state;
3319                                 struct ctdb_public_ip *ip;
3320                                 TDB_DATA data;
3321                                 int ret;
3322
3323                                 vnn->delete_pending = true;
3324
3325                                 state = talloc(ctdb,
3326                                                struct delete_ip_callback_state);
3327                                 CTDB_NO_MEMORY(ctdb, state);
3328                                 state->c = c;
3329
3330                                 ip = talloc(state, struct ctdb_public_ip);
3331                                 if (ip == NULL) {
3332                                         DEBUG(DEBUG_ERR,
3333                                               (__location__ " Out of memory\n"));
3334                                         talloc_free(state);
3335                                         return -1;
3336                                 }
3337                                 ip->pnn = -1;
3338                                 ip->addr = pub->addr;
3339
3340                                 data.dsize = sizeof(struct ctdb_public_ip);
3341                                 data.dptr = (unsigned char *)ip;
3342
3343                                 ret = ctdb_daemon_send_control(ctdb,
3344                                                                ctdb_get_pnn(ctdb),
3345                                                                0,
3346                                                                CTDB_CONTROL_RELEASE_IP,
3347                                                                0, 0,
3348                                                                data,
3349                                                                delete_ip_callback,
3350                                                                state);
3351                                 if (ret == -1) {
3352                                         DEBUG(DEBUG_ERR,
3353                                               (__location__ "Unable to send "
3354                                                "CTDB_CONTROL_RELEASE_IP\n"));
3355                                         talloc_free(state);
3356                                         return -1;
3357                                 }
3358
3359                                 state->c = talloc_steal(state, c);
3360                                 *async_reply = true;
3361                         } else {
3362                                 /* This IP is not hosted on the
3363                                  * current node so just delete it
3364                                  * now. */
3365                                 do_delete_ip(ctdb, vnn);
3366                         }
3367
3368                         return 0;
3369                 }
3370         }
3371
3372         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
3373                          ctdb_addr_to_str(&pub->addr)));
3374         return -1;
3375 }
3376
3377
/* Callback state for the "ipreallocated" event started by
 * ctdb_control_ipreallocated. */
struct ipreallocated_callback_state {
        /* the request that ctdb_ipreallocated_callback sends the reply for */
        struct ctdb_req_control_old *c;
};
3381
3382 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3383                                         int status, void *p)
3384 {
3385         struct ipreallocated_callback_state *state =
3386                 talloc_get_type(p, struct ipreallocated_callback_state);
3387
3388         if (status != 0) {
3389                 DEBUG(DEBUG_ERR,
3390                       (" \"ipreallocated\" event script failed (status %d)\n",
3391                        status));
3392                 if (status == -ETIME) {
3393                         ctdb_ban_self(ctdb);
3394                 }
3395         }
3396
3397         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3398         talloc_free(state);
3399 }
3400
3401 /* A control to run the ipreallocated event */
3402 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3403                                    struct ctdb_req_control_old *c,
3404                                    bool *async_reply)
3405 {
3406         int ret;
3407         struct ipreallocated_callback_state *state;
3408
3409         state = talloc(ctdb, struct ipreallocated_callback_state);
3410         CTDB_NO_MEMORY(ctdb, state);
3411
3412         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3413
3414         ret = ctdb_event_script_callback(ctdb, state,
3415                                          ctdb_ipreallocated_callback, state,
3416                                          CTDB_EVENT_IPREALLOCATED,
3417                                          "%s", "");
3418
3419         if (ret != 0) {
3420                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3421                 talloc_free(state);
3422                 return -1;
3423         }
3424
3425         /* tell the control that we will be reply asynchronously */
3426         state->c    = talloc_steal(state, c);
3427         *async_reply = true;
3428
3429         return 0;
3430 }
3431
3432
3433 /* This function is called from the recovery daemon to verify that a remote
3434    node has the expected ip allocation.
3435    This is verified against ctdb->ip_tree
3436 */
3437 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
3438                                        struct ctdb_public_ip_list_old *ips,
3439                                        uint32_t pnn)
3440 {
3441         struct public_ip_list *tmp_ip;
3442         int i;
3443
3444         if (ctdb->ip_tree == NULL) {
3445                 /* don't know the expected allocation yet, assume remote node
3446                    is correct. */
3447                 return 0;
3448         }
3449
3450         if (ips == NULL) {
3451                 return 0;
3452         }
3453
3454         for (i=0; i<ips->num; i++) {
3455                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3456                 if (tmp_ip == NULL) {
3457                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
3458                         return -1;
3459                 }
3460
3461                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3462                         continue;
3463                 }
3464
3465                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3466                         DEBUG(DEBUG_ERR,
3467                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
3468                                pnn,
3469                                ctdb_addr_to_str(&ips->ips[i].addr),
3470                                ips->ips[i].pnn, tmp_ip->pnn));
3471                         return -1;
3472                 }
3473         }
3474
3475         return 0;
3476 }
3477
3478 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3479 {
3480         struct public_ip_list *tmp_ip;
3481
3482         /* IP tree is never built if DisableIPFailover is set */
3483         if (ctdb->tunable.disable_ip_failover != 0) {
3484                 return 0;
3485         }
3486
3487         if (ctdb->ip_tree == NULL) {
3488                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3489                 return -1;
3490         }
3491
3492         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3493         if (tmp_ip == NULL) {
3494                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3495                 return -1;
3496         }
3497
3498         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3499         tmp_ip->pnn = ip->pnn;
3500
3501         return 0;
3502 }
3503
3504 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
3505 {
3506         TALLOC_FREE(ctdb->ip_tree);
3507 }
3508
/* Parent-side state for one reloadips child process, created by
 * ctdb_control_reload_public_ips. */
struct ctdb_reloadips_handle {
        struct ctdb_context *ctdb;
        /* request to reply to when the handle is destroyed; NULL if none */
        struct ctdb_req_control_old *c;
        /* status sent in that reply: starts at -1, set from the child's
         * result byte by ctdb_reloadips_child_handler */
        int status;
        /* pipe from child to parent: parent reads fd[0], child writes fd[1] */
        int fd[2];
        /* pid of the forked child; sent SIGKILL by the destructor */
        pid_t child;
        /* read event registered on fd[0] */
        struct tevent_fd *fde;
};
3517
3518 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3519 {
3520         if (h == h->ctdb->reload_ips) {
3521                 h->ctdb->reload_ips = NULL;
3522         }
3523         if (h->c != NULL) {
3524                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3525                 h->c = NULL;
3526         }
3527         ctdb_kill(h->ctdb, h->child, SIGKILL);
3528         return 0;
3529 }
3530
3531 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
3532                                          struct tevent_timer *te,
3533                                          struct timeval t, void *private_data)
3534 {
3535         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3536
3537         talloc_free(h);
3538 }
3539
3540 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
3541                                          struct tevent_fd *fde,
3542                                          uint16_t flags, void *private_data)
3543 {
3544         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3545
3546         char res;
3547         int ret;
3548
3549         ret = sys_read(h->fd[0], &res, 1);
3550         if (ret < 1 || res != 0) {
3551                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3552                 res = 1;
3553         }
3554         h->status = res;
3555
3556         talloc_free(h);
3557 }
3558
3559 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3560 {
3561         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3562         struct ctdb_public_ip_list_old *ips;
3563         struct ctdb_vnn *vnn;
3564         struct client_async_data *async_data;
3565         struct timeval timeout;
3566         TDB_DATA data;
3567         struct ctdb_client_control_state *state;
3568         bool first_add;
3569         int i, ret;
3570
3571         CTDB_NO_MEMORY(ctdb, mem_ctx);
3572
3573         /* Read IPs from local node */
3574         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
3575                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
3576         if (ret != 0) {
3577                 DEBUG(DEBUG_ERR,
3578                       ("Unable to fetch public IPs from local node\n"));
3579                 talloc_free(mem_ctx);
3580                 return -1;
3581         }
3582
3583         /* Read IPs file - this is safe since this is a child process */
3584         ctdb->vnn = NULL;
3585         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3586                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3587                 talloc_free(mem_ctx);
3588                 return -1;
3589         }
3590
3591         async_data = talloc_zero(mem_ctx, struct client_async_data);
3592         CTDB_NO_MEMORY(ctdb, async_data);
3593
3594         /* Compare IPs between node and file for IPs to be deleted */
3595         for (i = 0; i < ips->num; i++) {
3596                 /* */
3597                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3598                         if (ctdb_same_ip(&vnn->public_address,
3599                                          &ips->ips[i].addr)) {
3600                                 /* IP is still in file */
3601                                 break;
3602                         }
3603                 }
3604
3605                 if (vnn == NULL) {
3606                         /* Delete IP ips->ips[i] */
3607                         struct ctdb_addr_info_old *pub;
3608
3609                         DEBUG(DEBUG_NOTICE,
3610                               ("IP %s no longer configured, deleting it\n",
3611                                ctdb_addr_to_str(&ips->ips[i].addr)));
3612
3613                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
3614                         CTDB_NO_MEMORY(ctdb, pub);
3615
3616                         pub->addr  = ips->ips[i].addr;
3617                         pub->mask  = 0;
3618                         pub->len   = 0;
3619
3620                         timeout = TAKEOVER_TIMEOUT();
3621
3622                         data.dsize = offsetof(struct ctdb_addr_info_old,
3623                                               iface) + pub->len;
3624                         data.dptr = (uint8_t *)pub;
3625
3626                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3627                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
3628                                                   0, data, async_data,
3629                                                   &timeout, NULL);
3630                         if (state == NULL) {
3631                                 DEBUG(DEBUG_ERR,
3632                                       (__location__
3633                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
3634                                 goto failed;
3635                         }
3636
3637                         ctdb_client_async_add(async_data, state);
3638                 }
3639         }
3640
3641         /* Compare IPs between node and file for IPs to be added */
3642         first_add = true;
3643         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3644                 for (i = 0; i < ips->num; i++) {
3645                         if (ctdb_same_ip(&vnn->public_address,
3646                                          &ips->ips[i].addr)) {
3647                                 /* IP already on node */
3648                                 break;
3649                         }
3650                 }
3651                 if (i == ips->num) {
3652                         /* Add IP ips->ips[i] */
3653                         struct ctdb_addr_info_old *pub;
3654                         const char *ifaces = NULL;
3655                         uint32_t len;
3656                         int iface = 0;
3657
3658                         DEBUG(DEBUG_NOTICE,
3659                               ("New IP %s configured, adding it\n",
3660                                ctdb_addr_to_str(&vnn->public_address)));
3661                         if (first_add) {
3662                                 uint32_t pnn = ctdb_get_pnn(ctdb);
3663
3664                                 data.dsize = sizeof(pnn);
3665                                 data.dptr  = (uint8_t *)&pnn;
3666
3667                                 ret = ctdb_client_send_message(
3668                                         ctdb,
3669                                         CTDB_BROADCAST_CONNECTED,
3670                                         CTDB_SRVID_REBALANCE_NODE,
3671                                         data);
3672                                 if (ret != 0) {
3673                                         DEBUG(DEBUG_WARNING,
3674                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
3675                                 }
3676
3677                                 first_add = false;
3678                         }
3679
3680                         ifaces = vnn->ifaces[0];
3681                         iface = 1;
3682                         while (vnn->ifaces[iface] != NULL) {
3683                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
3684                                                          vnn->ifaces[iface]);
3685                                 iface++;
3686                         }
3687
3688                         len   = strlen(ifaces) + 1;
3689                         pub = talloc_zero_size(mem_ctx,
3690                                                offsetof(struct ctdb_addr_info_old, iface) + len);
3691                         CTDB_NO_MEMORY(ctdb, pub);
3692
3693                         pub->addr  = vnn->public_address;
3694                         pub->mask  = vnn->public_netmask_bits;
3695                         pub->len   = len;
3696                         memcpy(&pub->iface[0], ifaces, pub->len);
3697
3698                         timeout = TAKEOVER_TIMEOUT();
3699
3700                         data.dsize = offsetof(struct ctdb_addr_info_old,
3701                                               iface) + pub->len;
3702                         data.dptr = (uint8_t *)pub;
3703
3704                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3705                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
3706                                                   0, data, async_data,
3707                                                   &timeout, NULL);
3708                         if (state == NULL) {
3709                                 DEBUG(DEBUG_ERR,
3710                                       (__location__
3711                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
3712                                 goto failed;
3713                         }
3714
3715                         ctdb_client_async_add(async_data, state);
3716                 }
3717         }
3718
3719         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
3720                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
3721                 goto failed;
3722         }
3723
3724         talloc_free(mem_ctx);
3725         return 0;
3726
3727 failed:
3728         talloc_free(mem_ctx);
3729         return -1;
3730 }
3731
3732 /* This control is sent to force the node to re-read the public addresses file
3733    and drop any addresses we should nnot longer host, and add new addresses
3734    that we are now able to host
3735 */
3736 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
3737 {
3738         struct ctdb_reloadips_handle *h;
3739         pid_t parent = getpid();
3740
3741         if (ctdb->reload_ips != NULL) {
3742                 talloc_free(ctdb->reload_ips);
3743                 ctdb->reload_ips = NULL;
3744         }
3745
3746         h = talloc(ctdb, struct ctdb_reloadips_handle);
3747         CTDB_NO_MEMORY(ctdb, h);
3748         h->ctdb     = ctdb;
3749         h->c        = NULL;
3750         h->status   = -1;
3751         
3752         if (pipe(h->fd) == -1) {
3753                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3754                 talloc_free(h);
3755                 return -1;
3756         }
3757
3758         h->child = ctdb_fork(ctdb);
3759         if (h->child == (pid_t)-1) {
3760                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
3761                 close(h->fd[0]);
3762                 close(h->fd[1]);
3763                 talloc_free(h);
3764                 return -1;
3765         }
3766
3767         /* child process */
3768         if (h->child == 0) {
3769                 signed char res = 0;
3770
3771                 close(h->fd[0]);
3772                 debug_extra = talloc_asprintf(NULL, "reloadips:");
3773
3774                 prctl_set_comment("ctdb_reloadips");
3775                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
3776                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
3777                         res = -1;
3778                 } else {
3779                         res = ctdb_reloadips_child(ctdb);
3780                         if (res != 0) {
3781                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
3782                         }
3783                 }
3784
3785                 sys_write(h->fd[1], &res, 1);
3786                 /* make sure we die when our parent dies */
3787                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3788                         sleep(5);
3789                 }
3790                 _exit(0);
3791         }
3792
3793         h->c             = talloc_steal(h, c);
3794
3795         close(h->fd[1]);
3796         set_close_on_exec(h->fd[0]);
3797
3798         talloc_set_destructor(h, ctdb_reloadips_destructor);
3799
3800
3801         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
3802                                ctdb_reloadips_child_handler, (void *)h);
3803         tevent_fd_set_auto_close(h->fde);
3804
3805         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
3806                          ctdb_reloadips_timeout_event, h);
3807
3808         /* we reply later */
3809         *async_reply = true;
3810         return 0;
3811 }