ctdb: Use ctdb_wait_for_process_to_exit()
[samba.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44 #include "server/ipalloc.h"
45
/* Timeout value for takeover operations, built by timeval_current_ofs()
 * from the takeover_timeout tunable (presumably seconds from now - confirm
 * against timeval_current_ofs()).  The expansion refers to a variable named
 * ctdb, which must be in scope wherever the macro is used. */
46 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
47
/* First argument given to timeval_current_ofs() when ctdb_control_send_arp()
 * re-arms its timer. */
48 #define CTDB_ARP_INTERVAL 1
/* Number of announcement rounds ctdb_control_send_arp() performs before it
 * frees its state and stops. */
49 #define CTDB_ARP_REPEAT   3
50
/* One local network interface that public addresses can be assigned to.
 * Entries live on the ctdb->ifaces list. */
51 struct ctdb_interface {
52         struct ctdb_interface *prev, *next; /* links for the ctdb->ifaces list */
53         const char *name; /* interface name, a talloc_strdup()ed child of this entry */
54         bool link_up; /* set to true on creation; only interfaces with link are chosen */
55         uint32_t references; /* number of VNNs currently assigned to this interface */
56 };
57
58 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
59 {
60         if (vnn->iface) {
61                 return vnn->iface->name;
62         }
63
64         return "__none__";
65 }
66
67 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
68 {
69         struct ctdb_interface *i;
70
71         /* Verify that we don't have an entry for this ip yet */
72         for (i=ctdb->ifaces;i;i=i->next) {
73                 if (strcmp(i->name, iface) == 0) {
74                         return 0;
75                 }
76         }
77
78         /* create a new structure for this interface */
79         i = talloc_zero(ctdb, struct ctdb_interface);
80         CTDB_NO_MEMORY_FATAL(ctdb, i);
81         i->name = talloc_strdup(i, iface);
82         CTDB_NO_MEMORY(ctdb, i->name);
83
84         i->link_up = true;
85
86         DLIST_ADD(ctdb->ifaces, i);
87
88         return 0;
89 }
90
91 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
92                                         const char *name)
93 {
94         int n;
95
96         for (n = 0; vnn->ifaces[n] != NULL; n++) {
97                 if (strcmp(name, vnn->ifaces[n]) == 0) {
98                         return true;
99                 }
100         }
101
102         return false;
103 }
104
105 /* If any interfaces now have no possible IPs then delete them.  This
106  * implementation is naive (i.e. simple) rather than clever
107  * (i.e. complex).  Given that this is run on delip and that operation
108  * is rare, this doesn't need to be efficient - it needs to be
109  * foolproof.  One alternative is reference counting, where the logic
110  * is distributed and can, therefore, be broken in multiple places.
111  * Another alternative is to build a red-black tree of interfaces that
112  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
113  * once) and then walking ctdb->ifaces once and deleting those not in
114  * the tree.  Let's go to one of those if the naive implementation
115  * causes problems...  :-)
116  */
117 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
118                                         struct ctdb_vnn *vnn)
119 {
120         struct ctdb_interface *i, *next;
121
122         /* For each interface, check if there's an IP using it. */
123         for (i = ctdb->ifaces; i != NULL; i = next) {
124                 struct ctdb_vnn *tv;
125                 bool found;
126                 next = i->next;
127
128                 /* Only consider interfaces named in the given VNN. */
129                 if (!vnn_has_interface_with_name(vnn, i->name)) {
130                         continue;
131                 }
132
133                 /* Is the "single IP" on this interface? */
134                 if ((ctdb->single_ip_vnn != NULL) &&
135                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
136                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
137                         /* Found, next interface please... */
138                         continue;
139                 }
140                 /* Search for a vnn with this interface. */
141                 found = false;
142                 for (tv=ctdb->vnn; tv; tv=tv->next) {
143                         if (vnn_has_interface_with_name(tv, i->name)) {
144                                 found = true;
145                                 break;
146                         }
147                 }
148
149                 if (!found) {
150                         /* None of the VNNs are using this interface. */
151                         DLIST_REMOVE(ctdb->ifaces, i);
152                         talloc_free(i);
153                 }
154         }
155 }
156
157
158 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
159                                               const char *iface)
160 {
161         struct ctdb_interface *i;
162
163         for (i=ctdb->ifaces;i;i=i->next) {
164                 if (strcmp(i->name, iface) == 0) {
165                         return i;
166                 }
167         }
168
169         return NULL;
170 }
171
172 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
173                                                   struct ctdb_vnn *vnn)
174 {
175         int i;
176         struct ctdb_interface *cur = NULL;
177         struct ctdb_interface *best = NULL;
178
179         for (i=0; vnn->ifaces[i]; i++) {
180
181                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
182                 if (cur == NULL) {
183                         continue;
184                 }
185
186                 if (!cur->link_up) {
187                         continue;
188                 }
189
190                 if (best == NULL) {
191                         best = cur;
192                         continue;
193                 }
194
195                 if (cur->references < best->references) {
196                         best = cur;
197                         continue;
198                 }
199         }
200
201         return best;
202 }
203
204 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
205                                      struct ctdb_vnn *vnn)
206 {
207         struct ctdb_interface *best = NULL;
208
209         if (vnn->iface) {
210                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
211                                    "still assigned to iface '%s'\n",
212                                    ctdb_addr_to_str(&vnn->public_address),
213                                    ctdb_vnn_iface_string(vnn)));
214                 return 0;
215         }
216
217         best = ctdb_vnn_best_iface(ctdb, vnn);
218         if (best == NULL) {
219                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
220                                   "cannot assign to iface any iface\n",
221                                   ctdb_addr_to_str(&vnn->public_address)));
222                 return -1;
223         }
224
225         vnn->iface = best;
226         best->references++;
227         vnn->pnn = ctdb->pnn;
228
229         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
230                            "now assigned to iface '%s' refs[%d]\n",
231                            ctdb_addr_to_str(&vnn->public_address),
232                            ctdb_vnn_iface_string(vnn),
233                            best->references));
234         return 0;
235 }
236
237 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
238                                     struct ctdb_vnn *vnn)
239 {
240         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
241                            "now unassigned (old iface '%s' refs[%d])\n",
242                            ctdb_addr_to_str(&vnn->public_address),
243                            ctdb_vnn_iface_string(vnn),
244                            vnn->iface?vnn->iface->references:0));
245         if (vnn->iface) {
246                 vnn->iface->references--;
247         }
248         vnn->iface = NULL;
249         if (vnn->pnn == ctdb->pnn) {
250                 vnn->pnn = -1;
251         }
252 }
253
254 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
255                                struct ctdb_vnn *vnn)
256 {
257         int i;
258
259         /* Nodes that are not RUNNING can not host IPs */
260         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
261                 return false;
262         }
263
264         if (vnn->delete_pending) {
265                 return false;
266         }
267
268         if (vnn->iface && vnn->iface->link_up) {
269                 return true;
270         }
271
272         for (i=0; vnn->ifaces[i]; i++) {
273                 struct ctdb_interface *cur;
274
275                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
276                 if (cur == NULL) {
277                         continue;
278                 }
279
280                 if (cur->link_up) {
281                         return true;
282                 }
283         }
284
285         return false;
286 }
287
/* State for one announcement sequence of a public address; allocated by
 * ctdb_announce_vnn_iface() and passed as private data to the
 * ctdb_control_send_arp() timer handler. */
288 struct ctdb_takeover_arp {
289         struct ctdb_context *ctdb; /* daemon context; supplies the event context for re-arming */
290         uint32_t count; /* announcement rounds performed so far */
291         ctdb_sock_addr addr; /* public address being announced */
292         struct ctdb_tcp_array *tcparray; /* connections to send tickle acks for; may be NULL */
293         struct ctdb_vnn *vnn; /* VNN the address belongs to */
294 };
295
296
297 /*
298   lists of tcp endpoints
299  */
300 struct ctdb_tcp_list {
301         struct ctdb_tcp_list *prev, *next; /* presumably DLIST links; no list operations on this type are visible here */
302         struct ctdb_connection connection; /* the endpoint pair (src and dst addresses) */
303 };
304
305 /*
306   list of clients to kill on IP release
307  */
308 struct ctdb_client_ip {
309         struct ctdb_client_ip *prev, *next; /* links for the ctdb->client_ip_list list */
310         struct ctdb_context *ctdb; /* daemon context (not read by the code visible here) */
311         ctdb_sock_addr addr; /* address the client is registered with */
312         uint32_t client_id; /* id handed to reqid_find() to locate the struct ctdb_client */
313 };
314
315
316 /*
317   send a gratuitous arp
318  */
319 static void ctdb_control_send_arp(struct tevent_context *ev,
320                                   struct tevent_timer *te,
321                                   struct timeval t, void *private_data)
322 {
323         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
324                                                         struct ctdb_takeover_arp);
325         int i, ret;
326         struct ctdb_tcp_array *tcparray;
327         const char *iface = ctdb_vnn_iface_string(arp->vnn);
328
329         ret = ctdb_sys_send_arp(&arp->addr, iface);
330         if (ret != 0) {
331                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
332                                   iface, strerror(errno)));
333         }
334
335         tcparray = arp->tcparray;
336         if (tcparray) {
337                 for (i=0;i<tcparray->num;i++) {
338                         struct ctdb_connection *tcon;
339
340                         tcon = &tcparray->connections[i];
341                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
342                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
343                                 ctdb_addr_to_str(&tcon->src),
344                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
345                         ret = ctdb_sys_send_tcp(
346                                 &tcon->src,
347                                 &tcon->dst,
348                                 0, 0, 0);
349                         if (ret != 0) {
350                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
351                                         ctdb_addr_to_str(&tcon->src)));
352                         }
353                 }
354         }
355
356         arp->count++;
357
358         if (arp->count == CTDB_ARP_REPEAT) {
359                 talloc_free(arp);
360                 return;
361         }
362
363         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
364                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
365                          ctdb_control_send_arp, arp);
366 }
367
368 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
369                                        struct ctdb_vnn *vnn)
370 {
371         struct ctdb_takeover_arp *arp;
372         struct ctdb_tcp_array *tcparray;
373
374         if (!vnn->takeover_ctx) {
375                 vnn->takeover_ctx = talloc_new(vnn);
376                 if (!vnn->takeover_ctx) {
377                         return -1;
378                 }
379         }
380
381         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
382         if (!arp) {
383                 return -1;
384         }
385
386         arp->ctdb = ctdb;
387         arp->addr = vnn->public_address;
388         arp->vnn  = vnn;
389
390         tcparray = vnn->tcp_array;
391         if (tcparray) {
392                 /* add all of the known tcp connections for this IP to the
393                    list of tcp connections to send tickle acks for */
394                 arp->tcparray = talloc_steal(arp, tcparray);
395
396                 vnn->tcp_array = NULL;
397                 vnn->tcp_update_needed = true;
398         }
399
400         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
401                          timeval_zero(), ctdb_control_send_arp, arp);
402
403         return 0;
404 }
405
/* Private data handed to release_ip_callback(). */
406 struct takeover_callback_state {
407         struct ctdb_req_control_old *c; /* control request the callback replies to */
408         ctdb_sock_addr *addr; /* address the operation concerns */
409         struct ctdb_vnn *vnn; /* not referenced by the code visible here */
410 };
411
/* Private data for an in-flight takeip event; allocated in ctdb_do_takeip()
 * and consumed by ctdb_do_takeip_callback(). */
412 struct ctdb_do_takeip_state {
413         struct ctdb_req_control_old *c; /* control request the callback replies to */
414         struct ctdb_vnn *vnn; /* VNN being taken over */
415 };
416
417 /*
418   called when takeip event finishes
419  */
420 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
421                                     void *private_data)
422 {
423         struct ctdb_do_takeip_state *state =
424                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
425         int32_t ret;
426         TDB_DATA data;
427
428         if (status != 0) {
429                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
430         
431                 if (status == -ETIME) {
432                         ctdb_ban_self(ctdb);
433                 }
434                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
435                                  ctdb_addr_to_str(&state->vnn->public_address),
436                                  ctdb_vnn_iface_string(state->vnn)));
437                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
438
439                 node->flags |= NODE_FLAGS_UNHEALTHY;
440                 talloc_free(state);
441                 return;
442         }
443
444         if (ctdb->do_checkpublicip) {
445
446         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
447         if (ret != 0) {
448                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
449                 talloc_free(state);
450                 return;
451         }
452
453         }
454
455         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
456         data.dsize = strlen((char *)data.dptr) + 1;
457         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
458
459         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
460
461
462         /* the control succeeded */
463         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
464         talloc_free(state);
465         return;
466 }
467
468 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
469 {
470         state->vnn->update_in_flight = false;
471         return 0;
472 }
473
474 /*
475   take over an ip address
476  */
477 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
478                               struct ctdb_req_control_old *c,
479                               struct ctdb_vnn *vnn)
480 {
481         int ret;
482         struct ctdb_do_takeip_state *state;
483
484         if (vnn->update_in_flight) {
485                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
486                                     "update for this IP already in flight\n",
487                                     ctdb_addr_to_str(&vnn->public_address),
488                                     vnn->public_netmask_bits));
489                 return -1;
490         }
491
492         ret = ctdb_vnn_assign_iface(ctdb, vnn);
493         if (ret != 0) {
494                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
495                                  "assign a usable interface\n",
496                                  ctdb_addr_to_str(&vnn->public_address),
497                                  vnn->public_netmask_bits));
498                 return -1;
499         }
500
501         state = talloc(vnn, struct ctdb_do_takeip_state);
502         CTDB_NO_MEMORY(ctdb, state);
503
504         state->c = talloc_steal(ctdb, c);
505         state->vnn   = vnn;
506
507         vnn->update_in_flight = true;
508         talloc_set_destructor(state, ctdb_takeip_destructor);
509
510         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
511                             ctdb_addr_to_str(&vnn->public_address),
512                             vnn->public_netmask_bits,
513                             ctdb_vnn_iface_string(vnn)));
514
515         ret = ctdb_event_script_callback(ctdb,
516                                          state,
517                                          ctdb_do_takeip_callback,
518                                          state,
519                                          CTDB_EVENT_TAKE_IP,
520                                          "%s %s %u",
521                                          ctdb_vnn_iface_string(vnn),
522                                          ctdb_addr_to_str(&vnn->public_address),
523                                          vnn->public_netmask_bits);
524
525         if (ret != 0) {
526                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
527                         ctdb_addr_to_str(&vnn->public_address),
528                         ctdb_vnn_iface_string(vnn)));
529                 talloc_free(state);
530                 return -1;
531         }
532
533         return 0;
534 }
535
/* Private data for an in-flight updateip event; allocated in
 * ctdb_do_updateip() and consumed by ctdb_do_updateip_callback(). */
536 struct ctdb_do_updateip_state {
537         struct ctdb_req_control_old *c; /* control request the callback replies to */
538         struct ctdb_interface *old; /* interface the address is moving away from; restored on failure */
539         struct ctdb_vnn *vnn; /* VNN being moved */
540 };
541
542 /*
543   called when updateip event finishes
544  */
545 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
546                                       void *private_data)
547 {
548         struct ctdb_do_updateip_state *state =
549                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
550         int32_t ret;
551
552         if (status != 0) {
553                 if (status == -ETIME) {
554                         ctdb_ban_self(ctdb);
555                 }
556                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
557                         ctdb_addr_to_str(&state->vnn->public_address),
558                         state->old->name,
559                         ctdb_vnn_iface_string(state->vnn)));
560
561                 /*
562                  * All we can do is reset the old interface
563                  * and let the next run fix it
564                  */
565                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
566                 state->vnn->iface = state->old;
567                 state->vnn->iface->references++;
568
569                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
570                 talloc_free(state);
571                 return;
572         }
573
574         if (ctdb->do_checkpublicip) {
575
576         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
577         if (ret != 0) {
578                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
579                 talloc_free(state);
580                 return;
581         }
582
583         }
584
585         /* the control succeeded */
586         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
587         talloc_free(state);
588         return;
589 }
590
591 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
592 {
593         state->vnn->update_in_flight = false;
594         return 0;
595 }
596
597 /*
598   update (move) an ip address
599  */
600 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
601                                 struct ctdb_req_control_old *c,
602                                 struct ctdb_vnn *vnn)
603 {
604         int ret;
605         struct ctdb_do_updateip_state *state;
606         struct ctdb_interface *old = vnn->iface;
607         const char *new_name;
608
609         if (vnn->update_in_flight) {
610                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
611                                     "update for this IP already in flight\n",
612                                     ctdb_addr_to_str(&vnn->public_address),
613                                     vnn->public_netmask_bits));
614                 return -1;
615         }
616
617         ctdb_vnn_unassign_iface(ctdb, vnn);
618         ret = ctdb_vnn_assign_iface(ctdb, vnn);
619         if (ret != 0) {
620                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
621                                  "assin a usable interface (old iface '%s')\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  vnn->public_netmask_bits,
624                                  old->name));
625                 return -1;
626         }
627
628         new_name = ctdb_vnn_iface_string(vnn);
629         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
630                 /* A benign update from one interface onto itself.
631                  * no need to run the eventscripts in this case, just return
632                  * success.
633                  */
634                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
635                 return 0;
636         }
637
638         state = talloc(vnn, struct ctdb_do_updateip_state);
639         CTDB_NO_MEMORY(ctdb, state);
640
641         state->c = talloc_steal(ctdb, c);
642         state->old = old;
643         state->vnn = vnn;
644
645         vnn->update_in_flight = true;
646         talloc_set_destructor(state, ctdb_updateip_destructor);
647
648         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
649                             "interface %s to %s\n",
650                             ctdb_addr_to_str(&vnn->public_address),
651                             vnn->public_netmask_bits,
652                             old->name,
653                             new_name));
654
655         ret = ctdb_event_script_callback(ctdb,
656                                          state,
657                                          ctdb_do_updateip_callback,
658                                          state,
659                                          CTDB_EVENT_UPDATE_IP,
660                                          "%s %s %s %u",
661                                          state->old->name,
662                                          new_name,
663                                          ctdb_addr_to_str(&vnn->public_address),
664                                          vnn->public_netmask_bits);
665         if (ret != 0) {
666                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
667                                  ctdb_addr_to_str(&vnn->public_address),
668                                  old->name, new_name));
669                 talloc_free(state);
670                 return -1;
671         }
672
673         return 0;
674 }
675
676 /*
677   Find the vnn of the node that has a public ip address
678   returns -1 if the address is not known as a public address
679  */
680 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
681 {
682         struct ctdb_vnn *vnn;
683
684         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
685                 if (ctdb_same_ip(&vnn->public_address, addr)) {
686                         return vnn;
687                 }
688         }
689
690         return NULL;
691 }
692
693 /*
694   take over an ip address
695  */
696 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
697                                  struct ctdb_req_control_old *c,
698                                  TDB_DATA indata,
699                                  bool *async_reply)
700 {
701         int ret;
702         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
703         struct ctdb_vnn *vnn;
704         bool have_ip = false;
705         bool do_updateip = false;
706         bool do_takeip = false;
707         struct ctdb_interface *best_iface = NULL;
708
709         if (pip->pnn != ctdb->pnn) {
710                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
711                                  "with pnn %d, but we're node %d\n",
712                                  ctdb_addr_to_str(&pip->addr),
713                                  pip->pnn, ctdb->pnn));
714                 return -1;
715         }
716
717         /* update out vnn list */
718         vnn = find_public_ip_vnn(ctdb, &pip->addr);
719         if (vnn == NULL) {
720                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
721                         ctdb_addr_to_str(&pip->addr)));
722                 return 0;
723         }
724
725         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
726                 have_ip = ctdb_sys_have_ip(&pip->addr);
727         }
728         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
729         if (best_iface == NULL) {
730                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
731                                  "a usable interface (old %s, have_ip %d)\n",
732                                  ctdb_addr_to_str(&vnn->public_address),
733                                  vnn->public_netmask_bits,
734                                  ctdb_vnn_iface_string(vnn),
735                                  have_ip));
736                 return -1;
737         }
738
739         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
740                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
741                 have_ip = false;
742         }
743
744
745         if (vnn->iface == NULL && have_ip) {
746                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
747                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
748                                  ctdb_addr_to_str(&vnn->public_address)));
749                 return 0;
750         }
751
752         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
753                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
754                                   "and we have it on iface[%s], but it was assigned to node %d"
755                                   "and we are node %d, banning ourself\n",
756                                  ctdb_addr_to_str(&vnn->public_address),
757                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
758                 ctdb_ban_self(ctdb);
759                 return -1;
760         }
761
762         if (vnn->pnn == -1 && have_ip) {
763                 vnn->pnn = ctdb->pnn;
764                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
765                                   "and we already have it on iface[%s], update local daemon\n",
766                                  ctdb_addr_to_str(&vnn->public_address),
767                                   ctdb_vnn_iface_string(vnn)));
768                 return 0;
769         }
770
771         if (vnn->iface) {
772                 if (vnn->iface != best_iface) {
773                         if (!vnn->iface->link_up) {
774                                 do_updateip = true;
775                         } else if (vnn->iface->references > (best_iface->references + 1)) {
776                                 /* only move when the rebalance gains something */
777                                         do_updateip = true;
778                         }
779                 }
780         }
781
782         if (!have_ip) {
783                 if (do_updateip) {
784                         ctdb_vnn_unassign_iface(ctdb, vnn);
785                         do_updateip = false;
786                 }
787                 do_takeip = true;
788         }
789
790         if (do_takeip) {
791                 ret = ctdb_do_takeip(ctdb, c, vnn);
792                 if (ret != 0) {
793                         return -1;
794                 }
795         } else if (do_updateip) {
796                 ret = ctdb_do_updateip(ctdb, c, vnn);
797                 if (ret != 0) {
798                         return -1;
799                 }
800         } else {
801                 /*
802                  * The interface is up and the kernel known the ip
803                  * => do nothing
804                  */
805                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
806                         ctdb_addr_to_str(&pip->addr),
807                         vnn->public_netmask_bits,
808                         ctdb_vnn_iface_string(vnn)));
809                 return 0;
810         }
811
812         /* tell ctdb_control.c that we will be replying asynchronously */
813         *async_reply = true;
814
815         return 0;
816 }
817
818 /*
819   kill any clients that are registered with a IP that is being released
820  */
821 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
822 {
823         struct ctdb_client_ip *ip;
824
825         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
826                 ctdb_addr_to_str(addr)));
827
828         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
829                 ctdb_sock_addr tmp_addr;
830
831                 tmp_addr = ip->addr;
832                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
833                         ip->client_id,
834                         ctdb_addr_to_str(&ip->addr)));
835
836                 if (ctdb_same_ip(&tmp_addr, addr)) {
837                         struct ctdb_client *client = reqid_find(ctdb->idr,
838                                                                 ip->client_id,
839                                                                 struct ctdb_client);
840                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
841                                 ip->client_id,
842                                 ctdb_addr_to_str(&ip->addr),
843                                 client->pid));
844
845                         if (client->pid != 0) {
846                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
847                                         (unsigned)client->pid,
848                                         ctdb_addr_to_str(addr),
849                                         ip->client_id));
850                                 kill(client->pid, SIGKILL);
851                         }
852                 }
853         }
854 }
855
856 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
857 {
858         DLIST_REMOVE(ctdb->vnn, vnn);
859         ctdb_vnn_unassign_iface(ctdb, vnn);
860         ctdb_remove_orphaned_ifaces(ctdb, vnn);
861         talloc_free(vnn);
862 }
863
864 /*
865   called when releaseip event finishes
866  */
867 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
868                                 void *private_data)
869 {
870         struct takeover_callback_state *state = 
871                 talloc_get_type(private_data, struct takeover_callback_state);
872         TDB_DATA data;
873
874         if (status == -ETIME) {
875                 ctdb_ban_self(ctdb);
876         }
877
878         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
879                 if  (ctdb_sys_have_ip(state->addr)) {
880                         DEBUG(DEBUG_ERR,
881                               ("IP %s still hosted during release IP callback, failing\n",
882                                ctdb_addr_to_str(state->addr)));
883                         ctdb_request_control_reply(ctdb, state->c,
884                                                    NULL, -1, NULL);
885                         talloc_free(state);
886                         return;
887                 }
888         }
889
890         /* send a message to all clients of this node telling them
891            that the cluster has been reconfigured and they should
892            release any sockets on this IP */
893         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
894         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
895         data.dsize = strlen((char *)data.dptr)+1;
896
897         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
898
899         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
900
901         /* kill clients that have registered with this IP */
902         release_kill_clients(ctdb, state->addr);
903
904         ctdb_vnn_unassign_iface(ctdb, state->vnn);
905
906         /* Process the IP if it has been marked for deletion */
907         if (state->vnn->delete_pending) {
908                 do_delete_ip(ctdb, state->vnn);
909                 state->vnn = NULL;
910         }
911
912         /* the control succeeded */
913         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
914         talloc_free(state);
915 }
916
917 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
918 {
919         if (state->vnn != NULL) {
920                 state->vnn->update_in_flight = false;
921         }
922         return 0;
923 }
924
925 /*
926   release an ip address
927  */
928 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
929                                 struct ctdb_req_control_old *c,
930                                 TDB_DATA indata, 
931                                 bool *async_reply)
932 {
933         int ret;
934         struct takeover_callback_state *state;
935         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
936         struct ctdb_vnn *vnn;
937         char *iface;
938
939         /* update our vnn list */
940         vnn = find_public_ip_vnn(ctdb, &pip->addr);
941         if (vnn == NULL) {
942                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
943                         ctdb_addr_to_str(&pip->addr)));
944                 return 0;
945         }
946         vnn->pnn = pip->pnn;
947
948         /* stop any previous arps */
949         talloc_free(vnn->takeover_ctx);
950         vnn->takeover_ctx = NULL;
951
952         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
953          * lazy multicast to drop an IP from any node that isn't the
954          * intended new node.  The following causes makes ctdbd ignore
955          * a release for any address it doesn't host.
956          */
957         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
958                 if (!ctdb_sys_have_ip(&pip->addr)) {
959                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
960                                 ctdb_addr_to_str(&pip->addr),
961                                 vnn->public_netmask_bits,
962                                 ctdb_vnn_iface_string(vnn)));
963                         ctdb_vnn_unassign_iface(ctdb, vnn);
964                         return 0;
965                 }
966         } else {
967                 if (vnn->iface == NULL) {
968                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
969                                            ctdb_addr_to_str(&pip->addr),
970                                            vnn->public_netmask_bits));
971                         return 0;
972                 }
973         }
974
975         /* There is a potential race between take_ip and us because we
976          * update the VNN via a callback that run when the
977          * eventscripts have been run.  Avoid the race by allowing one
978          * update to be in flight at a time.
979          */
980         if (vnn->update_in_flight) {
981                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
982                                     "update for this IP already in flight\n",
983                                     ctdb_addr_to_str(&vnn->public_address),
984                                     vnn->public_netmask_bits));
985                 return -1;
986         }
987
988         iface = strdup(ctdb_vnn_iface_string(vnn));
989
990         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
991                 ctdb_addr_to_str(&pip->addr),
992                 vnn->public_netmask_bits,
993                 iface,
994                 pip->pnn));
995
996         state = talloc(ctdb, struct takeover_callback_state);
997         if (state == NULL) {
998                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
999                                __FILE__, __LINE__);
1000                 free(iface);
1001                 return -1;
1002         }
1003
1004         state->c = talloc_steal(state, c);
1005         state->addr = talloc(state, ctdb_sock_addr);       
1006         if (state->addr == NULL) {
1007                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1008                                __FILE__, __LINE__);
1009                 free(iface);
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013         *state->addr = pip->addr;
1014         state->vnn   = vnn;
1015
1016         vnn->update_in_flight = true;
1017         talloc_set_destructor(state, ctdb_releaseip_destructor);
1018
1019         ret = ctdb_event_script_callback(ctdb, 
1020                                          state, release_ip_callback, state,
1021                                          CTDB_EVENT_RELEASE_IP,
1022                                          "%s %s %u",
1023                                          iface,
1024                                          ctdb_addr_to_str(&pip->addr),
1025                                          vnn->public_netmask_bits);
1026         free(iface);
1027         if (ret != 0) {
1028                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1029                         ctdb_addr_to_str(&pip->addr),
1030                         ctdb_vnn_iface_string(vnn)));
1031                 talloc_free(state);
1032                 return -1;
1033         }
1034
1035         /* tell the control that we will be reply asynchronously */
1036         *async_reply = true;
1037         return 0;
1038 }
1039
1040 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1041                                    ctdb_sock_addr *addr,
1042                                    unsigned mask, const char *ifaces,
1043                                    bool check_address)
1044 {
1045         struct ctdb_vnn      *vnn;
1046         uint32_t num = 0;
1047         char *tmp;
1048         const char *iface;
1049         int i;
1050         int ret;
1051
1052         tmp = strdup(ifaces);
1053         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1054                 if (!ctdb_sys_check_iface_exists(iface)) {
1055                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1056                         free(tmp);
1057                         return -1;
1058                 }
1059         }
1060         free(tmp);
1061
1062         /* Verify that we don't have an entry for this ip yet */
1063         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1064                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1065                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1066                                 ctdb_addr_to_str(addr)));
1067                         return -1;
1068                 }               
1069         }
1070
1071         /* create a new vnn structure for this ip address */
1072         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1073         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1074         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1075         tmp = talloc_strdup(vnn, ifaces);
1076         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1077         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1078                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1080                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1081                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1082                 num++;
1083         }
1084         talloc_free(tmp);
1085         vnn->ifaces[num] = NULL;
1086         vnn->public_address      = *addr;
1087         vnn->public_netmask_bits = mask;
1088         vnn->pnn                 = -1;
1089         if (check_address) {
1090                 if (ctdb_sys_have_ip(addr)) {
1091                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1092                         vnn->pnn = ctdb->pnn;
1093                 }
1094         }
1095
1096         for (i=0; vnn->ifaces[i]; i++) {
1097                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1098                 if (ret != 0) {
1099                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1100                                            "for public_address[%s]\n",
1101                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1102                         talloc_free(vnn);
1103                         return -1;
1104                 }
1105         }
1106
1107         DLIST_ADD(ctdb->vnn, vnn);
1108
1109         return 0;
1110 }
1111
1112 /*
1113   setup the public address lists from a file
1114 */
1115 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1116 {
1117         char **lines;
1118         int nlines;
1119         int i;
1120
1121         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1122         if (lines == NULL) {
1123                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1124                 return -1;
1125         }
1126         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1127                 nlines--;
1128         }
1129
1130         for (i=0;i<nlines;i++) {
1131                 unsigned mask;
1132                 ctdb_sock_addr addr;
1133                 const char *addrstr;
1134                 const char *ifaces;
1135                 char *tok, *line;
1136
1137                 line = lines[i];
1138                 while ((*line == ' ') || (*line == '\t')) {
1139                         line++;
1140                 }
1141                 if (*line == '#') {
1142                         continue;
1143                 }
1144                 if (strcmp(line, "") == 0) {
1145                         continue;
1146                 }
1147                 tok = strtok(line, " \t");
1148                 addrstr = tok;
1149                 tok = strtok(NULL, " \t");
1150                 if (tok == NULL) {
1151                         if (NULL == ctdb->default_public_interface) {
1152                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1153                                          i+1));
1154                                 talloc_free(lines);
1155                                 return -1;
1156                         }
1157                         ifaces = ctdb->default_public_interface;
1158                 } else {
1159                         ifaces = tok;
1160                 }
1161
1162                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1163                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1164                         talloc_free(lines);
1165                         return -1;
1166                 }
1167                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1168                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1169                         talloc_free(lines);
1170                         return -1;
1171                 }
1172         }
1173
1174
1175         talloc_free(lines);
1176         return 0;
1177 }
1178
1179 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1180                               const char *iface,
1181                               const char *ip)
1182 {
1183         struct ctdb_vnn *svnn;
1184         struct ctdb_interface *cur = NULL;
1185         bool ok;
1186         int ret;
1187
1188         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1189         CTDB_NO_MEMORY(ctdb, svnn);
1190
1191         svnn->ifaces = talloc_array(svnn, const char *, 2);
1192         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1193         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1194         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1195         svnn->ifaces[1] = NULL;
1196
1197         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1198         if (!ok) {
1199                 talloc_free(svnn);
1200                 return -1;
1201         }
1202
1203         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1204         if (ret != 0) {
1205                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1206                                    "for single_ip[%s]\n",
1207                                    svnn->ifaces[0],
1208                                    ctdb_addr_to_str(&svnn->public_address)));
1209                 talloc_free(svnn);
1210                 return -1;
1211         }
1212
1213         /* assume the single public ip interface is initially "good" */
1214         cur = ctdb_find_iface(ctdb, iface);
1215         if (cur == NULL) {
1216                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1217                 return -1;
1218         }
1219         cur->link_up = true;
1220
1221         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1222         if (ret != 0) {
1223                 talloc_free(svnn);
1224                 return -1;
1225         }
1226
1227         ctdb->single_ip_vnn = svnn;
1228         return 0;
1229 }
1230
1231 static void *add_ip_callback(void *parm, void *data)
1232 {
1233         struct public_ip_list *this_ip = parm;
1234         struct public_ip_list *prev_ip = data;
1235
1236         if (prev_ip == NULL) {
1237                 return parm;
1238         }
1239         if (this_ip->pnn == -1) {
1240                 this_ip->pnn = prev_ip->pnn;
1241         }
1242
1243         return parm;
1244 }
1245
1246 static int getips_count_callback(void *param, void *data)
1247 {
1248         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1249         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1250
1251         new_ip->next = *ip_list;
1252         *ip_list     = new_ip;
1253         return 0;
1254 }
1255
1256 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1257                                        struct ctdb_public_ip_list *ips,
1258                                        uint32_t pnn);
1259
1260 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1261                                          struct ipalloc_state *ipalloc_state,
1262                                          struct ctdb_node_map_old *nodemap)
1263 {
1264         int j;
1265         int ret;
1266         struct ctdb_public_ip_list_old *ip_list;
1267
1268         if (ipalloc_state->num != nodemap->num) {
1269                 DEBUG(DEBUG_ERR,
1270                       (__location__
1271                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1272                        ipalloc_state->num, nodemap->num));
1273                 return -1;
1274         }
1275
1276         for (j=0; j<nodemap->num; j++) {
1277                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1278                         continue;
1279                 }
1280
1281                 /* Retrieve the list of known public IPs from the node */
1282                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1283                                         TAKEOVER_TIMEOUT(),
1284                                         j,
1285                                         ipalloc_state->known_public_ips,
1286                                         0,
1287                                         &ip_list);
1288                 if (ret != 0) {
1289                         DEBUG(DEBUG_ERR,
1290                               ("Failed to read known public IPs from node: %u\n",
1291                                j));
1292                         return -1;
1293                 }
1294                 ipalloc_state->known_public_ips[j].num = ip_list->num;
1295                 /* This could be copied and freed.  However, ip_list
1296                  * is allocated off ipalloc_state->known_public_ips,
1297                  * so this is a safe hack.  This will go away in a
1298                  * while anyway... */
1299                 ipalloc_state->known_public_ips[j].ip = &ip_list->ips[0];
1300
1301                 if (ctdb->do_checkpublicip) {
1302                         verify_remote_ip_allocation(
1303                                 ctdb,
1304                                 &ipalloc_state->known_public_ips[j],
1305                                 j);
1306                 }
1307
1308                 /* Retrieve the list of available public IPs from the node */
1309                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1310                                         TAKEOVER_TIMEOUT(),
1311                                         j,
1312                                         ipalloc_state->available_public_ips,
1313                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1314                                         &ip_list);
1315                 if (ret != 0) {
1316                         DEBUG(DEBUG_ERR,
1317                               ("Failed to read available public IPs from node: %u\n",
1318                                j));
1319                         return -1;
1320                 }
1321                 ipalloc_state->available_public_ips[j].num = ip_list->num;
1322                 /* This could be copied and freed.  However, ip_list
1323                  * is allocated off ipalloc_state->available_public_ips,
1324                  * so this is a safe hack.  This will go away in a
1325                  * while anyway... */
1326                 ipalloc_state->available_public_ips[j].ip = &ip_list->ips[0];
1327         }
1328
1329         return 0;
1330 }
1331
1332 static struct public_ip_list *
1333 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1334 {
1335         int i, j;
1336         struct public_ip_list *ip_list;
1337         struct ctdb_public_ip_list *public_ips;
1338
1339         TALLOC_FREE(ctdb->ip_tree);
1340         ctdb->ip_tree = trbt_create(ctdb, 0);
1341
1342         for (i=0; i < ctdb->num_nodes; i++) {
1343
1344                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1345                         continue;
1346                 }
1347
1348                 /* there were no public ips for this node */
1349                 if (ipalloc_state->known_public_ips == NULL) {
1350                         continue;
1351                 }
1352
1353                 public_ips = &ipalloc_state->known_public_ips[i];
1354
1355                 for (j=0; j < public_ips->num; j++) {
1356                         struct public_ip_list *tmp_ip;
1357
1358                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1359                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1360                         /* Do not use information about IP addresses hosted
1361                          * on other nodes, it may not be accurate */
1362                         if (public_ips->ip[j].pnn == ctdb->nodes[i]->pnn) {
1363                                 tmp_ip->pnn = public_ips->ip[j].pnn;
1364                         } else {
1365                                 tmp_ip->pnn = -1;
1366                         }
1367                         tmp_ip->addr = public_ips->ip[j].addr;
1368                         tmp_ip->next = NULL;
1369
1370                         trbt_insertarray32_callback(ctdb->ip_tree,
1371                                 IP_KEYLEN, ip_key(&public_ips->ip[j].addr),
1372                                 add_ip_callback,
1373                                 tmp_ip);
1374                 }
1375         }
1376
1377         ip_list = NULL;
1378         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1379
1380         return ip_list;
1381 }
1382
1383 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
1384 {
1385         int i;
1386
1387         for (i=0;i<nodemap->num;i++) {
1388                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1389                         /* Found one completely healthy node */
1390                         return false;
1391                 }
1392         }
1393
1394         return true;
1395 }
1396
/* State shared by the callbacks used while reading one tunable from
 * several nodes. */
struct get_tunable_callback_data {
        /* Name of the tunable; used in log messages */
        const char *tunable;
        /* Result array, indexed by the pnn of the replying node */
        uint32_t *out;
        /* Set to true by the callbacks on errors that invalidate the
         * result */
        bool fatal;
};
1402
1403 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
1404                                  int32_t res, TDB_DATA outdata,
1405                                  void *callback)
1406 {
1407         struct get_tunable_callback_data *cd =
1408                 (struct get_tunable_callback_data *)callback;
1409         int size;
1410
1411         if (res != 0) {
1412                 /* Already handled in fail callback */
1413                 return;
1414         }
1415
1416         if (outdata.dsize != sizeof(uint32_t)) {
1417                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
1418                                  cd->tunable, pnn, (int)sizeof(uint32_t),
1419                                  (int)outdata.dsize));
1420                 cd->fatal = true;
1421                 return;
1422         }
1423
1424         size = talloc_array_length(cd->out);
1425         if (pnn >= size) {
1426                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
1427                                  cd->tunable, pnn, size));
1428                 return;
1429         }
1430
1431                 
1432         cd->out[pnn] = *(uint32_t *)outdata.dptr;
1433 }
1434
1435 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1436                                        int32_t res, TDB_DATA outdata,
1437                                        void *callback)
1438 {
1439         struct get_tunable_callback_data *cd =
1440                 (struct get_tunable_callback_data *)callback;
1441
1442         switch (res) {
1443         case -ETIME:
1444                 DEBUG(DEBUG_ERR,
1445                       ("Timed out getting tunable \"%s\" from node %d\n",
1446                        cd->tunable, pnn));
1447                 cd->fatal = true;
1448                 break;
1449         case -EINVAL:
1450         case -1:
1451                 DEBUG(DEBUG_WARNING,
1452                       ("Tunable \"%s\" not implemented on node %d\n",
1453                        cd->tunable, pnn));
1454                 break;
1455         default:
1456                 DEBUG(DEBUG_ERR,
1457                       ("Unexpected error getting tunable \"%s\" from node %d\n",
1458                        cd->tunable, pnn));
1459                 cd->fatal = true;
1460         }
1461 }
1462
1463 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
1464                                         TALLOC_CTX *tmp_ctx,
1465                                         struct ctdb_node_map_old *nodemap,
1466                                         const char *tunable,
1467                                         uint32_t default_value)
1468 {
1469         TDB_DATA data;
1470         struct ctdb_control_get_tunable *t;
1471         uint32_t *nodes;
1472         uint32_t *tvals;
1473         struct get_tunable_callback_data callback_data;
1474         int i;
1475
1476         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1477         CTDB_NO_MEMORY_NULL(ctdb, tvals);
1478         for (i=0; i<nodemap->num; i++) {
1479                 tvals[i] = default_value;
1480         }
1481                 
1482         callback_data.out = tvals;
1483         callback_data.tunable = tunable;
1484         callback_data.fatal = false;
1485
1486         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
1487         data.dptr  = talloc_size(tmp_ctx, data.dsize);
1488         t = (struct ctdb_control_get_tunable *)data.dptr;
1489         t->length = strlen(tunable)+1;
1490         memcpy(t->name, tunable, t->length);
1491         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1492         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
1493                                       nodes, 0, TAKEOVER_TIMEOUT(),
1494                                       false, data,
1495                                       get_tunable_callback,
1496                                       get_tunable_fail_callback,
1497                                       &callback_data) != 0) {
1498                 if (callback_data.fatal) {
1499                         talloc_free(tvals);
1500                         tvals = NULL;
1501                 }
1502         }
1503         talloc_free(nodes);
1504         talloc_free(data.dptr);
1505
1506         return tvals;
1507 }
1508
1509 /* Set internal flags for IP allocation:
1510  *   Clear ip flags
1511  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
1512  *   Set NOIPHOST ip flag for each INACTIVE node
1513  *   if all nodes are disabled:
1514  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
1515  *   else
1516  *     Set NOIPHOST ip flags for disabled nodes
1517  */
1518 static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
1519                                  struct ctdb_node_map_old *nodemap,
1520                                  uint32_t *tval_noiptakeover,
1521                                  uint32_t *tval_noiphostonalldisabled)
1522 {
1523         int i;
1524
1525         for (i=0;i<nodemap->num;i++) {
1526                 /* Can not take IPs on node with NoIPTakeover set */
1527                 if (tval_noiptakeover[i] != 0) {
1528                         ipalloc_state->noiptakeover[i] = true;
1529                 }
1530
1531                 /* Can not host IPs on INACTIVE node */
1532                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1533                         ipalloc_state->noiphost[i] = true;
1534                 }
1535         }
1536
1537         if (all_nodes_are_disabled(nodemap)) {
1538                 /* If all nodes are disabled, can not host IPs on node
1539                  * with NoIPHostOnAllDisabled set
1540                  */
1541                 for (i=0;i<nodemap->num;i++) {
1542                         if (tval_noiphostonalldisabled[i] != 0) {
1543                                 ipalloc_state->noiphost[i] = true;
1544                         }
1545                 }
1546         } else {
1547                 /* If some nodes are not disabled, then can not host
1548                  * IPs on DISABLED node
1549                  */
1550                 for (i=0;i<nodemap->num;i++) {
1551                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
1552                                 ipalloc_state->noiphost[i] = true;
1553                         }
1554                 }
1555         }
1556 }
1557
1558 static bool set_ipflags(struct ctdb_context *ctdb,
1559                         struct ipalloc_state *ipalloc_state,
1560                         struct ctdb_node_map_old *nodemap)
1561 {
1562         uint32_t *tval_noiptakeover;
1563         uint32_t *tval_noiphostonalldisabled;
1564
1565         tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1566                                                    "NoIPTakeover", 0);
1567         if (tval_noiptakeover == NULL) {
1568                 return false;
1569         }
1570
1571         tval_noiphostonalldisabled =
1572                 get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1573                                        "NoIPHostOnAllDisabled", 0);
1574         if (tval_noiphostonalldisabled == NULL) {
1575                 /* Caller frees tmp_ctx */
1576                 return false;
1577         }
1578
1579         set_ipflags_internal(ipalloc_state, nodemap,
1580                              tval_noiptakeover,
1581                              tval_noiphostonalldisabled);
1582
1583         talloc_free(tval_noiptakeover);
1584         talloc_free(tval_noiphostonalldisabled);
1585
1586         return true;
1587 }
1588
1589 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
1590                                                  TALLOC_CTX *mem_ctx)
1591 {
1592         struct ipalloc_state *ipalloc_state =
1593                 talloc_zero(mem_ctx, struct ipalloc_state);
1594         if (ipalloc_state == NULL) {
1595                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1596                 return NULL;
1597         }
1598
1599         ipalloc_state->num = ctdb->num_nodes;
1600
1601         ipalloc_state->known_public_ips =
1602                 talloc_zero_array(ipalloc_state,
1603                                   struct ctdb_public_ip_list,
1604                                   ipalloc_state->num);
1605         if (ipalloc_state->known_public_ips == NULL) {
1606                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1607                 goto fail;
1608         }
1609
1610         ipalloc_state->available_public_ips =
1611                 talloc_zero_array(ipalloc_state,
1612                                   struct ctdb_public_ip_list,
1613                                   ipalloc_state->num);
1614         if (ipalloc_state->available_public_ips == NULL) {
1615                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1616                 goto fail;
1617         }
1618         ipalloc_state->noiptakeover =
1619                 talloc_zero_array(ipalloc_state,
1620                                   bool,
1621                                   ipalloc_state->num);
1622         if (ipalloc_state->noiptakeover == NULL) {
1623                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1624                 goto fail;
1625         }
1626         ipalloc_state->noiphost =
1627                 talloc_zero_array(ipalloc_state,
1628                                   bool,
1629                                   ipalloc_state->num);
1630         if (ipalloc_state->noiphost == NULL) {
1631                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1632                 goto fail;
1633         }
1634
1635         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1636                 ipalloc_state->algorithm = IPALLOC_LCP2;
1637         } else if (1 == ctdb->tunable.deterministic_public_ips) {
1638                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
1639         } else {
1640                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
1641         }
1642
1643         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
1644
1645         return ipalloc_state;
1646 fail:
1647         talloc_free(ipalloc_state);
1648         return NULL;
1649 }
1650
1651 struct iprealloc_callback_data {
1652         bool *retry_nodes;
1653         int retry_count;
1654         client_async_callback fail_callback;
1655         void *fail_callback_data;
1656         struct ctdb_node_map_old *nodemap;
1657 };
1658
1659 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1660                                         int32_t res, TDB_DATA outdata,
1661                                         void *callback)
1662 {
1663         int numnodes;
1664         struct iprealloc_callback_data *cd =
1665                 (struct iprealloc_callback_data *)callback;
1666
1667         numnodes = talloc_array_length(cd->retry_nodes);
1668         if (pnn > numnodes) {
1669                 DEBUG(DEBUG_ERR,
1670                       ("ipreallocated failure from node %d, "
1671                        "but only %d nodes in nodemap\n",
1672                        pnn, numnodes));
1673                 return;
1674         }
1675
1676         /* Can't run the "ipreallocated" event on a INACTIVE node */
1677         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
1678                 DEBUG(DEBUG_WARNING,
1679                       ("ipreallocated failed on inactive node %d, ignoring\n",
1680                        pnn));
1681                 return;
1682         }
1683
1684         switch (res) {
1685         case -ETIME:
1686                 /* If the control timed out then that's a real error,
1687                  * so call the real fail callback
1688                  */
1689                 if (cd->fail_callback) {
1690                         cd->fail_callback(ctdb, pnn, res, outdata,
1691                                           cd->fail_callback_data);
1692                 } else {
1693                         DEBUG(DEBUG_WARNING,
1694                               ("iprealloc timed out but no callback registered\n"));
1695                 }
1696                 break;
1697         default:
1698                 /* If not a timeout then either the ipreallocated
1699                  * eventscript (or some setup) failed.  This might
1700                  * have failed because the IPREALLOCATED control isn't
1701                  * implemented - right now there is no way of knowing
1702                  * because the error codes are all folded down to -1.
1703                  * Consider retrying using EVENTSCRIPT control...
1704                  */
1705                 DEBUG(DEBUG_WARNING,
1706                       ("ipreallocated failure from node %d, flagging retry\n",
1707                        pnn));
1708                 cd->retry_nodes[pnn] = true;
1709                 cd->retry_count++;
1710         }
1711 }
1712
1713 struct takeover_callback_data {
1714         bool *node_failed;
1715         client_async_callback fail_callback;
1716         void *fail_callback_data;
1717         struct ctdb_node_map_old *nodemap;
1718 };
1719
1720 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
1721                                        uint32_t node_pnn, int32_t res,
1722                                        TDB_DATA outdata, void *callback_data)
1723 {
1724         struct takeover_callback_data *cd =
1725                 talloc_get_type_abort(callback_data,
1726                                       struct takeover_callback_data);
1727         int i;
1728
1729         for (i = 0; i < cd->nodemap->num; i++) {
1730                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
1731                         break;
1732                 }
1733         }
1734
1735         if (i == cd->nodemap->num) {
1736                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
1737                 return;
1738         }
1739
1740         if (!cd->node_failed[i]) {
1741                 cd->node_failed[i] = true;
1742                 cd->fail_callback(ctdb, node_pnn, res, outdata,
1743                                   cd->fail_callback_data);
1744         }
1745 }
1746
1747 /*
1748  * Recalculate the allocation of public IPs to nodes and have the
1749  * nodes host their allocated addresses.
1750  *
1751  * - Allocate memory for IP allocation state, including per node
1752  *   arrays
1753  * - Populate IP allocation algorithm in IP allocation state
1754  * - Populate local value of tunable NoIPFailback in IP allocation
1755      state - this is really a cluster-wide configuration variable and
1756      only the value form the master node is used
1757  * - Retrieve tunables NoIPTakeover and NoIPHostOnAllDisabled from all
1758  *   connected nodes - this is done separately so tunable values can
1759  *   be faked in unit testing
1760  * - Populate NoIPTakover tunable in IP allocation state
1761  * - Populate NoIPHost in IP allocation state, derived from node flags
1762  *   and NoIPHostOnAllDisabled tunable
1763  * - Retrieve and populate known and available IP lists in IP
1764  *   allocation state
1765  * - If no available IP addresses then early exit
1766  * - Build list of (known IPs, currently assigned node)
1767  * - Populate list of nodes to force rebalance - internal structure,
1768  *   currently no way to fetch, only used by LCP2 for nodes that have
1769  *   had new IP addresses added
1770  * - Run IP allocation algorithm
1771  * - Send RELEASE_IP to all nodes for IPs they should not host
1772  * - Send TAKE_IP to all nodes for IPs they should host
1773  * - Send IPREALLOCATED to all nodes (with backward compatibility hack)
1774  */
1775 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
1776                       uint32_t *force_rebalance_nodes,
1777                       client_async_callback fail_callback, void *callback_data)
1778 {
1779         int i, j, ret;
1780         struct ctdb_public_ip ip;
1781         uint32_t *nodes;
1782         struct public_ip_list *all_ips, *tmp_ip;
1783         TDB_DATA data;
1784         struct timeval timeout;
1785         struct client_async_data *async_data;
1786         struct ctdb_client_control_state *state;
1787         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1788         struct ipalloc_state *ipalloc_state;
1789         struct takeover_callback_data *takeover_data;
1790         struct iprealloc_callback_data iprealloc_data;
1791         bool *retry_data;
1792         bool can_host_ips;
1793
1794         /*
1795          * ip failover is completely disabled, just send out the 
1796          * ipreallocated event.
1797          */
1798         if (ctdb->tunable.disable_ip_failover != 0) {
1799                 goto ipreallocated;
1800         }
1801
1802         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
1803         if (ipalloc_state == NULL) {
1804                 talloc_free(tmp_ctx);
1805                 return -1;
1806         }
1807
1808         if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
1809                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
1810                 talloc_free(tmp_ctx);
1811                 return -1;
1812         }
1813
1814         /* Fetch known/available public IPs from each active node */
1815         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
1816         if (ret != 0) {
1817                 talloc_free(tmp_ctx);
1818                 return -1;
1819         }
1820
1821         /* Short-circuit IP allocation if no node has available IPs */
1822         can_host_ips = false;
1823         for (i=0; i < ipalloc_state->num; i++) {
1824                 if (ipalloc_state->available_public_ips[i].num != 0) {
1825                         can_host_ips = true;
1826                 }
1827         }
1828         if (!can_host_ips) {
1829                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
1830                 return 0;
1831         }
1832
1833         /* since nodes only know about those public addresses that
1834            can be served by that particular node, no single node has
1835            a full list of all public addresses that exist in the cluster.
1836            Walk over all node structures and create a merged list of
1837            all public addresses that exist in the cluster.
1838
1839            keep the tree of ips around as ctdb->ip_tree
1840         */
1841         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
1842         ipalloc_state->all_ips = all_ips;
1843
1844         ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
1845
1846         /* Do the IP reassignment calculations */
1847         ipalloc(ipalloc_state);
1848
1849         /* Now tell all nodes to release any public IPs should not
1850          * host.  This will be a NOOP on nodes that don't currently
1851          * hold the given IP.
1852          */
1853         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
1854         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
1855
1856         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
1857                                                        bool, nodemap->num);
1858         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
1859         takeover_data->fail_callback = fail_callback;
1860         takeover_data->fail_callback_data = callback_data;
1861         takeover_data->nodemap = nodemap;
1862
1863         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1864         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1865
1866         async_data->fail_callback = takeover_run_fail_callback;
1867         async_data->callback_data = takeover_data;
1868
1869         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
1870
1871         /* Send a RELEASE_IP to all nodes that should not be hosting
1872          * each IP.  For each IP, all but one of these will be
1873          * redundant.  However, the redundant ones are used to tell
1874          * nodes which node should be hosting the IP so that commands
1875          * like "ctdb ip" can display a particular nodes idea of who
1876          * is hosting what. */
1877         for (i=0;i<nodemap->num;i++) {
1878                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1879                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1880                         continue;
1881                 }
1882
1883                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1884                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1885                                 /* This node should be serving this
1886                                    vnn so don't tell it to release the ip
1887                                 */
1888                                 continue;
1889                         }
1890                         ip.pnn  = tmp_ip->pnn;
1891                         ip.addr = tmp_ip->addr;
1892
1893                         timeout = TAKEOVER_TIMEOUT();
1894                         data.dsize = sizeof(ip);
1895                         data.dptr  = (uint8_t *)&ip;
1896                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1897                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
1898                                                   data, async_data,
1899                                                   &timeout, NULL);
1900                         if (state == NULL) {
1901                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1902                                 talloc_free(tmp_ctx);
1903                                 return -1;
1904                         }
1905
1906                         ctdb_client_async_add(async_data, state);
1907                 }
1908         }
1909         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1910                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1911                 talloc_free(tmp_ctx);
1912                 return -1;
1913         }
1914         talloc_free(async_data);
1915
1916
1917         /* For each IP, send a TAKOVER_IP to the node that should be
1918          * hosting it.  Many of these will often be redundant (since
1919          * the allocation won't have changed) but they can be useful
1920          * to recover from inconsistencies. */
1921         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1922         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1923
1924         async_data->fail_callback = fail_callback;
1925         async_data->callback_data = callback_data;
1926
1927         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1928                 if (tmp_ip->pnn == -1) {
1929                         /* this IP won't be taken over */
1930                         continue;
1931                 }
1932
1933                 ip.pnn  = tmp_ip->pnn;
1934                 ip.addr = tmp_ip->addr;
1935
1936                 timeout = TAKEOVER_TIMEOUT();
1937                 data.dsize = sizeof(ip);
1938                 data.dptr  = (uint8_t *)&ip;
1939                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1940                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
1941                                           data, async_data, &timeout, NULL);
1942                 if (state == NULL) {
1943                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1944                         talloc_free(tmp_ctx);
1945                         return -1;
1946                 }
1947
1948                 ctdb_client_async_add(async_data, state);
1949         }
1950         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1951                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1952                 talloc_free(tmp_ctx);
1953                 return -1;
1954         }
1955
1956 ipreallocated:
1957         /*
1958          * Tell all nodes to run eventscripts to process the
1959          * "ipreallocated" event.  This can do a lot of things,
1960          * including restarting services to reconfigure them if public
1961          * IPs have moved.  Once upon a time this event only used to
1962          * update natgw.
1963          */
1964         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
1965         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
1966         iprealloc_data.retry_nodes = retry_data;
1967         iprealloc_data.retry_count = 0;
1968         iprealloc_data.fail_callback = fail_callback;
1969         iprealloc_data.fail_callback_data = callback_data;
1970         iprealloc_data.nodemap = nodemap;
1971
1972         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1973         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
1974                                         nodes, 0, TAKEOVER_TIMEOUT(),
1975                                         false, tdb_null,
1976                                         NULL, iprealloc_fail_callback,
1977                                         &iprealloc_data);
1978         if (ret != 0) {
1979                 /* If the control failed then we should retry to any
1980                  * nodes flagged by iprealloc_fail_callback using the
1981                  * EVENTSCRIPT control.  This is a best-effort at
1982                  * backward compatiblity when running a mixed cluster
1983                  * where some nodes have not yet been upgraded to
1984                  * support the IPREALLOCATED control.
1985                  */
1986                 DEBUG(DEBUG_WARNING,
1987                       ("Retry ipreallocated to some nodes using eventscript control\n"));
1988
1989                 nodes = talloc_array(tmp_ctx, uint32_t,
1990                                      iprealloc_data.retry_count);
1991                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
1992
1993                 j = 0;
1994                 for (i=0; i<nodemap->num; i++) {
1995                         if (iprealloc_data.retry_nodes[i]) {
1996                                 nodes[j] = i;
1997                                 j++;
1998                         }
1999                 }
2000
2001                 data.dptr  = discard_const("ipreallocated");
2002                 data.dsize = strlen((char *)data.dptr) + 1; 
2003                 ret = ctdb_client_async_control(ctdb,
2004                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2005                                                 nodes, 0, TAKEOVER_TIMEOUT(),
2006                                                 false, data,
2007                                                 NULL, fail_callback,
2008                                                 callback_data);
2009                 if (ret != 0) {
2010                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2011                 }
2012         }
2013
2014         talloc_free(tmp_ctx);
2015         return ret;
2016 }
2017
2018
2019 /*
2020   destroy a ctdb_client_ip structure
2021  */
2022 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2023 {
2024         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2025                 ctdb_addr_to_str(&ip->addr),
2026                 ntohs(ip->addr.ip.sin_port),
2027                 ip->client_id));
2028
2029         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2030         return 0;
2031 }
2032
2033 /*
2034   called by a client to inform us of a TCP connection that it is managing
2035   that should tickled with an ACK when IP takeover is done
2036  */
2037 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2038                                 TDB_DATA indata)
2039 {
2040         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2041         struct ctdb_connection *tcp_sock = NULL;
2042         struct ctdb_tcp_list *tcp;
2043         struct ctdb_connection t;
2044         int ret;
2045         TDB_DATA data;
2046         struct ctdb_client_ip *ip;
2047         struct ctdb_vnn *vnn;
2048         ctdb_sock_addr addr;
2049
2050         /* If we don't have public IPs, tickles are useless */
2051         if (ctdb->vnn == NULL) {
2052                 return 0;
2053         }
2054
2055         tcp_sock = (struct ctdb_connection *)indata.dptr;
2056
2057         addr = tcp_sock->src;
2058         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2059         addr = tcp_sock->dst;
2060         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2061
2062         ZERO_STRUCT(addr);
2063         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2064         vnn = find_public_ip_vnn(ctdb, &addr);
2065         if (vnn == NULL) {
2066                 switch (addr.sa.sa_family) {
2067                 case AF_INET:
2068                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2069                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2070                                         ctdb_addr_to_str(&addr)));
2071                         }
2072                         break;
2073                 case AF_INET6:
2074                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2075                                 ctdb_addr_to_str(&addr)));
2076                         break;
2077                 default:
2078                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2079                 }
2080
2081                 return 0;
2082         }
2083
2084         if (vnn->pnn != ctdb->pnn) {
2085                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2086                         ctdb_addr_to_str(&addr),
2087                         client_id, client->pid));
2088                 /* failing this call will tell smbd to die */
2089                 return -1;
2090         }
2091
2092         ip = talloc(client, struct ctdb_client_ip);
2093         CTDB_NO_MEMORY(ctdb, ip);
2094
2095         ip->ctdb      = ctdb;
2096         ip->addr      = addr;
2097         ip->client_id = client_id;
2098         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2099         DLIST_ADD(ctdb->client_ip_list, ip);
2100
2101         tcp = talloc(client, struct ctdb_tcp_list);
2102         CTDB_NO_MEMORY(ctdb, tcp);
2103
2104         tcp->connection.src = tcp_sock->src;
2105         tcp->connection.dst = tcp_sock->dst;
2106
2107         DLIST_ADD(client->tcp_list, tcp);
2108
2109         t.src = tcp_sock->src;
2110         t.dst = tcp_sock->dst;
2111
2112         data.dptr = (uint8_t *)&t;
2113         data.dsize = sizeof(t);
2114
2115         switch (addr.sa.sa_family) {
2116         case AF_INET:
2117                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2118                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2119                         ctdb_addr_to_str(&tcp_sock->src),
2120                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2121                 break;
2122         case AF_INET6:
2123                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2124                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2125                         ctdb_addr_to_str(&tcp_sock->src),
2126                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2127                 break;
2128         default:
2129                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2130         }
2131
2132
2133         /* tell all nodes about this tcp connection */
2134         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2135                                        CTDB_CONTROL_TCP_ADD,
2136                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2137         if (ret != 0) {
2138                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2139                 return -1;
2140         }
2141
2142         return 0;
2143 }
2144
2145 /*
2146   find a tcp address on a list
2147  */
2148 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2149                                            struct ctdb_connection *tcp)
2150 {
2151         int i;
2152
2153         if (array == NULL) {
2154                 return NULL;
2155         }
2156
2157         for (i=0;i<array->num;i++) {
2158                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2159                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2160                         return &array->connections[i];
2161                 }
2162         }
2163         return NULL;
2164 }
2165
2166
2167
2168 /*
2169   called by a daemon to inform us of a TCP connection that one of its
2170   clients managing that should tickled with an ACK when IP takeover is
2171   done
2172  */
2173 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2174 {
2175         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2176         struct ctdb_tcp_array *tcparray;
2177         struct ctdb_connection tcp;
2178         struct ctdb_vnn *vnn;
2179
2180         /* If we don't have public IPs, tickles are useless */
2181         if (ctdb->vnn == NULL) {
2182                 return 0;
2183         }
2184
2185         vnn = find_public_ip_vnn(ctdb, &p->dst);
2186         if (vnn == NULL) {
2187                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2188                         ctdb_addr_to_str(&p->dst)));
2189
2190                 return -1;
2191         }
2192
2193
2194         tcparray = vnn->tcp_array;
2195
2196         /* If this is the first tickle */
2197         if (tcparray == NULL) {
2198                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2199                 CTDB_NO_MEMORY(ctdb, tcparray);
2200                 vnn->tcp_array = tcparray;
2201
2202                 tcparray->num = 0;
2203                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2204                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2205
2206                 tcparray->connections[tcparray->num].src = p->src;
2207                 tcparray->connections[tcparray->num].dst = p->dst;
2208                 tcparray->num++;
2209
2210                 if (tcp_update_needed) {
2211                         vnn->tcp_update_needed = true;
2212                 }
2213                 return 0;
2214         }
2215
2216
2217         /* Do we already have this tickle ?*/
2218         tcp.src = p->src;
2219         tcp.dst = p->dst;
2220         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2221                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2222                         ctdb_addr_to_str(&tcp.dst),
2223                         ntohs(tcp.dst.ip.sin_port),
2224                         vnn->pnn));
2225                 return 0;
2226         }
2227
2228         /* A new tickle, we must add it to the array */
2229         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2230                                         struct ctdb_connection,
2231                                         tcparray->num+1);
2232         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2233
2234         tcparray->connections[tcparray->num].src = p->src;
2235         tcparray->connections[tcparray->num].dst = p->dst;
2236         tcparray->num++;
2237
2238         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2239                 ctdb_addr_to_str(&tcp.dst),
2240                 ntohs(tcp.dst.ip.sin_port),
2241                 vnn->pnn));
2242
2243         if (tcp_update_needed) {
2244                 vnn->tcp_update_needed = true;
2245         }
2246
2247         return 0;
2248 }
2249
2250
2251 static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
2252 {
2253         struct ctdb_connection *tcpp;
2254
2255         if (vnn == NULL) {
2256                 return;
2257         }
2258
2259         /* if the array is empty we cant remove it
2260            and we don't need to do anything
2261          */
2262         if (vnn->tcp_array == NULL) {
2263                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2264                         ctdb_addr_to_str(&conn->dst),
2265                         ntohs(conn->dst.ip.sin_port)));
2266                 return;
2267         }
2268
2269
2270         /* See if we know this connection
2271            if we don't know this connection  then we dont need to do anything
2272          */
2273         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2274         if (tcpp == NULL) {
2275                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2276                         ctdb_addr_to_str(&conn->dst),
2277                         ntohs(conn->dst.ip.sin_port)));
2278                 return;
2279         }
2280
2281
2282         /* We need to remove this entry from the array.
2283            Instead of allocating a new array and copying data to it
2284            we cheat and just copy the last entry in the existing array
2285            to the entry that is to be removed and just shring the 
2286            ->num field
2287          */
2288         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2289         vnn->tcp_array->num--;
2290
2291         /* If we deleted the last entry we also need to remove the entire array
2292          */
2293         if (vnn->tcp_array->num == 0) {
2294                 talloc_free(vnn->tcp_array);
2295                 vnn->tcp_array = NULL;
2296         }               
2297
2298         vnn->tcp_update_needed = true;
2299
2300         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2301                 ctdb_addr_to_str(&conn->src),
2302                 ntohs(conn->src.ip.sin_port)));
2303 }
2304
2305
2306 /*
2307   called by a daemon to inform us of a TCP connection that one of its
2308   clients used are no longer needed in the tickle database
2309  */
2310 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2311 {
2312         struct ctdb_vnn *vnn;
2313         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
2314
2315         /* If we don't have public IPs, tickles are useless */
2316         if (ctdb->vnn == NULL) {
2317                 return 0;
2318         }
2319
2320         vnn = find_public_ip_vnn(ctdb, &conn->dst);
2321         if (vnn == NULL) {
2322                 DEBUG(DEBUG_ERR,
2323                       (__location__ " unable to find public address %s\n",
2324                        ctdb_addr_to_str(&conn->dst)));
2325                 return 0;
2326         }
2327
2328         ctdb_remove_connection(vnn, conn);
2329
2330         return 0;
2331 }
2332
2333
2334 /*
2335   Called when another daemon starts - causes all tickles for all
2336   public addresses we are serving to be sent to the new node on the
2337   next check.  This actually causes the next scheduled call to
2338   tdb_update_tcp_tickles() to update all nodes.  This is simple and
2339   doesn't require careful error handling.
2340  */
2341 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
2342 {
2343         struct ctdb_vnn *vnn;
2344
2345         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
2346                            (unsigned long) pnn));
2347
2348         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
2349                 vnn->tcp_update_needed = true;
2350         }
2351
2352         return 0;
2353 }
2354
2355
2356 /*
2357   called when a client structure goes away - hook to remove
2358   elements from the tcp_list in all daemons
2359  */
2360 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2361 {
2362         while (client->tcp_list) {
2363                 struct ctdb_vnn *vnn;
2364                 struct ctdb_tcp_list *tcp = client->tcp_list;
2365                 struct ctdb_connection *conn = &tcp->connection;
2366
2367                 DLIST_REMOVE(client->tcp_list, tcp);
2368
2369                 vnn = find_public_ip_vnn(client->ctdb,
2370                                          &conn->dst);
2371                 if (vnn == NULL) {
2372                         DEBUG(DEBUG_ERR,
2373                               (__location__ " unable to find public address %s\n",
2374                                ctdb_addr_to_str(&conn->dst)));
2375                         continue;
2376                 }
2377
2378                 /* If the IP address is hosted on this node then
2379                  * remove the connection. */
2380                 if (vnn->pnn == client->ctdb->pnn) {
2381                         ctdb_remove_connection(vnn, conn);
2382                 }
2383
2384                 /* Otherwise this function has been called because the
2385                  * server IP address has been released to another node
2386                  * and the client has exited.  This means that we
2387                  * should not delete the connection information.  The
2388                  * takeover node processes connections too. */
2389         }
2390 }
2391
2392
2393 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2394 {
2395         struct ctdb_vnn *vnn;
2396         int count = 0;
2397
2398         if (ctdb->tunable.disable_ip_failover == 1) {
2399                 return;
2400         }
2401
2402         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2403                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2404                         ctdb_vnn_unassign_iface(ctdb, vnn);
2405                         continue;
2406                 }
2407                 if (!vnn->iface) {
2408                         continue;
2409                 }
2410
2411                 /* Don't allow multiple releases at once.  Some code,
2412                  * particularly ctdb_tickle_sentenced_connections() is
2413                  * not re-entrant */
2414                 if (vnn->update_in_flight) {
2415                         DEBUG(DEBUG_WARNING,
2416                               (__location__
2417                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
2418                                     ctdb_addr_to_str(&vnn->public_address),
2419                                     vnn->public_netmask_bits,
2420                                     ctdb_vnn_iface_string(vnn)));
2421                         continue;
2422                 }
2423                 vnn->update_in_flight = true;
2424
2425                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
2426                                     ctdb_addr_to_str(&vnn->public_address),
2427                                     vnn->public_netmask_bits,
2428                                     ctdb_vnn_iface_string(vnn)));
2429
2430                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2431                                   ctdb_vnn_iface_string(vnn),
2432                                   ctdb_addr_to_str(&vnn->public_address),
2433                                   vnn->public_netmask_bits);
2434                 release_kill_clients(ctdb, &vnn->public_address);
2435                 ctdb_vnn_unassign_iface(ctdb, vnn);
2436                 vnn->update_in_flight = false;
2437                 count++;
2438         }
2439
2440         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
2441 }
2442
2443
2444 /*
2445   get list of public IPs
2446  */
2447 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2448                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
2449 {
2450         int i, num, len;
2451         struct ctdb_public_ip_list_old *ips;
2452         struct ctdb_vnn *vnn;
2453         bool only_available = false;
2454
2455         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2456                 only_available = true;
2457         }
2458
2459         /* count how many public ip structures we have */
2460         num = 0;
2461         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2462                 num++;
2463         }
2464
2465         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2466                 num*sizeof(struct ctdb_public_ip);
2467         ips = talloc_zero_size(outdata, len);
2468         CTDB_NO_MEMORY(ctdb, ips);
2469
2470         i = 0;
2471         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2472                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2473                         continue;
2474                 }
2475                 ips->ips[i].pnn  = vnn->pnn;
2476                 ips->ips[i].addr = vnn->public_address;
2477                 i++;
2478         }
2479         ips->num = i;
2480         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2481                 i*sizeof(struct ctdb_public_ip);
2482
2483         outdata->dsize = len;
2484         outdata->dptr  = (uint8_t *)ips;
2485
2486         return 0;
2487 }
2488
2489
2490 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2491                                         struct ctdb_req_control_old *c,
2492                                         TDB_DATA indata,
2493                                         TDB_DATA *outdata)
2494 {
2495         int i, num, len;
2496         ctdb_sock_addr *addr;
2497         struct ctdb_public_ip_info_old *info;
2498         struct ctdb_vnn *vnn;
2499
2500         addr = (ctdb_sock_addr *)indata.dptr;
2501
2502         vnn = find_public_ip_vnn(ctdb, addr);
2503         if (vnn == NULL) {
2504                 /* if it is not a public ip   it could be our 'single ip' */
2505                 if (ctdb->single_ip_vnn) {
2506                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2507                                 vnn = ctdb->single_ip_vnn;
2508                         }
2509                 }
2510         }
2511         if (vnn == NULL) {
2512                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2513                                  "'%s'not a public address\n",
2514                                  ctdb_addr_to_str(addr)));
2515                 return -1;
2516         }
2517
2518         /* count how many public ip structures we have */
2519         num = 0;
2520         for (;vnn->ifaces[num];) {
2521                 num++;
2522         }
2523
2524         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2525                 num*sizeof(struct ctdb_iface);
2526         info = talloc_zero_size(outdata, len);
2527         CTDB_NO_MEMORY(ctdb, info);
2528
2529         info->ip.addr = vnn->public_address;
2530         info->ip.pnn = vnn->pnn;
2531         info->active_idx = 0xFFFFFFFF;
2532
2533         for (i=0; vnn->ifaces[i]; i++) {
2534                 struct ctdb_interface *cur;
2535
2536                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2537                 if (cur == NULL) {
2538                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2539                                            vnn->ifaces[i]));
2540                         return -1;
2541                 }
2542                 if (vnn->iface == cur) {
2543                         info->active_idx = i;
2544                 }
2545                 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
2546                 info->ifaces[i].link_state = cur->link_up;
2547                 info->ifaces[i].references = cur->references;
2548         }
2549         info->num = i;
2550         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2551                 i*sizeof(struct ctdb_iface);
2552
2553         outdata->dsize = len;
2554         outdata->dptr  = (uint8_t *)info;
2555
2556         return 0;
2557 }
2558
2559 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2560                                 struct ctdb_req_control_old *c,
2561                                 TDB_DATA *outdata)
2562 {
2563         int i, num, len;
2564         struct ctdb_iface_list_old *ifaces;
2565         struct ctdb_interface *cur;
2566
2567         /* count how many public ip structures we have */
2568         num = 0;
2569         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2570                 num++;
2571         }
2572
2573         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2574                 num*sizeof(struct ctdb_iface);
2575         ifaces = talloc_zero_size(outdata, len);
2576         CTDB_NO_MEMORY(ctdb, ifaces);
2577
2578         i = 0;
2579         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2580                 strcpy(ifaces->ifaces[i].name, cur->name);
2581                 ifaces->ifaces[i].link_state = cur->link_up;
2582                 ifaces->ifaces[i].references = cur->references;
2583                 i++;
2584         }
2585         ifaces->num = i;
2586         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2587                 i*sizeof(struct ctdb_iface);
2588
2589         outdata->dsize = len;
2590         outdata->dptr  = (uint8_t *)ifaces;
2591
2592         return 0;
2593 }
2594
2595 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2596                                     struct ctdb_req_control_old *c,
2597                                     TDB_DATA indata)
2598 {
2599         struct ctdb_iface *info;
2600         struct ctdb_interface *iface;
2601         bool link_up = false;
2602
2603         info = (struct ctdb_iface *)indata.dptr;
2604
2605         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2606                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2607                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2608                                   len, len, info->name));
2609                 return -1;
2610         }
2611
2612         switch (info->link_state) {
2613         case 0:
2614                 link_up = false;
2615                 break;
2616         case 1:
2617                 link_up = true;
2618                 break;
2619         default:
2620                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2621                                   (unsigned int)info->link_state));
2622                 return -1;
2623         }
2624
2625         if (info->references != 0) {
2626                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2627                                   (unsigned int)info->references));
2628                 return -1;
2629         }
2630
2631         iface = ctdb_find_iface(ctdb, info->name);
2632         if (iface == NULL) {
2633                 return -1;
2634         }
2635
2636         if (link_up == iface->link_up) {
2637                 return 0;
2638         }
2639
2640         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2641               ("iface[%s] has changed it's link status %s => %s\n",
2642                iface->name,
2643                iface->link_up?"up":"down",
2644                link_up?"up":"down"));
2645
2646         iface->link_up = link_up;
2647         return 0;
2648 }
2649
2650
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill.  One instance hangs off a vnn
   (vnn->killtcp) while there are connections left to reset.
*/
struct ctdb_kill_tcp {
        /* vnn this state was created for; vnn->killtcp points back here */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(), -1 until
           it has been opened */
        int capture_fd;
        /* read event on capture_fd, handled by capture_tcp_handler() */
        struct tevent_fd *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque pointer filled in by ctdb_sys_open_capture_socket() and
           passed on to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
2663
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        /* the two canonicalized endpoints of the connection */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of retry tickles sent by tickle_connection_traverse();
           the connection is given up on once this reaches 5 */
        int count;
        /* the ctdb_kill_tcp this connection belongs to */
        struct ctdb_kill_tcp *killtcp;
};
2673
2674 /* this function is used to create a key to represent this socketpair
2675    in the killtcp tree.
2676    this key is used to insert and lookup matching socketpairs that are
2677    to be tickled and RST
2678 */
2679 #define KILLTCP_KEYLEN  10
2680 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2681 {
2682         static uint32_t key[KILLTCP_KEYLEN];
2683
2684         bzero(key, sizeof(key));
2685
2686         if (src->sa.sa_family != dst->sa.sa_family) {
2687                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2688                 return key;
2689         }
2690         
2691         switch (src->sa.sa_family) {
2692         case AF_INET:
2693                 key[0]  = dst->ip.sin_addr.s_addr;
2694                 key[1]  = src->ip.sin_addr.s_addr;
2695                 key[2]  = dst->ip.sin_port;
2696                 key[3]  = src->ip.sin_port;
2697                 break;
2698         case AF_INET6: {
2699                 uint32_t *dst6_addr32 =
2700                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2701                 uint32_t *src6_addr32 =
2702                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2703                 key[0]  = dst6_addr32[3];
2704                 key[1]  = src6_addr32[3];
2705                 key[2]  = dst6_addr32[2];
2706                 key[3]  = src6_addr32[2];
2707                 key[4]  = dst6_addr32[1];
2708                 key[5]  = src6_addr32[1];
2709                 key[6]  = dst6_addr32[0];
2710                 key[7]  = src6_addr32[0];
2711                 key[8]  = dst->ip6.sin6_port;
2712                 key[9]  = src->ip6.sin6_port;
2713                 break;
2714         }
2715         default:
2716                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2717                 return key;
2718         }
2719
2720         return key;
2721 }
2722
2723 /*
2724   called when we get a read event on the raw socket
2725  */
2726 static void capture_tcp_handler(struct tevent_context *ev,
2727                                 struct tevent_fd *fde,
2728                                 uint16_t flags, void *private_data)
2729 {
2730         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2731         struct ctdb_killtcp_con *con;
2732         ctdb_sock_addr src, dst;
2733         uint32_t ack_seq, seq;
2734
2735         if (!(flags & TEVENT_FD_READ)) {
2736                 return;
2737         }
2738
2739         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2740                                 killtcp->private_data,
2741                                 &src, &dst,
2742                                 &ack_seq, &seq) != 0) {
2743                 /* probably a non-tcp ACK packet */
2744                 return;
2745         }
2746
2747         /* check if we have this guy in our list of connections
2748            to kill
2749         */
2750         con = trbt_lookuparray32(killtcp->connections, 
2751                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2752         if (con == NULL) {
2753                 /* no this was some other packet we can just ignore */
2754                 return;
2755         }
2756
2757         /* This one has been tickled !
2758            now reset him and remove him from the list.
2759          */
2760         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2761                 ntohs(con->dst_addr.ip.sin_port),
2762                 ctdb_addr_to_str(&con->src_addr),
2763                 ntohs(con->src_addr.ip.sin_port)));
2764
2765         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2766         talloc_free(con);
2767 }
2768
2769
2770 /* when traversing the list of all tcp connections to send tickle acks to
2771    (so that we can capture the ack coming back and kill the connection
2772     by a RST)
2773    this callback is called for each connection we are currently trying to kill
2774 */
2775 static int tickle_connection_traverse(void *param, void *data)
2776 {
2777         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2778
2779         /* have tried too many times, just give up */
2780         if (con->count >= 5) {
2781                 /* can't delete in traverse: reparent to delete_cons */
2782                 talloc_steal(param, con);
2783                 return 0;
2784         }
2785
2786         /* othervise, try tickling it again */
2787         con->count++;
2788         ctdb_sys_send_tcp(
2789                 (ctdb_sock_addr *)&con->dst_addr,
2790                 (ctdb_sock_addr *)&con->src_addr,
2791                 0, 0, 0);
2792         return 0;
2793 }
2794
2795
2796 /* 
2797    called every second until all sentenced connections have been reset
2798  */
2799 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
2800                                               struct tevent_timer *te,
2801                                               struct timeval t, void *private_data)
2802 {
2803         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2804         void *delete_cons = talloc_new(NULL);
2805
2806         /* loop over all connections sending tickle ACKs */
2807         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2808
2809         /* now we've finished traverse, it's safe to do deletion. */
2810         talloc_free(delete_cons);
2811
2812         /* If there are no more connections to kill we can remove the
2813            entire killtcp structure
2814          */
2815         if ( (killtcp->connections == NULL) || 
2816              (killtcp->connections->root == NULL) ) {
2817                 talloc_free(killtcp);
2818                 return;
2819         }
2820
2821         /* try tickling them again in a seconds time
2822          */
2823         tevent_add_timer(killtcp->ctdb->ev, killtcp,
2824                          timeval_current_ofs(1, 0),
2825                          ctdb_tickle_sentenced_connections, killtcp);
2826 }
2827
2828 /*
2829   destroy the killtcp structure
2830  */
2831 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2832 {
2833         struct ctdb_vnn *tmpvnn;
2834
2835         /* verify that this vnn is still active */
2836         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
2837                 if (tmpvnn == killtcp->vnn) {
2838                         break;
2839                 }
2840         }
2841
2842         if (tmpvnn == NULL) {
2843                 return 0;
2844         }
2845
2846         if (killtcp->vnn->killtcp != killtcp) {
2847                 return 0;
2848         }
2849
2850         killtcp->vnn->killtcp = NULL;
2851
2852         return 0;
2853 }
2854
2855
/* Callback handed to trbt_insertarray32_callback() when a connection
   is added to the killtcp tree.

   nothing fancy here, just unconditionally replace any existing
   connection structure with the new one.

   don't even free the old one if it did exist, that one is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new data 
   is deleted

   parm is the new connection structure and is returned unchanged;
   data is ignored (presumably the entry already stored under the
   key - TODO confirm against the rb_tree implementation).
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        return parm;
}
2867
2868 /*
2869   add a tcp socket to the list of connections we want to RST
2870  */
2871 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2872                                        ctdb_sock_addr *s,
2873                                        ctdb_sock_addr *d)
2874 {
2875         ctdb_sock_addr src, dst;
2876         struct ctdb_kill_tcp *killtcp;
2877         struct ctdb_killtcp_con *con;
2878         struct ctdb_vnn *vnn;
2879
2880         ctdb_canonicalize_ip(s, &src);
2881         ctdb_canonicalize_ip(d, &dst);
2882
2883         vnn = find_public_ip_vnn(ctdb, &dst);
2884         if (vnn == NULL) {
2885                 vnn = find_public_ip_vnn(ctdb, &src);
2886         }
2887         if (vnn == NULL) {
2888                 /* if it is not a public ip   it could be our 'single ip' */
2889                 if (ctdb->single_ip_vnn) {
2890                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2891                                 vnn = ctdb->single_ip_vnn;
2892                         }
2893                 }
2894         }
2895         if (vnn == NULL) {
2896                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2897                 return -1;
2898         }
2899
2900         killtcp = vnn->killtcp;
2901         
2902         /* If this is the first connection to kill we must allocate
2903            a new structure
2904          */
2905         if (killtcp == NULL) {
2906                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
2907                 CTDB_NO_MEMORY(ctdb, killtcp);
2908
2909                 killtcp->vnn         = vnn;
2910                 killtcp->ctdb        = ctdb;
2911                 killtcp->capture_fd  = -1;
2912                 killtcp->connections = trbt_create(killtcp, 0);
2913
2914                 vnn->killtcp         = killtcp;
2915                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2916         }
2917
2918
2919
2920         /* create a structure that describes this connection we want to
2921            RST and store it in killtcp->connections
2922         */
2923         con = talloc(killtcp, struct ctdb_killtcp_con);
2924         CTDB_NO_MEMORY(ctdb, con);
2925         con->src_addr = src;
2926         con->dst_addr = dst;
2927         con->count    = 0;
2928         con->killtcp  = killtcp;
2929
2930
2931         trbt_insertarray32_callback(killtcp->connections,
2932                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2933                         add_killtcp_callback, con);
2934
2935         /* 
2936            If we don't have a socket to listen on yet we must create it
2937          */
2938         if (killtcp->capture_fd == -1) {
2939                 const char *iface = ctdb_vnn_iface_string(vnn);
2940                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2941                 if (killtcp->capture_fd == -1) {
2942                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2943                                           "socket on iface '%s' for killtcp (%s)\n",
2944                                           iface, strerror(errno)));
2945                         goto failed;
2946                 }
2947         }
2948
2949
2950         if (killtcp->fde == NULL) {
2951                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
2952                                              killtcp->capture_fd,
2953                                              TEVENT_FD_READ,
2954                                              capture_tcp_handler, killtcp);
2955                 tevent_fd_set_auto_close(killtcp->fde);
2956
2957                 /* We also need to set up some events to tickle all these connections
2958                    until they are all reset
2959                 */
2960                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
2961                                  ctdb_tickle_sentenced_connections, killtcp);
2962         }
2963
2964         /* tickle him once now */
2965         ctdb_sys_send_tcp(
2966                 &con->dst_addr,
2967                 &con->src_addr,
2968                 0, 0, 0);
2969
2970         return 0;
2971
2972 failed:
2973         talloc_free(vnn->killtcp);
2974         vnn->killtcp = NULL;
2975         return -1;
2976 }
2977
2978 /*
2979   kill a TCP connection.
2980  */
2981 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2982 {
2983         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
2984
2985         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
2986 }
2987
2988 /*
2989   called by a daemon to inform us of the entire list of TCP tickles for
2990   a particular public address.
2991   this control should only be sent by the node that is currently serving
2992   that public address.
2993  */
2994 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2995 {
2996         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
2997         struct ctdb_tcp_array *tcparray;
2998         struct ctdb_vnn *vnn;
2999
3000         /* We must at least have tickles.num or else we cant verify the size
3001            of the received data blob
3002          */
3003         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3004                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3005                 return -1;
3006         }
3007
3008         /* verify that the size of data matches what we expect */
3009         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3010                          + sizeof(struct ctdb_connection) * list->num) {
3011                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3012                 return -1;
3013         }
3014
3015         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3016                            ctdb_addr_to_str(&list->addr)));
3017
3018         vnn = find_public_ip_vnn(ctdb, &list->addr);
3019         if (vnn == NULL) {
3020                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3021                         ctdb_addr_to_str(&list->addr)));
3022
3023                 return 1;
3024         }
3025
3026         if (vnn->pnn == ctdb->pnn) {
3027                 DEBUG(DEBUG_INFO,
3028                       ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
3029                        ctdb_addr_to_str(&list->addr)));
3030                 return 0;
3031         }
3032
3033         /* remove any old ticklelist we might have */
3034         talloc_free(vnn->tcp_array);
3035         vnn->tcp_array = NULL;
3036
3037         tcparray = talloc(vnn, struct ctdb_tcp_array);
3038         CTDB_NO_MEMORY(ctdb, tcparray);
3039
3040         tcparray->num = list->num;
3041
3042         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3043         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3044
3045         memcpy(tcparray->connections, &list->connections[0],
3046                sizeof(struct ctdb_connection)*tcparray->num);
3047
3048         /* We now have a new fresh tickle list array for this vnn */
3049         vnn->tcp_array = tcparray;
3050
3051         return 0;
3052 }
3053
3054 /*
3055   called to return the full list of tickles for the puclic address associated 
3056   with the provided vnn
3057  */
3058 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3059 {
3060         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3061         struct ctdb_tickle_list_old *list;
3062         struct ctdb_tcp_array *tcparray;
3063         int num;
3064         struct ctdb_vnn *vnn;
3065
3066         vnn = find_public_ip_vnn(ctdb, addr);
3067         if (vnn == NULL) {
3068                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3069                         ctdb_addr_to_str(addr)));
3070
3071                 return 1;
3072         }
3073
3074         tcparray = vnn->tcp_array;
3075         if (tcparray) {
3076                 num = tcparray->num;
3077         } else {
3078                 num = 0;
3079         }
3080
3081         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3082                         + sizeof(struct ctdb_connection) * num;
3083
3084         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3085         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3086         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3087
3088         list->addr = *addr;
3089         list->num = num;
3090         if (num) {
3091                 memcpy(&list->connections[0], tcparray->connections,
3092                         sizeof(struct ctdb_connection) * num);
3093         }
3094
3095         return 0;
3096 }
3097
3098
3099 /*
3100   set the list of all tcp tickles for a public address
3101  */
3102 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3103                                             ctdb_sock_addr *addr,
3104                                             struct ctdb_tcp_array *tcparray)
3105 {
3106         int ret, num;
3107         TDB_DATA data;
3108         struct ctdb_tickle_list_old *list;
3109
3110         if (tcparray) {
3111                 num = tcparray->num;
3112         } else {
3113                 num = 0;
3114         }
3115
3116         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3117                         sizeof(struct ctdb_connection) * num;
3118         data.dptr = talloc_size(ctdb, data.dsize);
3119         CTDB_NO_MEMORY(ctdb, data.dptr);
3120
3121         list = (struct ctdb_tickle_list_old *)data.dptr;
3122         list->addr = *addr;
3123         list->num = num;
3124         if (tcparray) {
3125                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3126         }
3127
3128         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3129                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3130                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3131         if (ret != 0) {
3132                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3133                 return -1;
3134         }
3135
3136         talloc_free(data.dptr);
3137
3138         return ret;
3139 }
3140
3141
3142 /*
3143   perform tickle updates if required
3144  */
3145 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3146                                     struct tevent_timer *te,
3147                                     struct timeval t, void *private_data)
3148 {
3149         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3150         int ret;
3151         struct ctdb_vnn *vnn;
3152
3153         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3154                 /* we only send out updates for public addresses that 
3155                    we have taken over
3156                  */
3157                 if (ctdb->pnn != vnn->pnn) {
3158                         continue;
3159                 }
3160                 /* We only send out the updates if we need to */
3161                 if (!vnn->tcp_update_needed) {
3162                         continue;
3163                 }
3164                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3165                                                        &vnn->public_address,
3166                                                        vnn->tcp_array);
3167                 if (ret != 0) {
3168                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3169                                 ctdb_addr_to_str(&vnn->public_address)));
3170                 } else {
3171                         DEBUG(DEBUG_INFO,
3172                               ("Sent tickle update for public address %s\n",
3173                                ctdb_addr_to_str(&vnn->public_address)));
3174                         vnn->tcp_update_needed = false;
3175                 }
3176         }
3177
3178         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3179                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3180                          ctdb_update_tcp_tickles, ctdb);
3181 }
3182
3183 /*
3184   start periodic update of tcp tickles
3185  */
3186 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3187 {
3188         ctdb->tickle_update_context = talloc_new(ctdb);
3189
3190         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3191                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3192                          ctdb_update_tcp_tickles, ctdb);
3193 }
3194
3195
3196
3197
3198 struct control_gratious_arp {
3199         struct ctdb_context *ctdb;
3200         ctdb_sock_addr addr;
3201         const char *iface;
3202         int count;
3203 };
3204
3205 /*
3206   send a control_gratuitous arp
3207  */
3208 static void send_gratious_arp(struct tevent_context *ev,
3209                               struct tevent_timer *te,
3210                               struct timeval t, void *private_data)
3211 {
3212         int ret;
3213         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3214                                                         struct control_gratious_arp);
3215
3216         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3217         if (ret != 0) {
3218                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3219                                  arp->iface, strerror(errno)));
3220         }
3221
3222
3223         arp->count++;
3224         if (arp->count == CTDB_ARP_REPEAT) {
3225                 talloc_free(arp);
3226                 return;
3227         }
3228
3229         tevent_add_timer(arp->ctdb->ev, arp,
3230                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3231                          send_gratious_arp, arp);
3232 }
3233
3234
3235 /*
3236   send a gratious arp 
3237  */
3238 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3239 {
3240         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3241         struct control_gratious_arp *arp;
3242
3243         /* verify the size of indata */
3244         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3245                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3246                                  (unsigned)indata.dsize, 
3247                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3248                 return -1;
3249         }
3250         if (indata.dsize != 
3251                 ( offsetof(struct ctdb_addr_info_old, iface)
3252                 + gratious_arp->len ) ){
3253
3254                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3255                         "but should be %u bytes\n", 
3256                          (unsigned)indata.dsize, 
3257                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
3258                 return -1;
3259         }
3260
3261
3262         arp = talloc(ctdb, struct control_gratious_arp);
3263         CTDB_NO_MEMORY(ctdb, arp);
3264
3265         arp->ctdb  = ctdb;
3266         arp->addr   = gratious_arp->addr;
3267         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3268         CTDB_NO_MEMORY(ctdb, arp->iface);
3269         arp->count = 0;
3270
3271         tevent_add_timer(arp->ctdb->ev, arp,
3272                          timeval_zero(), send_gratious_arp, arp);
3273
3274         return 0;
3275 }
3276
3277 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3278 {
3279         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3280         int ret;
3281
3282         /* verify the size of indata */
3283         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3284                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3285                 return -1;
3286         }
3287         if (indata.dsize != 
3288                 ( offsetof(struct ctdb_addr_info_old, iface)
3289                 + pub->len ) ){
3290
3291                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3292                         "but should be %u bytes\n", 
3293                          (unsigned)indata.dsize, 
3294                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3295                 return -1;
3296         }
3297
3298         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
3299
3300         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3301
3302         if (ret != 0) {
3303                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3304                 return -1;
3305         }
3306
3307         return 0;
3308 }
3309
/* Private data carried from ctdb_control_del_public_address() to
   delete_ip_callback() */
struct delete_ip_callback_state {
        /* the delete request that is answered once the release completes */
        struct ctdb_req_control_old *c;
};
3313
3314 /*
3315   called when releaseip event finishes for del_public_address
3316  */
3317 static void delete_ip_callback(struct ctdb_context *ctdb,
3318                                int32_t status, TDB_DATA data,
3319                                const char *errormsg,
3320                                void *private_data)
3321 {
3322         struct delete_ip_callback_state *state =
3323                 talloc_get_type(private_data, struct delete_ip_callback_state);
3324
3325         /* If release failed then fail. */
3326         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
3327         talloc_free(private_data);
3328 }
3329
3330 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
3331                                         struct ctdb_req_control_old *c,
3332                                         TDB_DATA indata, bool *async_reply)
3333 {
3334         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3335         struct ctdb_vnn *vnn;
3336
3337         /* verify the size of indata */
3338         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3339                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3340                 return -1;
3341         }
3342         if (indata.dsize != 
3343                 ( offsetof(struct ctdb_addr_info_old, iface)
3344                 + pub->len ) ){
3345
3346                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3347                         "but should be %u bytes\n", 
3348                          (unsigned)indata.dsize, 
3349                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3350                 return -1;
3351         }
3352
3353         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
3354
3355         /* walk over all public addresses until we find a match */
3356         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3357                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3358                         if (vnn->pnn == ctdb->pnn) {
3359                                 struct delete_ip_callback_state *state;
3360                                 struct ctdb_public_ip *ip;
3361                                 TDB_DATA data;
3362                                 int ret;
3363
3364                                 vnn->delete_pending = true;
3365
3366                                 state = talloc(ctdb,
3367                                                struct delete_ip_callback_state);
3368                                 CTDB_NO_MEMORY(ctdb, state);
3369                                 state->c = c;
3370
3371                                 ip = talloc(state, struct ctdb_public_ip);
3372                                 if (ip == NULL) {
3373                                         DEBUG(DEBUG_ERR,
3374                                               (__location__ " Out of memory\n"));
3375                                         talloc_free(state);
3376                                         return -1;
3377                                 }
3378                                 ip->pnn = -1;
3379                                 ip->addr = pub->addr;
3380
3381                                 data.dsize = sizeof(struct ctdb_public_ip);
3382                                 data.dptr = (unsigned char *)ip;
3383
3384                                 ret = ctdb_daemon_send_control(ctdb,
3385                                                                ctdb_get_pnn(ctdb),
3386                                                                0,
3387                                                                CTDB_CONTROL_RELEASE_IP,
3388                                                                0, 0,
3389                                                                data,
3390                                                                delete_ip_callback,
3391                                                                state);
3392                                 if (ret == -1) {
3393                                         DEBUG(DEBUG_ERR,
3394                                               (__location__ "Unable to send "
3395                                                "CTDB_CONTROL_RELEASE_IP\n"));
3396                                         talloc_free(state);
3397                                         return -1;
3398                                 }
3399
3400                                 state->c = talloc_steal(state, c);
3401                                 *async_reply = true;
3402                         } else {
3403                                 /* This IP is not hosted on the
3404                                  * current node so just delete it
3405                                  * now. */
3406                                 do_delete_ip(ctdb, vnn);
3407                         }
3408
3409                         return 0;
3410                 }
3411         }
3412
3413         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
3414                          ctdb_addr_to_str(&pub->addr)));
3415         return -1;
3416 }
3417
3418
/* Private data carried from ctdb_control_ipreallocated() to
   ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
        /* the control request that is answered when the event has run */
        struct ctdb_req_control_old *c;
};
3422
3423 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3424                                         int status, void *p)
3425 {
3426         struct ipreallocated_callback_state *state =
3427                 talloc_get_type(p, struct ipreallocated_callback_state);
3428
3429         if (status != 0) {
3430                 DEBUG(DEBUG_ERR,
3431                       (" \"ipreallocated\" event script failed (status %d)\n",
3432                        status));
3433                 if (status == -ETIME) {
3434                         ctdb_ban_self(ctdb);
3435                 }
3436         }
3437
3438         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3439         talloc_free(state);
3440 }
3441
3442 /* A control to run the ipreallocated event */
3443 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3444                                    struct ctdb_req_control_old *c,
3445                                    bool *async_reply)
3446 {
3447         int ret;
3448         struct ipreallocated_callback_state *state;
3449
3450         state = talloc(ctdb, struct ipreallocated_callback_state);
3451         CTDB_NO_MEMORY(ctdb, state);
3452
3453         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3454
3455         ret = ctdb_event_script_callback(ctdb, state,
3456                                          ctdb_ipreallocated_callback, state,
3457                                          CTDB_EVENT_IPREALLOCATED,
3458                                          "%s", "");
3459
3460         if (ret != 0) {
3461                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3462                 talloc_free(state);
3463                 return -1;
3464         }
3465
3466         /* tell the control that we will be reply asynchronously */
3467         state->c    = talloc_steal(state, c);
3468         *async_reply = true;
3469
3470         return 0;
3471 }
3472
3473
3474 /* This function is called from the recovery daemon to verify that a remote
3475    node has the expected ip allocation.
3476    This is verified against ctdb->ip_tree
3477 */
3478 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
3479                                        struct ctdb_public_ip_list *ips,
3480                                        uint32_t pnn)
3481 {
3482         struct public_ip_list *tmp_ip;
3483         int i;
3484
3485         if (ctdb->ip_tree == NULL) {
3486                 /* don't know the expected allocation yet, assume remote node
3487                    is correct. */
3488                 return 0;
3489         }
3490
3491         if (ips == NULL) {
3492                 return 0;
3493         }
3494
3495         for (i=0; i<ips->num; i++) {
3496                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ip[i].addr));
3497                 if (tmp_ip == NULL) {
3498                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ip[i].addr)));
3499                         return -1;
3500                 }
3501
3502                 if (tmp_ip->pnn == -1 || ips->ip[i].pnn == -1) {
3503                         continue;
3504                 }
3505
3506                 if (tmp_ip->pnn != ips->ip[i].pnn) {
3507                         DEBUG(DEBUG_ERR,
3508                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
3509                                pnn,
3510                                ctdb_addr_to_str(&ips->ip[i].addr),
3511                                ips->ip[i].pnn, tmp_ip->pnn));
3512                         return -1;
3513                 }
3514         }
3515
3516         return 0;
3517 }
3518
3519 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3520 {
3521         struct public_ip_list *tmp_ip;
3522
3523         /* IP tree is never built if DisableIPFailover is set */
3524         if (ctdb->tunable.disable_ip_failover != 0) {
3525                 return 0;
3526         }
3527
3528         if (ctdb->ip_tree == NULL) {
3529                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3530                 return -1;
3531         }
3532
3533         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3534         if (tmp_ip == NULL) {
3535                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3536                 return -1;
3537         }
3538
3539         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3540         tmp_ip->pnn = ip->pnn;
3541
3542         return 0;
3543 }
3544
3545 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
3546 {
3547         TALLOC_FREE(ctdb->ip_tree);
3548 }
3549
/*
  State of one in-flight "reload public IPs" request: the forked child
  doing the work and the control request waiting for its result.
 */
struct ctdb_reloadips_handle {
        /* daemon context */
        struct ctdb_context *ctdb;

        /* request to answer when the handle is destroyed; NULL while no
           request has been attached */
        struct ctdb_req_control_old *c;

        /* status sent in the reply; starts out as -1 and is updated from
           the byte the child writes to the pipe */
        int status;

        /* pipe from child to parent: the parent reads fd[0], the child
           writes fd[1] */
        int fd[2];

        /* pid of the forked child */
        pid_t child;

        /* read event on fd[0] */
        struct tevent_fd *fde;
};
3558
3559 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3560 {
3561         if (h == h->ctdb->reload_ips) {
3562                 h->ctdb->reload_ips = NULL;
3563         }
3564         if (h->c != NULL) {
3565                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3566                 h->c = NULL;
3567         }
3568         ctdb_kill(h->ctdb, h->child, SIGKILL);
3569         return 0;
3570 }
3571
3572 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
3573                                          struct tevent_timer *te,
3574                                          struct timeval t, void *private_data)
3575 {
3576         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3577
3578         talloc_free(h);
3579 }
3580
3581 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
3582                                          struct tevent_fd *fde,
3583                                          uint16_t flags, void *private_data)
3584 {
3585         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3586
3587         char res;
3588         int ret;
3589
3590         ret = sys_read(h->fd[0], &res, 1);
3591         if (ret < 1 || res != 0) {
3592                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3593                 res = 1;
3594         }
3595         h->status = res;
3596
3597         talloc_free(h);
3598 }
3599
3600 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3601 {
3602         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3603         struct ctdb_public_ip_list_old *ips;
3604         struct ctdb_vnn *vnn;
3605         struct client_async_data *async_data;
3606         struct timeval timeout;
3607         TDB_DATA data;
3608         struct ctdb_client_control_state *state;
3609         bool first_add;
3610         int i, ret;
3611
3612         CTDB_NO_MEMORY(ctdb, mem_ctx);
3613
3614         /* Read IPs from local node */
3615         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
3616                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
3617         if (ret != 0) {
3618                 DEBUG(DEBUG_ERR,
3619                       ("Unable to fetch public IPs from local node\n"));
3620                 talloc_free(mem_ctx);
3621                 return -1;
3622         }
3623
3624         /* Read IPs file - this is safe since this is a child process */
3625         ctdb->vnn = NULL;
3626         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3627                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3628                 talloc_free(mem_ctx);
3629                 return -1;
3630         }
3631
3632         async_data = talloc_zero(mem_ctx, struct client_async_data);
3633         CTDB_NO_MEMORY(ctdb, async_data);
3634
3635         /* Compare IPs between node and file for IPs to be deleted */
3636         for (i = 0; i < ips->num; i++) {
3637                 /* */
3638                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3639                         if (ctdb_same_ip(&vnn->public_address,
3640                                          &ips->ips[i].addr)) {
3641                                 /* IP is still in file */
3642                                 break;
3643                         }
3644                 }
3645
3646                 if (vnn == NULL) {
3647                         /* Delete IP ips->ips[i] */
3648                         struct ctdb_addr_info_old *pub;
3649
3650                         DEBUG(DEBUG_NOTICE,
3651                               ("IP %s no longer configured, deleting it\n",
3652                                ctdb_addr_to_str(&ips->ips[i].addr)));
3653
3654                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
3655                         CTDB_NO_MEMORY(ctdb, pub);
3656
3657                         pub->addr  = ips->ips[i].addr;
3658                         pub->mask  = 0;
3659                         pub->len   = 0;
3660
3661                         timeout = TAKEOVER_TIMEOUT();
3662
3663                         data.dsize = offsetof(struct ctdb_addr_info_old,
3664                                               iface) + pub->len;
3665                         data.dptr = (uint8_t *)pub;
3666
3667                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3668                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
3669                                                   0, data, async_data,
3670                                                   &timeout, NULL);
3671                         if (state == NULL) {
3672                                 DEBUG(DEBUG_ERR,
3673                                       (__location__
3674                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
3675                                 goto failed;
3676                         }
3677
3678                         ctdb_client_async_add(async_data, state);
3679                 }
3680         }
3681
3682         /* Compare IPs between node and file for IPs to be added */
3683         first_add = true;
3684         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3685                 for (i = 0; i < ips->num; i++) {
3686                         if (ctdb_same_ip(&vnn->public_address,
3687                                          &ips->ips[i].addr)) {
3688                                 /* IP already on node */
3689                                 break;
3690                         }
3691                 }
3692                 if (i == ips->num) {
3693                         /* Add IP ips->ips[i] */
3694                         struct ctdb_addr_info_old *pub;
3695                         const char *ifaces = NULL;
3696                         uint32_t len;
3697                         int iface = 0;
3698
3699                         DEBUG(DEBUG_NOTICE,
3700                               ("New IP %s configured, adding it\n",
3701                                ctdb_addr_to_str(&vnn->public_address)));
3702                         if (first_add) {
3703                                 uint32_t pnn = ctdb_get_pnn(ctdb);
3704
3705                                 data.dsize = sizeof(pnn);
3706                                 data.dptr  = (uint8_t *)&pnn;
3707
3708                                 ret = ctdb_client_send_message(
3709                                         ctdb,
3710                                         CTDB_BROADCAST_CONNECTED,
3711                                         CTDB_SRVID_REBALANCE_NODE,
3712                                         data);
3713                                 if (ret != 0) {
3714                                         DEBUG(DEBUG_WARNING,
3715                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
3716                                 }
3717
3718                                 first_add = false;
3719                         }
3720
3721                         ifaces = vnn->ifaces[0];
3722                         iface = 1;
3723                         while (vnn->ifaces[iface] != NULL) {
3724                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
3725                                                          vnn->ifaces[iface]);
3726                                 iface++;
3727                         }
3728
3729                         len   = strlen(ifaces) + 1;
3730                         pub = talloc_zero_size(mem_ctx,
3731                                                offsetof(struct ctdb_addr_info_old, iface) + len);
3732                         CTDB_NO_MEMORY(ctdb, pub);
3733
3734                         pub->addr  = vnn->public_address;
3735                         pub->mask  = vnn->public_netmask_bits;
3736                         pub->len   = len;
3737                         memcpy(&pub->iface[0], ifaces, pub->len);
3738
3739                         timeout = TAKEOVER_TIMEOUT();
3740
3741                         data.dsize = offsetof(struct ctdb_addr_info_old,
3742                                               iface) + pub->len;
3743                         data.dptr = (uint8_t *)pub;
3744
3745                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3746                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
3747                                                   0, data, async_data,
3748                                                   &timeout, NULL);
3749                         if (state == NULL) {
3750                                 DEBUG(DEBUG_ERR,
3751                                       (__location__
3752                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
3753                                 goto failed;
3754                         }
3755
3756                         ctdb_client_async_add(async_data, state);
3757                 }
3758         }
3759
3760         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
3761                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
3762                 goto failed;
3763         }
3764
3765         talloc_free(mem_ctx);
3766         return 0;
3767
3768 failed:
3769         talloc_free(mem_ctx);
3770         return -1;
3771 }
3772
3773 /* This control is sent to force the node to re-read the public addresses file
3774    and drop any addresses we should nnot longer host, and add new addresses
3775    that we are now able to host
3776 */
3777 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
3778 {
3779         struct ctdb_reloadips_handle *h;
3780         pid_t parent = getpid();
3781
3782         if (ctdb->reload_ips != NULL) {
3783                 talloc_free(ctdb->reload_ips);
3784                 ctdb->reload_ips = NULL;
3785         }
3786
3787         h = talloc(ctdb, struct ctdb_reloadips_handle);
3788         CTDB_NO_MEMORY(ctdb, h);
3789         h->ctdb     = ctdb;
3790         h->c        = NULL;
3791         h->status   = -1;
3792         
3793         if (pipe(h->fd) == -1) {
3794                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3795                 talloc_free(h);
3796                 return -1;
3797         }
3798
3799         h->child = ctdb_fork(ctdb);
3800         if (h->child == (pid_t)-1) {
3801                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
3802                 close(h->fd[0]);
3803                 close(h->fd[1]);
3804                 talloc_free(h);
3805                 return -1;
3806         }
3807
3808         /* child process */
3809         if (h->child == 0) {
3810                 signed char res = 0;
3811
3812                 close(h->fd[0]);
3813                 debug_extra = talloc_asprintf(NULL, "reloadips:");
3814
3815                 prctl_set_comment("ctdb_reloadips");
3816                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
3817                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
3818                         res = -1;
3819                 } else {
3820                         res = ctdb_reloadips_child(ctdb);
3821                         if (res != 0) {
3822                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
3823                         }
3824                 }
3825
3826                 sys_write(h->fd[1], &res, 1);
3827                 ctdb_wait_for_process_to_exit(parent);
3828                 _exit(0);
3829         }
3830
3831         h->c             = talloc_steal(h, c);
3832
3833         close(h->fd[1]);
3834         set_close_on_exec(h->fd[0]);
3835
3836         talloc_set_destructor(h, ctdb_reloadips_destructor);
3837
3838
3839         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
3840                                ctdb_reloadips_child_handler, (void *)h);
3841         tevent_fd_set_auto_close(h->fde);
3842
3843         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
3844                          ctdb_reloadips_timeout_event, h);
3845
3846         /* we reply later */
3847         *async_reply = true;
3848         return 0;
3849 }