4114b40f83e0ccb76da575a39d3e1a1ed8d70761
[sahlberg/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tevent/tevent.h"
23 #include "lib/tdb/include/tdb.h"
24 #include "lib/util/dlinklist.h"
25 #include "system/network.h"
26 #include "system/filesys.h"
27 #include "system/wait.h"
28 #include "../include/ctdb_private.h"
29 #include "../common/rb_tree.h"
30
31
/* Timeout value for takeover operations, built by timeval_current_ofs()
   from the takeover_timeout tunable.  The expansion refers to a variable
   named "ctdb", which must be in scope wherever the macro is used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* first argument handed to timeval_current_ofs() when the gratuitous arp
   handler re-arms itself */
#define CTDB_ARP_INTERVAL 1
/* the gratuitous arp handler stops re-arming once it has run this often */
#define CTDB_ARP_REPEAT   3
36
/*
  one local network interface that public addresses can be hosted on;
  entries are kept on the ctdb->ifaces list
 */
struct ctdb_iface {
        /* list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name, used as the lookup key */
        const char *name;
        /* only interfaces with link_up set are picked for new addresses */
        bool link_up;
        /* number of public addresses currently assigned to this interface */
        uint32_t references;
};
43
44 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
45 {
46         if (vnn->iface) {
47                 return vnn->iface->name;
48         }
49
50         return "__none__";
51 }
52
53 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
54 {
55         struct ctdb_iface *i;
56
57         /* Verify that we dont have an entry for this ip yet */
58         for (i=ctdb->ifaces;i;i=i->next) {
59                 if (strcmp(i->name, iface) == 0) {
60                         return 0;
61                 }
62         }
63
64         /* create a new structure for this interface */
65         i = talloc_zero(ctdb, struct ctdb_iface);
66         CTDB_NO_MEMORY_FATAL(ctdb, i);
67         i->name = talloc_strdup(i, iface);
68         CTDB_NO_MEMORY(ctdb, i->name);
69         i->link_up = false;
70
71         DLIST_ADD(ctdb->ifaces, i);
72
73         return 0;
74 }
75
76 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
77                                           const char *iface)
78 {
79         struct ctdb_iface *i;
80
81         /* Verify that we dont have an entry for this ip yet */
82         for (i=ctdb->ifaces;i;i=i->next) {
83                 if (strcmp(i->name, iface) == 0) {
84                         return i;
85                 }
86         }
87
88         return NULL;
89 }
90
91 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
92                                               struct ctdb_vnn *vnn)
93 {
94         int i;
95         struct ctdb_iface *cur = NULL;
96         struct ctdb_iface *best = NULL;
97
98         for (i=0; vnn->ifaces[i]; i++) {
99
100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
101                 if (cur == NULL) {
102                         continue;
103                 }
104
105                 if (!cur->link_up) {
106                         continue;
107                 }
108
109                 if (best == NULL) {
110                         best = cur;
111                         continue;
112                 }
113
114                 if (cur->references < best->references) {
115                         best = cur;
116                         continue;
117                 }
118         }
119
120         return best;
121 }
122
123 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
124                                      struct ctdb_vnn *vnn)
125 {
126         struct ctdb_iface *best = NULL;
127
128         if (vnn->iface) {
129                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
130                                    "still assigned to iface '%s'\n",
131                                    ctdb_addr_to_str(&vnn->public_address),
132                                    ctdb_vnn_iface_string(vnn)));
133                 return 0;
134         }
135
136         best = ctdb_vnn_best_iface(ctdb, vnn);
137         if (best == NULL) {
138                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
139                                   "cannot assign to iface any iface\n",
140                                   ctdb_addr_to_str(&vnn->public_address)));
141                 return -1;
142         }
143
144         vnn->iface = best;
145         best->references++;
146         vnn->pnn = ctdb->pnn;
147
148         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
149                            "now assigned to iface '%s' refs[%d]\n",
150                            ctdb_addr_to_str(&vnn->public_address),
151                            ctdb_vnn_iface_string(vnn),
152                            best->references));
153         return 0;
154 }
155
156 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
157                                     struct ctdb_vnn *vnn)
158 {
159         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
160                            "now unassigned (old iface '%s' refs[%d])\n",
161                            ctdb_addr_to_str(&vnn->public_address),
162                            ctdb_vnn_iface_string(vnn),
163                            vnn->iface?vnn->iface->references:0));
164         if (vnn->iface) {
165                 vnn->iface->references--;
166         }
167         vnn->iface = NULL;
168         if (vnn->pnn == ctdb->pnn) {
169                 vnn->pnn = -1;
170         }
171 }
172
173 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
174                                struct ctdb_vnn *vnn)
175 {
176         int i;
177
178         if (vnn->iface && vnn->iface->link_up) {
179                 return true;
180         }
181
182         for (i=0; vnn->ifaces[i]; i++) {
183                 struct ctdb_iface *cur;
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (cur->link_up) {
191                         return true;
192                 }
193         }
194
195         return false;
196 }
197
198 struct ctdb_takeover_arp {
199         struct ctdb_context *ctdb;
200         uint32_t count;
201         ctdb_sock_addr addr;
202         struct ctdb_tcp_array *tcparray;
203         struct ctdb_vnn *vnn;
204 };
205
206
207 /*
208   lists of tcp endpoints
209  */
210 struct ctdb_tcp_list {
211         struct ctdb_tcp_list *prev, *next;
212         struct ctdb_tcp_connection connection;
213 };
214
215 /*
216   list of clients to kill on IP release
217  */
218 struct ctdb_client_ip {
219         struct ctdb_client_ip *prev, *next;
220         struct ctdb_context *ctdb;
221         ctdb_sock_addr addr;
222         uint32_t client_id;
223 };
224
225
226 /*
227   send a gratuitous arp
228  */
229 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
230                                   struct timeval t, void *private_data)
231 {
232         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
233                                                         struct ctdb_takeover_arp);
234         int i, ret;
235         struct ctdb_tcp_array *tcparray;
236         const char *iface = ctdb_vnn_iface_string(arp->vnn);
237
238         ret = ctdb_sys_send_arp(&arp->addr, iface);
239         if (ret != 0) {
240                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
241                                   iface, strerror(errno)));
242         }
243
244         tcparray = arp->tcparray;
245         if (tcparray) {
246                 for (i=0;i<tcparray->num;i++) {
247                         struct ctdb_tcp_connection *tcon;
248
249                         tcon = &tcparray->connections[i];
250                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
251                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
252                                 ctdb_addr_to_str(&tcon->src_addr),
253                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
254                         ret = ctdb_sys_send_tcp(
255                                 &tcon->src_addr, 
256                                 &tcon->dst_addr,
257                                 0, 0, 0);
258                         if (ret != 0) {
259                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
260                                         ctdb_addr_to_str(&tcon->src_addr)));
261                         }
262                 }
263         }
264
265         arp->count++;
266
267         if (arp->count == CTDB_ARP_REPEAT) {
268                 talloc_free(arp);
269                 return;
270         }
271
272         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
273                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
274                         ctdb_control_send_arp, arp);
275 }
276
277 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
278                                        struct ctdb_vnn *vnn)
279 {
280         struct ctdb_takeover_arp *arp;
281         struct ctdb_tcp_array *tcparray;
282
283         if (!vnn->takeover_ctx) {
284                 vnn->takeover_ctx = talloc_new(vnn);
285                 if (!vnn->takeover_ctx) {
286                         return -1;
287                 }
288         }
289
290         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
291         if (!arp) {
292                 return -1;
293         }
294
295         arp->ctdb = ctdb;
296         arp->addr = vnn->public_address;
297         arp->vnn  = vnn;
298
299         tcparray = vnn->tcp_array;
300         if (tcparray) {
301                 /* add all of the known tcp connections for this IP to the
302                    list of tcp connections to send tickle acks for */
303                 arp->tcparray = talloc_steal(arp, tcparray);
304
305                 vnn->tcp_array = NULL;
306                 vnn->tcp_update_needed = true;
307         }
308
309         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
310                         timeval_zero(), ctdb_control_send_arp, arp);
311
312         return 0;
313 }
314
315 struct takeover_callback_state {
316         struct ctdb_req_control *c;
317         ctdb_sock_addr *addr;
318         struct ctdb_vnn *vnn;
319 };
320
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
        /* the control request that gets answered from the callback */
        struct ctdb_req_control *c;
        /* the vnn being taken over */
        struct ctdb_vnn *vnn;
};
325
326 /*
327   called when takeip event finishes
328  */
329 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
330                                     void *private_data)
331 {
332         struct ctdb_do_takeip_state *state =
333                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
334         int32_t ret;
335         TDB_DATA data;
336
337         if (status != 0) {
338                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339         
340                 if (status == -ETIME) {
341                         ctdb_ban_self(ctdb);
342                 }
343                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
344                                  ctdb_addr_to_str(&state->vnn->public_address),
345                                  ctdb_vnn_iface_string(state->vnn)));
346                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347
348                 node->flags |= NODE_FLAGS_UNHEALTHY;
349                 talloc_free(state);
350                 return;
351         }
352
353         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
354         if (ret != 0) {
355                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
356                 talloc_free(state);
357                 return;
358         }
359
360         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
361         data.dsize = strlen((char *)data.dptr) + 1;
362         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
363
364         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
365
366
367         /* the control succeeded */
368         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
369         talloc_free(state);
370         return;
371 }
372
373 /*
374   take over an ip address
375  */
376 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
377                               struct ctdb_req_control *c,
378                               struct ctdb_vnn *vnn)
379 {
380         int ret;
381         struct ctdb_do_takeip_state *state;
382
383         ret = ctdb_vnn_assign_iface(ctdb, vnn);
384         if (ret != 0) {
385                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
386                                  "assin a usable interface\n",
387                                  ctdb_addr_to_str(&vnn->public_address),
388                                  vnn->public_netmask_bits));
389                 return -1;
390         }
391
392         state = talloc(vnn, struct ctdb_do_takeip_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(ctdb, c);
396         state->vnn   = vnn;
397
398         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
399                             ctdb_addr_to_str(&vnn->public_address),
400                             vnn->public_netmask_bits,
401                             ctdb_vnn_iface_string(vnn)));
402
403         ret = ctdb_event_script_callback(ctdb,
404                                          state,
405                                          ctdb_do_takeip_callback,
406                                          state,
407                                          false,
408                                          CTDB_EVENT_TAKE_IP,
409                                          "%s %s %u",
410                                          ctdb_vnn_iface_string(vnn),
411                                          ctdb_addr_to_str(&vnn->public_address),
412                                          vnn->public_netmask_bits);
413
414         if (ret != 0) {
415                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
416                         ctdb_addr_to_str(&vnn->public_address),
417                         ctdb_vnn_iface_string(vnn)));
418                 talloc_free(state);
419                 return -1;
420         }
421
422         return 0;
423 }
424
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
        /* the control request that gets answered from the callback */
        struct ctdb_req_control *c;
        /* the interface the address is being moved away from */
        struct ctdb_iface *old;
        /* the vnn being moved */
        struct ctdb_vnn *vnn;
};
430
431 /*
432   called when updateip event finishes
433  */
434 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
435                                       void *private_data)
436 {
437         struct ctdb_do_updateip_state *state =
438                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
439         int32_t ret;
440
441         if (status != 0) {
442                 if (status == -ETIME) {
443                         ctdb_ban_self(ctdb);
444                 }
445                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
446                         ctdb_addr_to_str(&state->vnn->public_address),
447                         state->old->name,
448                         ctdb_vnn_iface_string(state->vnn)));
449
450                 /*
451                  * All we can do is reset the old interface
452                  * and let the next run fix it
453                  */
454                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
455                 state->vnn->iface = state->old;
456                 state->vnn->iface->references++;
457
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459                 talloc_free(state);
460                 return;
461         }
462
463         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
464         if (ret != 0) {
465                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
466                 talloc_free(state);
467                 return;
468         }
469
470         /* the control succeeded */
471         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
472         talloc_free(state);
473         return;
474 }
475
476 /*
477   update (move) an ip address
478  */
479 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
480                                 struct ctdb_req_control *c,
481                                 struct ctdb_vnn *vnn)
482 {
483         int ret;
484         struct ctdb_do_updateip_state *state;
485         struct ctdb_iface *old = vnn->iface;
486         const char *new_name;
487
488         ctdb_vnn_unassign_iface(ctdb, vnn);
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
492                                  "assin a usable interface (old iface '%s')\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits,
495                                  old->name));
496                 return -1;
497         }
498
499         new_name = ctdb_vnn_iface_string(vnn);
500         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
501                 /* A benign update from one interface onto itself.
502                  * no need to run the eventscripts in this case, just return
503                  * success.
504                  */
505                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
506                 return 0;
507         }
508
509         state = talloc(vnn, struct ctdb_do_updateip_state);
510         CTDB_NO_MEMORY(ctdb, state);
511
512         state->c = talloc_steal(ctdb, c);
513         state->old = old;
514         state->vnn = vnn;
515
516         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
517                             "interface %s to %s\n",
518                             ctdb_addr_to_str(&vnn->public_address),
519                             vnn->public_netmask_bits,
520                             old->name,
521                             new_name));
522
523         ret = ctdb_event_script_callback(ctdb,
524                                          state,
525                                          ctdb_do_updateip_callback,
526                                          state,
527                                          false,
528                                          CTDB_EVENT_UPDATE_IP,
529                                          "%s %s %s %u",
530                                          state->old->name,
531                                          new_name,
532                                          ctdb_addr_to_str(&vnn->public_address),
533                                          vnn->public_netmask_bits);
534         if (ret != 0) {
535                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
536                                  ctdb_addr_to_str(&vnn->public_address),
537                                  old->name, new_name));
538                 talloc_free(state);
539                 return -1;
540         }
541
542         return 0;
543 }
544
545 /*
546   Find the vnn of the node that has a public ip address
547   returns -1 if the address is not known as a public address
548  */
549 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
550 {
551         struct ctdb_vnn *vnn;
552
553         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
554                 if (ctdb_same_ip(&vnn->public_address, addr)) {
555                         return vnn;
556                 }
557         }
558
559         return NULL;
560 }
561
562 /*
563   take over an ip address
564  */
565 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
566                                  struct ctdb_req_control *c,
567                                  TDB_DATA indata,
568                                  bool *async_reply)
569 {
570         int ret;
571         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
572         struct ctdb_vnn *vnn;
573         bool have_ip = false;
574         bool do_updateip = false;
575         bool do_takeip = false;
576         struct ctdb_iface *best_iface = NULL;
577
578         if (pip->pnn != ctdb->pnn) {
579                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
580                                  "with pnn %d, but we're node %d\n",
581                                  ctdb_addr_to_str(&pip->addr),
582                                  pip->pnn, ctdb->pnn));
583                 return -1;
584         }
585
586         /* update out vnn list */
587         vnn = find_public_ip_vnn(ctdb, &pip->addr);
588         if (vnn == NULL) {
589                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
590                         ctdb_addr_to_str(&pip->addr)));
591                 return 0;
592         }
593
594         have_ip = ctdb_sys_have_ip(&pip->addr);
595         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
596         if (best_iface == NULL) {
597                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
598                                  "a usable interface (old %s, have_ip %d)\n",
599                                  ctdb_addr_to_str(&vnn->public_address),
600                                  vnn->public_netmask_bits,
601                                  ctdb_vnn_iface_string(vnn),
602                                  have_ip));
603                 return -1;
604         }
605
606         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
607                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
608                 have_ip = false;
609         }
610
611         if (vnn->iface == NULL && have_ip) {
612                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
613                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
614                                  ctdb_addr_to_str(&vnn->public_address)));
615                 return 0;
616         }
617
618         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
619                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
620                                   "and we have it on iface[%s], but it was assigned to node %d"
621                                   "and we are node %d, banning ourself\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
624                 ctdb_ban_self(ctdb);
625                 return -1;
626         }
627
628         if (vnn->pnn == -1 && have_ip) {
629                 vnn->pnn = ctdb->pnn;
630                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
631                                   "and we already have it on iface[%s], update local daemon\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                   ctdb_vnn_iface_string(vnn)));
634                 return 0;
635         }
636
637         if (vnn->iface) {
638                 if (vnn->iface->link_up) {
639                         /* only move when the rebalance gains something */
640                         if (vnn->iface->references > (best_iface->references + 1)) {
641                                 do_updateip = true;
642                         }
643                 } else if (vnn->iface != best_iface) {
644                         do_updateip = true;
645                 }
646         }
647
648         if (!have_ip) {
649                 if (do_updateip) {
650                         ctdb_vnn_unassign_iface(ctdb, vnn);
651                         do_updateip = false;
652                 }
653                 do_takeip = true;
654         }
655
656         if (do_takeip) {
657                 ret = ctdb_do_takeip(ctdb, c, vnn);
658                 if (ret != 0) {
659                         return -1;
660                 }
661         } else if (do_updateip) {
662                 ret = ctdb_do_updateip(ctdb, c, vnn);
663                 if (ret != 0) {
664                         return -1;
665                 }
666         } else {
667                 /*
668                  * The interface is up and the kernel known the ip
669                  * => do nothing
670                  */
671                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
672                         ctdb_addr_to_str(&pip->addr),
673                         vnn->public_netmask_bits,
674                         ctdb_vnn_iface_string(vnn)));
675                 return 0;
676         }
677
678         /* tell ctdb_control.c that we will be replying asynchronously */
679         *async_reply = true;
680
681         return 0;
682 }
683
684 /*
685   takeover an ip address old v4 style
686  */
687 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
688                                 struct ctdb_req_control *c,
689                                 TDB_DATA indata, 
690                                 bool *async_reply)
691 {
692         TDB_DATA data;
693         
694         data.dsize = sizeof(struct ctdb_public_ip);
695         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
696         CTDB_NO_MEMORY(ctdb, data.dptr);
697         
698         memcpy(data.dptr, indata.dptr, indata.dsize);
699         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
700 }
701
702 /*
703   kill any clients that are registered with a IP that is being released
704  */
705 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
706 {
707         struct ctdb_client_ip *ip;
708
709         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
710                 ctdb_addr_to_str(addr)));
711
712         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
713                 ctdb_sock_addr tmp_addr;
714
715                 tmp_addr = ip->addr;
716                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
717                         ip->client_id,
718                         ctdb_addr_to_str(&ip->addr)));
719
720                 if (ctdb_same_ip(&tmp_addr, addr)) {
721                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
722                                                                      ip->client_id, 
723                                                                      struct ctdb_client);
724                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
725                                 ip->client_id,
726                                 ctdb_addr_to_str(&ip->addr),
727                                 client->pid));
728
729                         if (client->pid != 0) {
730                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
731                                         (unsigned)client->pid,
732                                         ctdb_addr_to_str(addr),
733                                         ip->client_id));
734                                 kill(client->pid, SIGKILL);
735                         }
736                 }
737         }
738 }
739
740 /*
741   called when releaseip event finishes
742  */
743 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
744                                 void *private_data)
745 {
746         struct takeover_callback_state *state = 
747                 talloc_get_type(private_data, struct takeover_callback_state);
748         TDB_DATA data;
749
750         if (status == -ETIME) {
751                 ctdb_ban_self(ctdb);
752         }
753
754         /* send a message to all clients of this node telling them
755            that the cluster has been reconfigured and they should
756            release any sockets on this IP */
757         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
758         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
759         data.dsize = strlen((char *)data.dptr)+1;
760
761         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
762
763         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
764
765         /* kill clients that have registered with this IP */
766         release_kill_clients(ctdb, state->addr);
767
768         ctdb_vnn_unassign_iface(ctdb, state->vnn);
769
770         /* the control succeeded */
771         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
772         talloc_free(state);
773 }
774
775 /*
776   release an ip address
777  */
778 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
779                                 struct ctdb_req_control *c,
780                                 TDB_DATA indata, 
781                                 bool *async_reply)
782 {
783         int ret;
784         struct takeover_callback_state *state;
785         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
786         struct ctdb_vnn *vnn;
787
788         /* update our vnn list */
789         vnn = find_public_ip_vnn(ctdb, &pip->addr);
790         if (vnn == NULL) {
791                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
792                         ctdb_addr_to_str(&pip->addr)));
793                 return 0;
794         }
795         vnn->pnn = pip->pnn;
796
797         /* stop any previous arps */
798         talloc_free(vnn->takeover_ctx);
799         vnn->takeover_ctx = NULL;
800
801         if (!ctdb_sys_have_ip(&pip->addr)) {
802                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits, 
805                         ctdb_vnn_iface_string(vnn)));
806                 ctdb_vnn_unassign_iface(ctdb, vnn);
807                 return 0;
808         }
809
810         if (vnn->iface == NULL) {
811                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
812                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
813                                  ctdb_addr_to_str(&vnn->public_address)));
814                 return 0;
815         }
816
817         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
818                 ctdb_addr_to_str(&pip->addr),
819                 vnn->public_netmask_bits, 
820                 ctdb_vnn_iface_string(vnn),
821                 pip->pnn));
822
823         state = talloc(ctdb, struct takeover_callback_state);
824         CTDB_NO_MEMORY(ctdb, state);
825
826         state->c = talloc_steal(state, c);
827         state->addr = talloc(state, ctdb_sock_addr);       
828         CTDB_NO_MEMORY(ctdb, state->addr);
829         *state->addr = pip->addr;
830         state->vnn   = vnn;
831
832         ret = ctdb_event_script_callback(ctdb, 
833                                          state, release_ip_callback, state,
834                                          false,
835                                          CTDB_EVENT_RELEASE_IP,
836                                          "%s %s %u",
837                                          ctdb_vnn_iface_string(vnn),
838                                          ctdb_addr_to_str(&pip->addr),
839                                          vnn->public_netmask_bits);
840         if (ret != 0) {
841                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
842                         ctdb_addr_to_str(&pip->addr),
843                         ctdb_vnn_iface_string(vnn)));
844                 talloc_free(state);
845                 return -1;
846         }
847
848         /* tell the control that we will be reply asynchronously */
849         *async_reply = true;
850         return 0;
851 }
852
853 /*
854   release an ip address old v4 style
855  */
856 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
857                                 struct ctdb_req_control *c,
858                                 TDB_DATA indata, 
859                                 bool *async_reply)
860 {
861         TDB_DATA data;
862         
863         data.dsize = sizeof(struct ctdb_public_ip);
864         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
865         CTDB_NO_MEMORY(ctdb, data.dptr);
866         
867         memcpy(data.dptr, indata.dptr, indata.dsize);
868         return ctdb_control_release_ip(ctdb, c, data, async_reply);
869 }
870
871
872 static int ctdb_add_public_address(struct ctdb_context *ctdb,
873                                    ctdb_sock_addr *addr,
874                                    unsigned mask, const char *ifaces)
875 {
876         struct ctdb_vnn      *vnn;
877         uint32_t num = 0;
878         char *tmp;
879         const char *iface;
880         int i;
881         int ret;
882
883         tmp = talloc_strdup(vnn, ifaces);
884         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
885                 if (!ctdb_sys_check_iface_exists(iface)) {
886                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
887                         talloc_free(tmp);
888                         return -1;
889                 }
890         }
891         talloc_free(tmp);
892
893         /* Verify that we dont have an entry for this ip yet */
894         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
895                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
896                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
897                                 ctdb_addr_to_str(addr)));
898                         return -1;
899                 }               
900         }
901
902         /* create a new vnn structure for this ip address */
903         vnn = talloc_zero(ctdb, struct ctdb_vnn);
904         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
905         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
906         tmp = talloc_strdup(vnn, ifaces);
907         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
908         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
909                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
910                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
911                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
912                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
913                 num++;
914         }
915         talloc_free(tmp);
916         vnn->ifaces[num] = NULL;
917         vnn->public_address      = *addr;
918         vnn->public_netmask_bits = mask;
919         vnn->pnn                 = -1;
920         if (ctdb_sys_have_ip(addr)) {
921                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
922                 vnn->pnn = ctdb->pnn;
923         }
924
925         for (i=0; vnn->ifaces[i]; i++) {
926                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
927                 if (ret != 0) {
928                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
929                                            "for public_address[%s]\n",
930                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
931                         talloc_free(vnn);
932                         return -1;
933                 }
934                 if (i == 0) {
935                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
936                 }
937         }
938
939         DLIST_ADD(ctdb->vnn, vnn);
940
941         return 0;
942 }
943
944 /*
945   setup the event script directory
946 */
947 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
948 {
949         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
950         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
951         return 0;
952 }
953
954 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
955                                   struct timeval t, void *private_data)
956 {
957         struct ctdb_context *ctdb = talloc_get_type(private_data, 
958                                                         struct ctdb_context);
959         struct ctdb_vnn *vnn;
960
961         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
962                 int i;
963
964                 for (i=0; vnn->ifaces[i] != NULL; i++) {
965                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
966                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
967                                         vnn->ifaces[i],
968                                         ctdb_addr_to_str(&vnn->public_address)));
969                         }
970                 }
971         }
972
973         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
974                 timeval_current_ofs(30, 0), 
975                 ctdb_check_interfaces_event, ctdb);
976 }
977
978
979 static int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
980 {
981         if (ctdb->check_public_ifaces_ctx != NULL) {
982                 talloc_free(ctdb->check_public_ifaces_ctx);
983                 ctdb->check_public_ifaces_ctx = NULL;
984         }
985
986         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
987         if (ctdb->check_public_ifaces_ctx == NULL) {
988                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
989         }
990
991         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
992                 timeval_current_ofs(30, 0), 
993                 ctdb_check_interfaces_event, ctdb);
994
995         return 0;
996 }
997
998
999 /*
1000   setup the public address lists from a file
1001 */
1002 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
1003 {
1004         char **lines;
1005         int nlines;
1006         int i;
1007
1008         lines = file_lines_load(alist, &nlines, ctdb);
1009         if (lines == NULL) {
1010                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
1011                 return -1;
1012         }
1013         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1014                 nlines--;
1015         }
1016
1017         for (i=0;i<nlines;i++) {
1018                 unsigned mask;
1019                 ctdb_sock_addr addr;
1020                 const char *addrstr;
1021                 const char *ifaces;
1022                 char *tok, *line;
1023
1024                 line = lines[i];
1025                 while ((*line == ' ') || (*line == '\t')) {
1026                         line++;
1027                 }
1028                 if (*line == '#') {
1029                         continue;
1030                 }
1031                 if (strcmp(line, "") == 0) {
1032                         continue;
1033                 }
1034                 tok = strtok(line, " \t");
1035                 addrstr = tok;
1036                 tok = strtok(NULL, " \t");
1037                 if (tok == NULL) {
1038                         if (NULL == ctdb->default_public_interface) {
1039                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1040                                          i+1));
1041                                 talloc_free(lines);
1042                                 return -1;
1043                         }
1044                         ifaces = ctdb->default_public_interface;
1045                 } else {
1046                         ifaces = tok;
1047                 }
1048
1049                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1050                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1051                         talloc_free(lines);
1052                         return -1;
1053                 }
1054                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
1055                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1056                         talloc_free(lines);
1057                         return -1;
1058                 }
1059         }
1060
1061
1062         ctdb_start_monitoring_interfaces(ctdb);
1063
1064         talloc_free(lines);
1065         return 0;
1066 }
1067
1068 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1069                               const char *iface,
1070                               const char *ip)
1071 {
1072         struct ctdb_vnn *svnn;
1073         struct ctdb_iface *cur = NULL;
1074         bool ok;
1075         int ret;
1076
1077         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1078         CTDB_NO_MEMORY(ctdb, svnn);
1079
1080         svnn->ifaces = talloc_array(svnn, const char *, 2);
1081         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1082         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1083         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1084         svnn->ifaces[1] = NULL;
1085
1086         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1087         if (!ok) {
1088                 talloc_free(svnn);
1089                 return -1;
1090         }
1091
1092         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1093         if (ret != 0) {
1094                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1095                                    "for single_ip[%s]\n",
1096                                    svnn->ifaces[0],
1097                                    ctdb_addr_to_str(&svnn->public_address)));
1098                 talloc_free(svnn);
1099                 return -1;
1100         }
1101
1102         /* assume the single public ip interface is initially "good" */
1103         cur = ctdb_find_iface(ctdb, iface);
1104         if (cur == NULL) {
1105                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1106                 return -1;
1107         }
1108         cur->link_up = true;
1109
1110         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1111         if (ret != 0) {
1112                 talloc_free(svnn);
1113                 return -1;
1114         }
1115
1116         ctdb->single_ip_vnn = svnn;
1117         return 0;
1118 }
1119
1120 /* Given a physical node, return the number of
1121    public addresses that is currently assigned to this node.
1122 */
1123 static int node_ip_coverage(struct ctdb_context *ctdb, 
1124         int32_t pnn,
1125         struct ctdb_public_ip_list *ips)
1126 {
1127         int num=0;
1128
1129         for (;ips;ips=ips->next) {
1130                 if (ips->pnn == pnn) {
1131                         num++;
1132                 }
1133         }
1134         return num;
1135 }
1136
1137
1138 /* Check if this is a public ip known to the node, i.e. can that
1139    node takeover this ip ?
1140 */
1141 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1142                 struct ctdb_public_ip_list *ip)
1143 {
1144         struct ctdb_all_public_ips *public_ips;
1145         int i;
1146
1147         public_ips = ctdb->nodes[pnn]->available_public_ips;
1148
1149         if (public_ips == NULL) {
1150                 return -1;
1151         }
1152
1153         for (i=0;i<public_ips->num;i++) {
1154                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1155                         /* yes, this node can serve this public ip */
1156                         return 0;
1157                 }
1158         }
1159
1160         return -1;
1161 }
1162
1163
1164 /* search the node lists list for a node to takeover this ip.
1165    pick the node that currently are serving the least number of ips
1166    so that the ips get spread out evenly.
1167 */
1168 static int find_takeover_node(struct ctdb_context *ctdb, 
1169                 struct ctdb_node_map *nodemap, uint32_t mask, 
1170                 struct ctdb_public_ip_list *ip,
1171                 struct ctdb_public_ip_list *all_ips)
1172 {
1173         int pnn, min=0, num;
1174         int i;
1175
1176         pnn    = -1;
1177         for (i=0;i<nodemap->num;i++) {
1178                 if (nodemap->nodes[i].flags & mask) {
1179                         /* This node is not healty and can not be used to serve
1180                            a public address 
1181                         */
1182                         continue;
1183                 }
1184
1185                 /* verify that this node can serve this ip */
1186                 if (can_node_serve_ip(ctdb, i, ip)) {
1187                         /* no it couldnt   so skip to the next node */
1188                         continue;
1189                 }
1190
1191                 num = node_ip_coverage(ctdb, i, all_ips);
1192                 /* was this the first node we checked ? */
1193                 if (pnn == -1) {
1194                         pnn = i;
1195                         min  = num;
1196                 } else {
1197                         if (num < min) {
1198                                 pnn = i;
1199                                 min  = num;
1200                         }
1201                 }
1202         }       
1203         if (pnn == -1) {
1204                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1205                         ctdb_addr_to_str(&ip->addr)));
1206
1207                 return -1;
1208         }
1209
1210         ip->pnn = pnn;
1211         return 0;
1212 }
1213
1214 #define IP_KEYLEN       4
1215 static uint32_t *ip_key(ctdb_sock_addr *ip)
1216 {
1217         static uint32_t key[IP_KEYLEN];
1218
1219         bzero(key, sizeof(key));
1220
1221         switch (ip->sa.sa_family) {
1222         case AF_INET:
1223                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1224                 break;
1225         case AF_INET6:
1226                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1227                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1228                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1229                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1230                 break;
1231         default:
1232                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1233                 return key;
1234         }
1235
1236         return key;
1237 }
1238
1239 static void *add_ip_callback(void *parm, void *data)
1240 {
1241         struct ctdb_public_ip_list *this_ip = parm; 
1242         struct ctdb_public_ip_list *prev_ip = data; 
1243
1244         if (prev_ip == NULL) {
1245                 return parm;
1246         }
1247         if (this_ip->pnn == -1) {
1248                 this_ip->pnn = prev_ip->pnn;
1249         }
1250
1251         return parm;
1252 }
1253
1254 void getips_count_callback(void *param, void *data)
1255 {
1256         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1257         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1258
1259         new_ip->next = *ip_list;
1260         *ip_list     = new_ip;
1261 }
1262
1263 static struct ctdb_public_ip_list *
1264 create_merged_ip_list(struct ctdb_context *ctdb)
1265 {
1266         int i, j;
1267         struct ctdb_public_ip_list *ip_list;
1268         struct ctdb_all_public_ips *public_ips;
1269
1270         if (ctdb->ip_tree != NULL) {
1271                 talloc_free(ctdb->ip_tree);
1272                 ctdb->ip_tree = NULL;
1273         }
1274         ctdb->ip_tree = trbt_create(ctdb, 0);
1275
1276         for (i=0;i<ctdb->num_nodes;i++) {
1277                 public_ips = ctdb->nodes[i]->known_public_ips;
1278
1279                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1280                         continue;
1281                 }
1282
1283                 /* there were no public ips for this node */
1284                 if (public_ips == NULL) {
1285                         continue;
1286                 }               
1287
1288                 for (j=0;j<public_ips->num;j++) {
1289                         struct ctdb_public_ip_list *tmp_ip; 
1290
1291                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1292                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1293                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1294                         tmp_ip->addr = public_ips->ips[j].addr;
1295                         tmp_ip->next = NULL;
1296
1297                         trbt_insertarray32_callback(ctdb->ip_tree,
1298                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1299                                 add_ip_callback,
1300                                 tmp_ip);
1301                 }
1302         }
1303
1304         ip_list = NULL;
1305         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1306
1307         return ip_list;
1308 }
1309
1310 /* 
1311  * This is the length of the longtest common prefix between the IPs.
1312  * It is calculated by XOR-ing the 2 IPs together and counting the
1313  * number of leading zeroes.  The implementation means that all
1314  * addresses end up being 128 bits long.
1315  * Not static, so we can easily link it into a unit test.
1316  *
1317  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1318  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1319  * lots of nodes and IP addresses?
1320  */
1321 uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1322 {
1323         uint32_t ip1_k[IP_KEYLEN];
1324         uint32_t *t;
1325         int i;
1326         uint32_t x;
1327
1328         uint32_t distance = 0;
1329
1330         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1331         t = ip_key(ip2);
1332         for (i=0; i<IP_KEYLEN; i++) {
1333                 x = ip1_k[i] ^ t[i];
1334                 if (x == 0) {
1335                         distance += 32;
1336                 } else {
1337                         /* Count number of leading zeroes. 
1338                          * FIXME? This could be optimised...
1339                          */
1340                         while ((x & (1 << 31)) == 0) {
1341                                 x <<= 1;
1342                                 distance += 1;
1343                         }
1344                 }
1345         }
1346
1347         return distance;
1348 }
1349
1350 /* Calculate the IP distance for the given IP relative to IPs on the
1351    given node.  The ips argument is generally the all_ips variable
1352    used in the main part of the algorithm.
1353  * Not static, so we can easily link it into a unit test.
1354  */
1355 uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1356                            struct ctdb_public_ip_list *ips,
1357                            int pnn)
1358 {
1359         struct ctdb_public_ip_list *t;
1360         uint32_t d;
1361
1362         uint32_t sum = 0;
1363
1364         for (t=ips; t != NULL; t=t->next) {
1365                 if (t->pnn != pnn) {
1366                         continue;
1367                 }
1368
1369                 /* Optimisation: We never calculate the distance
1370                  * between an address and itself.  This allows us to
1371                  * calculate the effect of removing an address from a
1372                  * node by simply calculating the distance between
1373                  * that address and all of the exitsing addresses.
1374                  * Moreover, we assume that we're only ever dealing
1375                  * with addresses from all_ips so we can identify an
1376                  * address via a pointer rather than doing a more
1377                  * expensive address comparison. */
1378                 if (&(t->addr) == ip) {
1379                         continue;
1380                 }
1381
1382                 d = ip_distance(ip, &(t->addr));
1383                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1384         }
1385
1386         return sum;
1387 }
1388
1389 /* Return the LCP2 imbalance metric for addresses currently assigned
1390    to the given node.
1391  * Not static, so we can easily link it into a unit test.
1392  */
1393 uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1394 {
1395         struct ctdb_public_ip_list *t;
1396
1397         uint32_t imbalance = 0;
1398
1399         for (t=all_ips; t!=NULL; t=t->next) {
1400                 if (t->pnn != pnn) {
1401                         continue;
1402                 }
1403                 /* Pass the rest of the IPs rather than the whole
1404                    all_ips input list.
1405                 */
1406                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1407         }
1408
1409         return imbalance;
1410 }
1411
1412 /* Allocate any unassigned IPs just by looping through the IPs and
1413  * finding the best node for each.
1414  * Not static, so we can easily link it into a unit test.
1415  */
1416 void basic_allocate_unassigned(struct ctdb_context *ctdb,
1417                                struct ctdb_node_map *nodemap,
1418                                uint32_t mask,
1419                                struct ctdb_public_ip_list *all_ips)
1420 {
1421         struct ctdb_public_ip_list *tmp_ip;
1422
1423         /* loop over all ip's and find a physical node to cover for 
1424            each unassigned ip.
1425         */
1426         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1427                 if (tmp_ip->pnn == -1) {
1428                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1429                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1430                                         ctdb_addr_to_str(&tmp_ip->addr)));
1431                         }
1432                 }
1433         }
1434 }
1435
1436 /* Basic non-deterministic rebalancing algorithm.
1437  * Not static, so we can easily link it into a unit test.
1438  */
1439 bool basic_failback(struct ctdb_context *ctdb,
1440                     struct ctdb_node_map *nodemap,
1441                     uint32_t mask,
1442                     struct ctdb_public_ip_list *all_ips,
1443                     int num_ips,
1444                     int *retries)
1445 {
1446         int i;
1447         int maxnode, maxnum=0, minnode, minnum=0, num;
1448         struct ctdb_public_ip_list *tmp_ip;
1449
1450         /* for each ip address, loop over all nodes that can serve
1451            this ip and make sure that the difference between the node
1452            serving the most and the node serving the least ip's are
1453            not greater than 1.
1454         */
1455         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1456                 if (tmp_ip->pnn == -1) {
1457                         continue;
1458                 }
1459
1460                 /* Get the highest and lowest number of ips's served by any 
1461                    valid node which can serve this ip.
1462                 */
1463                 maxnode = -1;
1464                 minnode = -1;
1465                 for (i=0;i<nodemap->num;i++) {
1466                         if (nodemap->nodes[i].flags & mask) {
1467                                 continue;
1468                         }
1469
1470                         /* only check nodes that can actually serve this ip */
1471                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1472                                 /* no it couldnt   so skip to the next node */
1473                                 continue;
1474                         }
1475
1476                         num = node_ip_coverage(ctdb, i, all_ips);
1477                         if (maxnode == -1) {
1478                                 maxnode = i;
1479                                 maxnum  = num;
1480                         } else {
1481                                 if (num > maxnum) {
1482                                         maxnode = i;
1483                                         maxnum  = num;
1484                                 }
1485                         }
1486                         if (minnode == -1) {
1487                                 minnode = i;
1488                                 minnum  = num;
1489                         } else {
1490                                 if (num < minnum) {
1491                                         minnode = i;
1492                                         minnum  = num;
1493                                 }
1494                         }
1495                 }
1496                 if (maxnode == -1) {
1497                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1498                                 ctdb_addr_to_str(&tmp_ip->addr)));
1499
1500                         continue;
1501                 }
1502
1503                 /* If we want deterministic IPs then dont try to reallocate 
1504                    them to spread out the load.
1505                 */
1506                 if (1 == ctdb->tunable.deterministic_public_ips) {
1507                         continue;
1508                 }
1509
1510                 /* if the spread between the smallest and largest coverage by
1511                    a node is >=2 we steal one of the ips from the node with
1512                    most coverage to even things out a bit.
1513                    try to do this a limited number of times since we dont
1514                    want to spend too much time balancing the ip coverage.
1515                 */
1516                 if ( (maxnum > minnum+1)
1517                      && (*retries < (num_ips + 5)) ){
1518                         struct ctdb_public_ip_list *tmp;
1519
1520                         /* mark one of maxnode's vnn's as unassigned and try
1521                            again
1522                         */
1523                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1524                                 if (tmp->pnn == maxnode) {
1525                                         tmp->pnn = -1;
1526                                         (*retries)++;
1527                                         return true;
1528                                 }
1529                         }
1530                 }
1531         }
1532
1533         return false;
1534 }
1535
1536 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1537  * that we can unit test it.
1538  * Not static, so we can easily link it into a unit test.
1539  */
1540 void lcp2_init(struct ctdb_context * tmp_ctx,
1541                struct ctdb_node_map * nodemap,
1542                uint32_t mask,
1543                struct ctdb_public_ip_list *all_ips,
1544                uint32_t **lcp2_imbalances,
1545                bool **newly_healthy)
1546 {
1547         int i;
1548         struct ctdb_public_ip_list *tmp_ip;
1549
1550         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1551         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1552         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1553         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1554
1555         for (i=0;i<nodemap->num;i++) {
1556                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1557                 /* First step: is the node "healthy"? */
1558                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1559         }
1560
1561         /* 2nd step: if a ndoe has IPs assigned then it must have been
1562          * healthy before, so we remove it from consideration... */
1563         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1564                 if (tmp_ip->pnn != -1) {
1565                         (*newly_healthy)[tmp_ip->pnn] = false;
1566                 }
1567         }
1568 }
1569
1570 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1571  * the IP/node combination that will cost the least.
1572  * Not static, so we can easily link it into a unit test.
1573  */
1574 void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1575                               struct ctdb_node_map *nodemap,
1576                               uint32_t mask,
1577                               struct ctdb_public_ip_list *all_ips,
1578                               uint32_t *lcp2_imbalances)
1579 {
1580         struct ctdb_public_ip_list *tmp_ip;
1581         int dstnode;
1582
1583         int minnode;
1584         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1585         struct ctdb_public_ip_list *minip;
1586
1587         bool should_loop = true;
1588         bool have_unassigned = true;
1589
1590         while (have_unassigned && should_loop) {
1591                 should_loop = false;
1592
1593                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1594                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1595
1596                 minnode = -1;
1597                 mindsum = 0;
1598                 minip = NULL;
1599
1600                 /* loop over each unassigned ip. */
1601                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1602                         if (tmp_ip->pnn != -1) {
1603                                 continue;
1604                         }
1605
1606                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1607                                 /* only check nodes that can actually serve this ip */
1608                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1609                                         /* no it couldnt   so skip to the next node */
1610                                         continue;
1611                                 }
1612                                 if (nodemap->nodes[dstnode].flags & mask) {
1613                                         continue;
1614                                 }
1615
1616                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1617                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1618                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1619                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1620                                                    dstnode,
1621                                                    dstimbl - lcp2_imbalances[dstnode]));
1622
1623
1624                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1625                                         minnode = dstnode;
1626                                         minimbl = dstimbl;
1627                                         mindsum = dstdsum;
1628                                         minip = tmp_ip;
1629                                         should_loop = true;
1630                                 }
1631                         }
1632                 }
1633
1634                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1635
1636                 /* If we found one then assign it to the given node. */
1637                 if (minnode != -1) {
1638                         minip->pnn = minnode;
1639                         lcp2_imbalances[minnode] = minimbl;
1640                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1641                                           ctdb_addr_to_str(&(minip->addr)),
1642                                           minnode,
1643                                           mindsum));
1644                 }
1645
1646                 /* There might be a better way but at least this is clear. */
1647                 have_unassigned = false;
1648                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1649                         if (tmp_ip->pnn == -1) {
1650                                 have_unassigned = true;
1651                         }
1652                 }
1653         }
1654
1655         /* We know if we have an unassigned addresses so we might as
1656          * well optimise.
1657          */
1658         if (have_unassigned) {
1659                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1660                         if (tmp_ip->pnn == -1) {
1661                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1662                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1663                         }
1664                 }
1665         }
1666 }
1667
1668 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1669  * node with the highest LCP2 imbalance, and then determines the best
1670  * IP/destination node combination to move from the source node.
1671  *
1672  * Not static, so we can easily link it into a unit test.
1673  */
1674 bool lcp2_failback(struct ctdb_context *ctdb,
1675                    struct ctdb_node_map *nodemap,
1676                    uint32_t mask,
1677                    struct ctdb_public_ip_list *all_ips,
1678                    uint32_t *lcp2_imbalances,
1679                    bool *newly_healthy)
1680 {
1681         int srcnode, dstnode, mindstnode, i, num_newly_healthy;
1682         uint32_t srcimbl, srcdsum, maximbl, dstimbl, dstdsum;
1683         uint32_t minsrcimbl, mindstimbl, b;
1684         struct ctdb_public_ip_list *minip;
1685         struct ctdb_public_ip_list *tmp_ip;
1686
1687         /* It is only worth continuing if we have suitable target
1688          * nodes to transfer IPs to.  This check is much cheaper than
1689          * continuing on...
1690          */
1691         num_newly_healthy = 0;
1692         for (i = 0; i < nodemap->num; i++) {
1693                 if (newly_healthy[i]) {
1694                         num_newly_healthy++;
1695                 }
1696         }
1697         if (num_newly_healthy == 0) {
1698                 return false;
1699         }
1700
1701         /* Get the node with the highest imbalance metric. */
1702         srcnode = -1;
1703         maximbl = 0;
1704         for (i=0; i < nodemap->num; i++) {
1705                 b = lcp2_imbalances[i];
1706                 if ((srcnode == -1) || (b > maximbl)) {
1707                         srcnode = i;
1708                         maximbl = b;
1709                 }
1710         }
1711
1712         /* This means that all nodes had 0 or 1 addresses, so can't be
1713          * imbalanced.
1714          */
1715         if (maximbl == 0) {
1716                 return false;
1717         }
1718
1719         /* Find an IP and destination node that best reduces imbalance. */
1720         minip = NULL;
1721         minsrcimbl = 0;
1722         mindstnode = -1;
1723         mindstimbl = 0;
1724
1725         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1726         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, maximbl));
1727
1728         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1729                 /* Only consider addresses on srcnode. */
1730                 if (tmp_ip->pnn != srcnode) {
1731                         continue;
1732                 }
1733
1734                 /* What is this IP address costing the source node? */
1735                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1736                 srcimbl = maximbl - srcdsum;
1737
1738                 /* Consider this IP address would cost each potential
1739                  * destination node.  Destination nodes are limited to
1740                  * those that are newly healthy, since we don't want
1741                  * to do gratuitous failover of IPs just to make minor
1742                  * balance improvements.
1743                  */
1744                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1745                         if (! newly_healthy[dstnode]) {
1746                                 continue;
1747                         }
1748                         /* only check nodes that can actually serve this ip */
1749                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1750                                 /* no it couldnt   so skip to the next node */
1751                                 continue;
1752                         }
1753
1754                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1755                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1756                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1757                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1758                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1759                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1760
1761                         if ((dstimbl < maximbl) && (dstdsum < srcdsum) && \
1762                             ((mindstnode == -1) ||                              \
1763                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1764
1765                                 minip = tmp_ip;
1766                                 minsrcimbl = srcimbl;
1767                                 mindstnode = dstnode;
1768                                 mindstimbl = dstimbl;
1769                         }
1770                 }
1771         }
1772         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1773
1774         if (mindstnode != -1) {
1775                 /* We found a move that makes things better... */
1776                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1777                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1778                                   ctdb_addr_to_str(&(minip->addr)),
1779                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1780
1781
1782                 lcp2_imbalances[srcnode] = srcimbl;
1783                 lcp2_imbalances[mindstnode] = mindstimbl;
1784                 minip->pnn = mindstnode;
1785
1786                 return true;
1787         }
1788
1789         return false;
1790         
1791 }
1792
1793 /* The calculation part of the IP allocation algorithm.
1794  * Not static, so we can easily link it into a unit test.
1795  */
1796 void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1797                             struct ctdb_node_map *nodemap,
1798                             struct ctdb_public_ip_list **all_ips_p)
1799 {
1800         int i, num_healthy, retries, num_ips;
1801         uint32_t mask;
1802         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1803         uint32_t *lcp2_imbalances;
1804         bool *newly_healthy;
1805
1806         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1807
1808         /* Count how many completely healthy nodes we have */
1809         num_healthy = 0;
1810         for (i=0;i<nodemap->num;i++) {
1811                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1812                         num_healthy++;
1813                 }
1814         }
1815
1816         if (num_healthy > 0) {
1817                 /* We have healthy nodes, so only consider them for 
1818                    serving public addresses
1819                 */
1820                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1821         } else {
1822                 /* We didnt have any completely healthy nodes so
1823                    use "disabled" nodes as a fallback
1824                 */
1825                 mask = NODE_FLAGS_INACTIVE;
1826         }
1827
1828         /* since nodes only know about those public addresses that
1829            can be served by that particular node, no single node has
1830            a full list of all public addresses that exist in the cluster.
1831            Walk over all node structures and create a merged list of
1832            all public addresses that exist in the cluster.
1833
1834            keep the tree of ips around as ctdb->ip_tree
1835         */
1836         all_ips = create_merged_ip_list(ctdb);
1837         *all_ips_p = all_ips; /* minimal code changes */
1838
1839         /* Count how many ips we have */
1840         num_ips = 0;
1841         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1842                 num_ips++;
1843         }
1844
1845         /* If we want deterministic ip allocations, i.e. that the ip addresses
1846            will always be allocated the same way for a specific set of
1847            available/unavailable nodes.
1848         */
1849         if (1 == ctdb->tunable.deterministic_public_ips) {              
1850                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1851                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1852                         tmp_ip->pnn = i%nodemap->num;
1853                 }
1854         }
1855
1856
1857         /* mark all public addresses with a masked node as being served by
1858            node -1
1859         */
1860         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1861                 if (tmp_ip->pnn == -1) {
1862                         continue;
1863                 }
1864                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1865                         tmp_ip->pnn = -1;
1866                 }
1867         }
1868
1869         /* verify that the assigned nodes can serve that public ip
1870            and set it to -1 if not
1871         */
1872         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1873                 if (tmp_ip->pnn == -1) {
1874                         continue;
1875                 }
1876                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1877                         /* this node can not serve this ip. */
1878                         tmp_ip->pnn = -1;
1879                 }
1880         }
1881
1882         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1883                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
1884         }
1885
1886         /* now we must redistribute all public addresses with takeover node
1887            -1 among the nodes available
1888         */
1889         retries = 0;
1890 try_again:
1891         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1892                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
1893         } else {
1894                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
1895         }
1896
1897         /* If we dont want ips to fail back after a node becomes healthy
1898            again, we wont even try to reallocat the ip addresses so that
1899            they are evenly spread out.
1900            This can NOT be used at the same time as DeterministicIPs !
1901         */
1902         if (1 == ctdb->tunable.no_ip_failback) {
1903                 if (1 == ctdb->tunable.deterministic_public_ips) {
1904                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1905                 }
1906                 goto finished;
1907         }
1908
1909
1910         /* now, try to make sure the ip adresses are evenly distributed
1911            across the node.
1912         */
1913         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1914                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
1915                         goto try_again;
1916                 }
1917         } else {
1918                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
1919                         goto try_again;
1920                 }
1921         }
1922
1923         /* finished distributing the public addresses, now just send the 
1924            info out to the nodes
1925         */
1926 finished:
1927
1928         /* at this point ->pnn is the node which will own each IP
1929            or -1 if there is no node that can cover this ip
1930         */
1931
1932         return;
1933 }
1934
1935 /*
1936   make any IP alias changes for public addresses that are necessary 
1937  */
1938 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1939 {
1940         int i;
1941         struct ctdb_public_ip ip;
1942         struct ctdb_public_ipv4 ipv4;
1943         uint32_t *nodes;
1944         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1945         TDB_DATA data;
1946         struct timeval timeout;
1947         struct client_async_data *async_data;
1948         struct ctdb_client_control_state *state;
1949         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1950
1951         /*
1952          * ip failover is completely disabled, just send out the 
1953          * ipreallocated event.
1954          */
1955         if (ctdb->tunable.disable_ip_failover != 0) {
1956                 goto ipreallocated;
1957         }
1958
1959         ZERO_STRUCT(ip);
1960
1961         /* Do the IP reassignment calculations */
1962         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
1963
1964         /* now tell all nodes to delete any alias that they should not
1965            have.  This will be a NOOP on nodes that don't currently
1966            hold the given alias */
1967         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1968         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1969
1970         for (i=0;i<nodemap->num;i++) {
1971                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1972                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1973                         continue;
1974                 }
1975
1976                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1977                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1978                                 /* This node should be serving this
1979                                    vnn so dont tell it to release the ip
1980                                 */
1981                                 continue;
1982                         }
1983                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1984                                 ipv4.pnn = tmp_ip->pnn;
1985                                 ipv4.sin = tmp_ip->addr.ip;
1986
1987                                 timeout = TAKEOVER_TIMEOUT();
1988                                 data.dsize = sizeof(ipv4);
1989                                 data.dptr  = (uint8_t *)&ipv4;
1990                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1991                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1992                                                 data, async_data,
1993                                                 &timeout, NULL);
1994                         } else {
1995                                 ip.pnn  = tmp_ip->pnn;
1996                                 ip.addr = tmp_ip->addr;
1997
1998                                 timeout = TAKEOVER_TIMEOUT();
1999                                 data.dsize = sizeof(ip);
2000                                 data.dptr  = (uint8_t *)&ip;
2001                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2002                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2003                                                 data, async_data,
2004                                                 &timeout, NULL);
2005                         }
2006
2007                         if (state == NULL) {
2008                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2009                                 talloc_free(tmp_ctx);
2010                                 return -1;
2011                         }
2012                 
2013                         ctdb_client_async_add(async_data, state);
2014                 }
2015         }
2016         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2017                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2018                 talloc_free(tmp_ctx);
2019                 return -1;
2020         }
2021         talloc_free(async_data);
2022
2023
2024         /* tell all nodes to get their own IPs */
2025         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2026         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2027         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2028                 if (tmp_ip->pnn == -1) {
2029                         /* this IP won't be taken over */
2030                         continue;
2031                 }
2032
2033                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2034                         ipv4.pnn = tmp_ip->pnn;
2035                         ipv4.sin = tmp_ip->addr.ip;
2036
2037                         timeout = TAKEOVER_TIMEOUT();
2038                         data.dsize = sizeof(ipv4);
2039                         data.dptr  = (uint8_t *)&ipv4;
2040                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2041                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2042                                         data, async_data,
2043                                         &timeout, NULL);
2044                 } else {
2045                         ip.pnn  = tmp_ip->pnn;
2046                         ip.addr = tmp_ip->addr;
2047
2048                         timeout = TAKEOVER_TIMEOUT();
2049                         data.dsize = sizeof(ip);
2050                         data.dptr  = (uint8_t *)&ip;
2051                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2052                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2053                                         data, async_data,
2054                                         &timeout, NULL);
2055                 }
2056                 if (state == NULL) {
2057                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2058                         talloc_free(tmp_ctx);
2059                         return -1;
2060                 }
2061                 
2062                 ctdb_client_async_add(async_data, state);
2063         }
2064         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2065                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2066                 talloc_free(tmp_ctx);
2067                 return -1;
2068         }
2069
2070 ipreallocated:
2071         /* tell all nodes to update natwg */
2072         /* send the flags update natgw on all connected nodes */
2073         data.dptr  = discard_const("ipreallocated");
2074         data.dsize = strlen((char *)data.dptr) + 1; 
2075         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2076         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2077                                       nodes, 0, TAKEOVER_TIMEOUT(),
2078                                       false, data,
2079                                       NULL, NULL,
2080                                       NULL) != 0) {
2081                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2082         }
2083
2084         talloc_free(tmp_ctx);
2085         return 0;
2086 }
2087
2088
2089 /*
2090   destroy a ctdb_client_ip structure
2091  */
2092 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2093 {
2094         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2095                 ctdb_addr_to_str(&ip->addr),
2096                 ntohs(ip->addr.ip.sin_port),
2097                 ip->client_id));
2098
2099         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2100         return 0;
2101 }
2102
2103 /*
2104   called by a client to inform us of a TCP connection that it is managing
2105   that should tickled with an ACK when IP takeover is done
2106   we handle both the old ipv4 style of packets as well as the new ipv4/6
2107   pdus.
2108  */
2109 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2110                                 TDB_DATA indata)
2111 {
2112         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2113         struct ctdb_control_tcp *old_addr = NULL;
2114         struct ctdb_control_tcp_addr new_addr;
2115         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2116         struct ctdb_tcp_list *tcp;
2117         struct ctdb_tcp_connection t;
2118         int ret;
2119         TDB_DATA data;
2120         struct ctdb_client_ip *ip;
2121         struct ctdb_vnn *vnn;
2122         ctdb_sock_addr addr;
2123
2124         switch (indata.dsize) {
2125         case sizeof(struct ctdb_control_tcp):
2126                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2127                 ZERO_STRUCT(new_addr);
2128                 tcp_sock = &new_addr;
2129                 tcp_sock->src.ip  = old_addr->src;
2130                 tcp_sock->dest.ip = old_addr->dest;
2131                 break;
2132         case sizeof(struct ctdb_control_tcp_addr):
2133                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2134                 break;
2135         default:
2136                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2137                                  "to ctdb_control_tcp_client. size was %d but "
2138                                  "only allowed sizes are %lu and %lu\n",
2139                                  (int)indata.dsize,
2140                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2141                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2142                 return -1;
2143         }
2144
2145         addr = tcp_sock->src;
2146         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2147         addr = tcp_sock->dest;
2148         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2149
2150         ZERO_STRUCT(addr);
2151         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2152         vnn = find_public_ip_vnn(ctdb, &addr);
2153         if (vnn == NULL) {
2154                 switch (addr.sa.sa_family) {
2155                 case AF_INET:
2156                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2157                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2158                                         ctdb_addr_to_str(&addr)));
2159                         }
2160                         break;
2161                 case AF_INET6:
2162                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2163                                 ctdb_addr_to_str(&addr)));
2164                         break;
2165                 default:
2166                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2167                 }
2168
2169                 return 0;
2170         }
2171
2172         if (vnn->pnn != ctdb->pnn) {
2173                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2174                         ctdb_addr_to_str(&addr),
2175                         client_id, client->pid));
2176                 /* failing this call will tell smbd to die */
2177                 return -1;
2178         }
2179
2180         ip = talloc(client, struct ctdb_client_ip);
2181         CTDB_NO_MEMORY(ctdb, ip);
2182
2183         ip->ctdb      = ctdb;
2184         ip->addr      = addr;
2185         ip->client_id = client_id;
2186         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2187         DLIST_ADD(ctdb->client_ip_list, ip);
2188
2189         tcp = talloc(client, struct ctdb_tcp_list);
2190         CTDB_NO_MEMORY(ctdb, tcp);
2191
2192         tcp->connection.src_addr = tcp_sock->src;
2193         tcp->connection.dst_addr = tcp_sock->dest;
2194
2195         DLIST_ADD(client->tcp_list, tcp);
2196
2197         t.src_addr = tcp_sock->src;
2198         t.dst_addr = tcp_sock->dest;
2199
2200         data.dptr = (uint8_t *)&t;
2201         data.dsize = sizeof(t);
2202
2203         switch (addr.sa.sa_family) {
2204         case AF_INET:
2205                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2206                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2207                         ctdb_addr_to_str(&tcp_sock->src),
2208                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2209                 break;
2210         case AF_INET6:
2211                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2212                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2213                         ctdb_addr_to_str(&tcp_sock->src),
2214                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2215                 break;
2216         default:
2217                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2218         }
2219
2220
2221         /* tell all nodes about this tcp connection */
2222         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2223                                        CTDB_CONTROL_TCP_ADD,
2224                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2225         if (ret != 0) {
2226                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2227                 return -1;
2228         }
2229
2230         return 0;
2231 }
2232
2233 /*
2234   find a tcp address on a list
2235  */
2236 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2237                                            struct ctdb_tcp_connection *tcp)
2238 {
2239         int i;
2240
2241         if (array == NULL) {
2242                 return NULL;
2243         }
2244
2245         for (i=0;i<array->num;i++) {
2246                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2247                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2248                         return &array->connections[i];
2249                 }
2250         }
2251         return NULL;
2252 }
2253
2254
2255
2256 /*
2257   called by a daemon to inform us of a TCP connection that one of its
2258   clients managing that should tickled with an ACK when IP takeover is
2259   done
2260  */
2261 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2262 {
2263         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2264         struct ctdb_tcp_array *tcparray;
2265         struct ctdb_tcp_connection tcp;
2266         struct ctdb_vnn *vnn;
2267
2268         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2269         if (vnn == NULL) {
2270                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2271                         ctdb_addr_to_str(&p->dst_addr)));
2272
2273                 return -1;
2274         }
2275
2276
2277         tcparray = vnn->tcp_array;
2278
2279         /* If this is the first tickle */
2280         if (tcparray == NULL) {
2281                 tcparray = talloc_size(ctdb->nodes, 
2282                         offsetof(struct ctdb_tcp_array, connections) +
2283                         sizeof(struct ctdb_tcp_connection) * 1);
2284                 CTDB_NO_MEMORY(ctdb, tcparray);
2285                 vnn->tcp_array = tcparray;
2286
2287                 tcparray->num = 0;
2288                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2289                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2290
2291                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2292                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2293                 tcparray->num++;
2294
2295                 if (tcp_update_needed) {
2296                         vnn->tcp_update_needed = true;
2297                 }
2298                 return 0;
2299         }
2300
2301
2302         /* Do we already have this tickle ?*/
2303         tcp.src_addr = p->src_addr;
2304         tcp.dst_addr = p->dst_addr;
2305         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2306                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2307                         ctdb_addr_to_str(&tcp.dst_addr),
2308                         ntohs(tcp.dst_addr.ip.sin_port),
2309                         vnn->pnn));
2310                 return 0;
2311         }
2312
2313         /* A new tickle, we must add it to the array */
2314         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2315                                         struct ctdb_tcp_connection,
2316                                         tcparray->num+1);
2317         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2318
2319         vnn->tcp_array = tcparray;
2320         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2321         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2322         tcparray->num++;
2323                                 
2324         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2325                 ctdb_addr_to_str(&tcp.dst_addr),
2326                 ntohs(tcp.dst_addr.ip.sin_port),
2327                 vnn->pnn));
2328
2329         if (tcp_update_needed) {
2330                 vnn->tcp_update_needed = true;
2331         }
2332
2333         return 0;
2334 }
2335
2336
/*
  remove a TCP connection from the tickle list of the public address
  it was made to; does nothing if the address is not a public address
  or the connection is not on the list
 */
2342 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2343 {
2344         struct ctdb_tcp_connection *tcpp;
2345         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2346
2347         if (vnn == NULL) {
2348                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2349                         ctdb_addr_to_str(&conn->dst_addr)));
2350                 return;
2351         }
2352
2353         /* if the array is empty we cant remove it
2354            and we dont need to do anything
2355          */
2356         if (vnn->tcp_array == NULL) {
2357                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2358                         ctdb_addr_to_str(&conn->dst_addr),
2359                         ntohs(conn->dst_addr.ip.sin_port)));
2360                 return;
2361         }
2362
2363
2364         /* See if we know this connection
2365            if we dont know this connection  then we dont need to do anything
2366          */
2367         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2368         if (tcpp == NULL) {
2369                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2370                         ctdb_addr_to_str(&conn->dst_addr),
2371                         ntohs(conn->dst_addr.ip.sin_port)));
2372                 return;
2373         }
2374
2375
2376         /* We need to remove this entry from the array.
2377            Instead of allocating a new array and copying data to it
2378            we cheat and just copy the last entry in the existing array
2379            to the entry that is to be removed and just shring the 
2380            ->num field
2381          */
2382         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2383         vnn->tcp_array->num--;
2384
2385         /* If we deleted the last entry we also need to remove the entire array
2386          */
2387         if (vnn->tcp_array->num == 0) {
2388                 talloc_free(vnn->tcp_array);
2389                 vnn->tcp_array = NULL;
2390         }               
2391
2392         vnn->tcp_update_needed = true;
2393
2394         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2395                 ctdb_addr_to_str(&conn->src_addr),
2396                 ntohs(conn->src_addr.ip.sin_port)));
2397 }
2398
2399
2400 /*
2401   called by a daemon to inform us of a TCP connection that one of its
2402   clients used are no longer needed in the tickle database
2403  */
2404 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2405 {
2406         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2407
2408         ctdb_remove_tcp_connection(ctdb, conn);
2409
2410         return 0;
2411 }
2412
2413
/*
  called when a daemon restarts - meant to send all tickles for all public
  addresses we are serving immediately to the new node.

  Currently a stub: nothing is sent and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/* XXX here we should send all tickles we are serving to the new node */
	(void)ctdb;
	(void)vnn;

	return 0;
}
2423
2424
2425 /*
2426   called when a client structure goes away - hook to remove
2427   elements from the tcp_list in all daemons
2428  */
2429 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2430 {
2431         while (client->tcp_list) {
2432                 struct ctdb_tcp_list *tcp = client->tcp_list;
2433                 DLIST_REMOVE(client->tcp_list, tcp);
2434                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2435         }
2436 }
2437
2438
2439 /*
2440   release all IPs on shutdown
2441  */
2442 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2443 {
2444         struct ctdb_vnn *vnn;
2445
2446         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2447                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2448                         ctdb_vnn_unassign_iface(ctdb, vnn);
2449                         continue;
2450                 }
2451                 if (!vnn->iface) {
2452                         continue;
2453                 }
2454                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2455                                   ctdb_vnn_iface_string(vnn),
2456                                   ctdb_addr_to_str(&vnn->public_address),
2457                                   vnn->public_netmask_bits);
2458                 release_kill_clients(ctdb, &vnn->public_address);
2459                 ctdb_vnn_unassign_iface(ctdb, vnn);
2460         }
2461 }
2462
2463
2464 /*
2465   get list of public IPs
2466  */
2467 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2468                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2469 {
2470         int i, num, len;
2471         struct ctdb_all_public_ips *ips;
2472         struct ctdb_vnn *vnn;
2473         bool only_available = false;
2474
2475         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2476                 only_available = true;
2477         }
2478
2479         /* count how many public ip structures we have */
2480         num = 0;
2481         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2482                 num++;
2483         }
2484
2485         len = offsetof(struct ctdb_all_public_ips, ips) + 
2486                 num*sizeof(struct ctdb_public_ip);
2487         ips = talloc_zero_size(outdata, len);
2488         CTDB_NO_MEMORY(ctdb, ips);
2489
2490         i = 0;
2491         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2492                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2493                         continue;
2494                 }
2495                 ips->ips[i].pnn  = vnn->pnn;
2496                 ips->ips[i].addr = vnn->public_address;
2497                 i++;
2498         }
2499         ips->num = i;
2500         len = offsetof(struct ctdb_all_public_ips, ips) +
2501                 i*sizeof(struct ctdb_public_ip);
2502
2503         outdata->dsize = len;
2504         outdata->dptr  = (uint8_t *)ips;
2505
2506         return 0;
2507 }
2508
2509
2510 /*
2511   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2512  */
2513 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2514                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2515 {
2516         int i, num, len;
2517         struct ctdb_all_public_ipsv4 *ips;
2518         struct ctdb_vnn *vnn;
2519
2520         /* count how many public ip structures we have */
2521         num = 0;
2522         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2523                 if (vnn->public_address.sa.sa_family != AF_INET) {
2524                         continue;
2525                 }
2526                 num++;
2527         }
2528
2529         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2530                 num*sizeof(struct ctdb_public_ipv4);
2531         ips = talloc_zero_size(outdata, len);
2532         CTDB_NO_MEMORY(ctdb, ips);
2533
2534         outdata->dsize = len;
2535         outdata->dptr  = (uint8_t *)ips;
2536
2537         ips->num = num;
2538         i = 0;
2539         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2540                 if (vnn->public_address.sa.sa_family != AF_INET) {
2541                         continue;
2542                 }
2543                 ips->ips[i].pnn = vnn->pnn;
2544                 ips->ips[i].sin = vnn->public_address.ip;
2545                 i++;
2546         }
2547
2548         return 0;
2549 }
2550
2551 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2552                                         struct ctdb_req_control *c,
2553                                         TDB_DATA indata,
2554                                         TDB_DATA *outdata)
2555 {
2556         int i, num, len;
2557         ctdb_sock_addr *addr;
2558         struct ctdb_control_public_ip_info *info;
2559         struct ctdb_vnn *vnn;
2560
2561         addr = (ctdb_sock_addr *)indata.dptr;
2562
2563         vnn = find_public_ip_vnn(ctdb, addr);
2564         if (vnn == NULL) {
2565                 /* if it is not a public ip   it could be our 'single ip' */
2566                 if (ctdb->single_ip_vnn) {
2567                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2568                                 vnn = ctdb->single_ip_vnn;
2569                         }
2570                 }
2571         }
2572         if (vnn == NULL) {
2573                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2574                                  "'%s'not a public address\n",
2575                                  ctdb_addr_to_str(addr)));
2576                 return -1;
2577         }
2578
2579         /* count how many public ip structures we have */
2580         num = 0;
2581         for (;vnn->ifaces[num];) {
2582                 num++;
2583         }
2584
2585         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2586                 num*sizeof(struct ctdb_control_iface_info);
2587         info = talloc_zero_size(outdata, len);
2588         CTDB_NO_MEMORY(ctdb, info);
2589
2590         info->ip.addr = vnn->public_address;
2591         info->ip.pnn = vnn->pnn;
2592         info->active_idx = 0xFFFFFFFF;
2593
2594         for (i=0; vnn->ifaces[i]; i++) {
2595                 struct ctdb_iface *cur;
2596
2597                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2598                 if (cur == NULL) {
2599                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2600                                            vnn->ifaces[i]));
2601                         return -1;
2602                 }
2603                 if (vnn->iface == cur) {
2604                         info->active_idx = i;
2605                 }
2606                 strcpy(info->ifaces[i].name, cur->name);
2607                 info->ifaces[i].link_state = cur->link_up;
2608                 info->ifaces[i].references = cur->references;
2609         }
2610         info->num = i;
2611         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2612                 i*sizeof(struct ctdb_control_iface_info);
2613
2614         outdata->dsize = len;
2615         outdata->dptr  = (uint8_t *)info;
2616
2617         return 0;
2618 }
2619
2620 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2621                                 struct ctdb_req_control *c,
2622                                 TDB_DATA *outdata)
2623 {
2624         int i, num, len;
2625         struct ctdb_control_get_ifaces *ifaces;
2626         struct ctdb_iface *cur;
2627
2628         /* count how many public ip structures we have */
2629         num = 0;
2630         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2631                 num++;
2632         }
2633
2634         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2635                 num*sizeof(struct ctdb_control_iface_info);
2636         ifaces = talloc_zero_size(outdata, len);
2637         CTDB_NO_MEMORY(ctdb, ifaces);
2638
2639         i = 0;
2640         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2641                 strcpy(ifaces->ifaces[i].name, cur->name);
2642                 ifaces->ifaces[i].link_state = cur->link_up;
2643                 ifaces->ifaces[i].references = cur->references;
2644                 i++;
2645         }
2646         ifaces->num = i;
2647         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2648                 i*sizeof(struct ctdb_control_iface_info);
2649
2650         outdata->dsize = len;
2651         outdata->dptr  = (uint8_t *)ifaces;
2652
2653         return 0;
2654 }
2655
2656 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2657                                     struct ctdb_req_control *c,
2658                                     TDB_DATA indata)
2659 {
2660         struct ctdb_control_iface_info *info;
2661         struct ctdb_iface *iface;
2662         bool link_up = false;
2663
2664         info = (struct ctdb_control_iface_info *)indata.dptr;
2665
2666         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2667                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2668                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2669                                   len, len, info->name));
2670                 return -1;
2671         }
2672
2673         switch (info->link_state) {
2674         case 0:
2675                 link_up = false;
2676                 break;
2677         case 1:
2678                 link_up = true;
2679                 break;
2680         default:
2681                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2682                                   (unsigned int)info->link_state));
2683                 return -1;
2684         }
2685
2686         if (info->references != 0) {
2687                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2688                                   (unsigned int)info->references));
2689                 return -1;
2690         }
2691
2692         iface = ctdb_find_iface(ctdb, info->name);
2693         if (iface == NULL) {
2694                 return -1;
2695         }
2696
2697         if (link_up == iface->link_up) {
2698                 return 0;
2699         }
2700
2701         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2702               ("iface[%s] has changed it's link status %s => %s\n",
2703                iface->name,
2704                iface->link_up?"up":"down",
2705                link_up?"up":"down"));
2706
2707         iface->link_up = link_up;
2708         return 0;
2709 }
2710
2711
2712 /* 
2713    structure containing the listening socket and the list of tcp connections
2714    that the ctdb daemon is to kill
2715 */
2716 struct ctdb_kill_tcp {
2717         struct ctdb_vnn *vnn;
2718         struct ctdb_context *ctdb;
2719         int capture_fd;
2720         struct fd_event *fde;
2721         trbt_tree_t *connections;
2722         void *private_data;
2723 };
2724
2725 /*
2726   a tcp connection that is to be killed
2727  */
2728 struct ctdb_killtcp_con {
2729         ctdb_sock_addr src_addr;
2730         ctdb_sock_addr dst_addr;
2731         int count;
2732         struct ctdb_kill_tcp *killtcp;
2733 };
2734
2735 /* this function is used to create a key to represent this socketpair
2736    in the killtcp tree.
2737    this key is used to insert and lookup matching socketpairs that are
2738    to be tickled and RST
2739 */
2740 #define KILLTCP_KEYLEN  10
2741 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2742 {
2743         static uint32_t key[KILLTCP_KEYLEN];
2744
2745         bzero(key, sizeof(key));
2746
2747         if (src->sa.sa_family != dst->sa.sa_family) {
2748                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2749                 return key;
2750         }
2751         
2752         switch (src->sa.sa_family) {
2753         case AF_INET:
2754                 key[0]  = dst->ip.sin_addr.s_addr;
2755                 key[1]  = src->ip.sin_addr.s_addr;
2756                 key[2]  = dst->ip.sin_port;
2757                 key[3]  = src->ip.sin_port;
2758                 break;
2759         case AF_INET6:
2760                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2761                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2762                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2763                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2764                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2765                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2766                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2767                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2768                 key[8]  = dst->ip6.sin6_port;
2769                 key[9]  = src->ip6.sin6_port;
2770                 break;
2771         default:
2772                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2773                 return key;
2774         }
2775
2776         return key;
2777 }
2778
2779 /*
2780   called when we get a read event on the raw socket
2781  */
2782 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2783                                 uint16_t flags, void *private_data)
2784 {
2785         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2786         struct ctdb_killtcp_con *con;
2787         ctdb_sock_addr src, dst;
2788         uint32_t ack_seq, seq;
2789
2790         if (!(flags & EVENT_FD_READ)) {
2791                 return;
2792         }
2793
2794         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2795                                 killtcp->private_data,
2796                                 &src, &dst,
2797                                 &ack_seq, &seq) != 0) {
2798                 /* probably a non-tcp ACK packet */
2799                 return;
2800         }
2801
2802         /* check if we have this guy in our list of connections
2803            to kill
2804         */
2805         con = trbt_lookuparray32(killtcp->connections, 
2806                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2807         if (con == NULL) {
2808                 /* no this was some other packet we can just ignore */
2809                 return;
2810         }
2811
2812         /* This one has been tickled !
2813            now reset him and remove him from the list.
2814          */
2815         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2816                 ntohs(con->dst_addr.ip.sin_port),
2817                 ctdb_addr_to_str(&con->src_addr),
2818                 ntohs(con->src_addr.ip.sin_port)));
2819
2820         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2821         talloc_free(con);
2822 }
2823
2824
2825 /* when traversing the list of all tcp connections to send tickle acks to
2826    (so that we can capture the ack coming back and kill the connection
2827     by a RST)
2828    this callback is called for each connection we are currently trying to kill
2829 */
2830 static void tickle_connection_traverse(void *param, void *data)
2831 {
2832         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2833
2834         /* have tried too many times, just give up */
2835         if (con->count >= 5) {
2836                 /* can't delete in traverse: reparent to delete_cons */
2837                 talloc_steal(param, con);
2838                 return;
2839         }
2840
2841         /* othervise, try tickling it again */
2842         con->count++;
2843         ctdb_sys_send_tcp(
2844                 (ctdb_sock_addr *)&con->dst_addr,
2845                 (ctdb_sock_addr *)&con->src_addr,
2846                 0, 0, 0);
2847 }
2848
2849
2850 /* 
2851    called every second until all sentenced connections have been reset
2852  */
2853 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2854                                               struct timeval t, void *private_data)
2855 {
2856         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2857         void *delete_cons = talloc_new(NULL);
2858
2859         /* loop over all connections sending tickle ACKs */
2860         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2861
2862         /* now we've finished traverse, it's safe to do deletion. */
2863         talloc_free(delete_cons);
2864
2865         /* If there are no more connections to kill we can remove the
2866            entire killtcp structure
2867          */
2868         if ( (killtcp->connections == NULL) || 
2869              (killtcp->connections->root == NULL) ) {
2870                 talloc_free(killtcp);
2871                 return;
2872         }
2873
2874         /* try tickling them again in a seconds time
2875          */
2876         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2877                         ctdb_tickle_sentenced_connections, killtcp);
2878 }
2879
2880 /*
2881   destroy the killtcp structure
2882  */
2883 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2884 {
2885         if (killtcp->vnn) {
2886                 killtcp->vnn->killtcp = NULL;
2887         }
2888         return 0;
2889 }
2890
2891
/* insert callback for the killtcp tree.

   nothing fancy here, just unconditionally replace any existing
   connection structure with the new one by returning the new one.

   the old one, if there was one, is deliberately not freed here: it is
   talloc_stolen by the same node in the tree anyway and will be deleted
   when the new data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
2903
2904 /*
2905   add a tcp socket to the list of connections we want to RST
2906  */
2907 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2908                                        ctdb_sock_addr *s,
2909                                        ctdb_sock_addr *d)
2910 {
2911         ctdb_sock_addr src, dst;
2912         struct ctdb_kill_tcp *killtcp;
2913         struct ctdb_killtcp_con *con;
2914         struct ctdb_vnn *vnn;
2915
2916         ctdb_canonicalize_ip(s, &src);
2917         ctdb_canonicalize_ip(d, &dst);
2918
2919         vnn = find_public_ip_vnn(ctdb, &dst);
2920         if (vnn == NULL) {
2921                 vnn = find_public_ip_vnn(ctdb, &src);
2922         }
2923         if (vnn == NULL) {
2924                 /* if it is not a public ip   it could be our 'single ip' */
2925                 if (ctdb->single_ip_vnn) {
2926                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2927                                 vnn = ctdb->single_ip_vnn;
2928                         }
2929                 }
2930         }
2931         if (vnn == NULL) {
2932                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2933                 return -1;
2934         }
2935
2936         killtcp = vnn->killtcp;
2937         
2938         /* If this is the first connection to kill we must allocate
2939            a new structure
2940          */
2941         if (killtcp == NULL) {
2942                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2943                 CTDB_NO_MEMORY(ctdb, killtcp);
2944
2945                 killtcp->vnn         = vnn;
2946                 killtcp->ctdb        = ctdb;
2947                 killtcp->capture_fd  = -1;
2948                 killtcp->connections = trbt_create(killtcp, 0);
2949
2950                 vnn->killtcp         = killtcp;
2951                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2952         }
2953
2954
2955
2956         /* create a structure that describes this connection we want to
2957            RST and store it in killtcp->connections
2958         */
2959         con = talloc(killtcp, struct ctdb_killtcp_con);
2960         CTDB_NO_MEMORY(ctdb, con);
2961         con->src_addr = src;
2962         con->dst_addr = dst;
2963         con->count    = 0;
2964         con->killtcp  = killtcp;
2965
2966
2967         trbt_insertarray32_callback(killtcp->connections,
2968                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2969                         add_killtcp_callback, con);
2970
2971         /* 
2972            If we dont have a socket to listen on yet we must create it
2973          */
2974         if (killtcp->capture_fd == -1) {
2975                 const char *iface = ctdb_vnn_iface_string(vnn);
2976                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2977                 if (killtcp->capture_fd == -1) {
2978                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2979                                           "socket on iface '%s' for killtcp (%s)\n",
2980                                           iface, strerror(errno)));
2981                         goto failed;
2982                 }
2983         }
2984
2985
2986         if (killtcp->fde == NULL) {
2987                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2988                                             EVENT_FD_READ,
2989                                             capture_tcp_handler, killtcp);
2990                 tevent_fd_set_auto_close(killtcp->fde);
2991
2992                 /* We also need to set up some events to tickle all these connections
2993                    until they are all reset
2994                 */
2995                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2996                                 ctdb_tickle_sentenced_connections, killtcp);
2997         }
2998
2999         /* tickle him once now */
3000         ctdb_sys_send_tcp(
3001                 &con->dst_addr,
3002                 &con->src_addr,
3003                 0, 0, 0);
3004
3005         return 0;
3006
3007 failed:
3008         talloc_free(vnn->killtcp);
3009         vnn->killtcp = NULL;
3010         return -1;
3011 }
3012
3013 /*
3014   kill a TCP connection.
3015  */
3016 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3017 {
3018         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3019
3020         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3021 }
3022
3023 /*
3024   called by a daemon to inform us of the entire list of TCP tickles for
3025   a particular public address.
3026   this control should only be sent by the node that is currently serving
3027   that public address.
3028  */
3029 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3030 {
3031         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3032         struct ctdb_tcp_array *tcparray;
3033         struct ctdb_vnn *vnn;
3034
3035         /* We must at least have tickles.num or else we cant verify the size
3036            of the received data blob
3037          */
3038         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3039                                         tickles.connections)) {
3040                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3041                 return -1;
3042         }
3043
3044         /* verify that the size of data matches what we expect */
3045         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3046                                 tickles.connections)
3047                          + sizeof(struct ctdb_tcp_connection)
3048                                  * list->tickles.num) {
3049                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3050                 return -1;
3051         }       
3052
3053         vnn = find_public_ip_vnn(ctdb, &list->addr);
3054         if (vnn == NULL) {
3055                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3056                         ctdb_addr_to_str(&list->addr)));
3057
3058                 return 1;
3059         }
3060
3061         /* remove any old ticklelist we might have */
3062         talloc_free(vnn->tcp_array);
3063         vnn->tcp_array = NULL;
3064
3065         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3066         CTDB_NO_MEMORY(ctdb, tcparray);
3067
3068         tcparray->num = list->tickles.num;
3069
3070         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3071         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3072
3073         memcpy(tcparray->connections, &list->tickles.connections[0], 
3074                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3075
3076         /* We now have a new fresh tickle list array for this vnn */
3077         vnn->tcp_array = talloc_steal(vnn, tcparray);
3078         
3079         return 0;
3080 }
3081
3082 /*
3083   called to return the full list of tickles for the puclic address associated 
3084   with the provided vnn
3085  */
3086 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3087 {
3088         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3089         struct ctdb_control_tcp_tickle_list *list;
3090         struct ctdb_tcp_array *tcparray;
3091         int num;
3092         struct ctdb_vnn *vnn;
3093
3094         vnn = find_public_ip_vnn(ctdb, addr);
3095         if (vnn == NULL) {
3096                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3097                         ctdb_addr_to_str(addr)));
3098
3099                 return 1;
3100         }
3101
3102         tcparray = vnn->tcp_array;
3103         if (tcparray) {
3104                 num = tcparray->num;
3105         } else {
3106                 num = 0;
3107         }
3108
3109         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3110                                 tickles.connections)
3111                         + sizeof(struct ctdb_tcp_connection) * num;
3112
3113         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3114         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3115         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3116
3117         list->addr = *addr;
3118         list->tickles.num = num;
3119         if (num) {
3120                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3121                         sizeof(struct ctdb_tcp_connection) * num);
3122         }
3123
3124         return 0;
3125 }
3126
3127
3128 /*
3129   set the list of all tcp tickles for a public address
3130  */
3131 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3132                               struct timeval timeout, uint32_t destnode, 
3133                               ctdb_sock_addr *addr,
3134                               struct ctdb_tcp_array *tcparray)
3135 {
3136         int ret, num;
3137         TDB_DATA data;
3138         struct ctdb_control_tcp_tickle_list *list;
3139
3140         if (tcparray) {
3141                 num = tcparray->num;
3142         } else {
3143                 num = 0;
3144         }
3145
3146         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3147                                 tickles.connections) +
3148                         sizeof(struct ctdb_tcp_connection) * num;
3149         data.dptr = talloc_size(ctdb, data.dsize);
3150         CTDB_NO_MEMORY(ctdb, data.dptr);
3151
3152         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3153         list->addr = *addr;
3154         list->tickles.num = num;
3155         if (tcparray) {
3156                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3157         }
3158
3159         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3160                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3161                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3162         if (ret != 0) {
3163                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3164                 return -1;
3165         }
3166
3167         talloc_free(data.dptr);
3168
3169         return ret;
3170 }
3171
3172
3173 /*
3174   perform tickle updates if required
3175  */
3176 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3177                                 struct timed_event *te, 
3178                                 struct timeval t, void *private_data)
3179 {
3180         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3181         int ret;
3182         struct ctdb_vnn *vnn;
3183
3184         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3185                 /* we only send out updates for public addresses that 
3186                    we have taken over
3187                  */
3188                 if (ctdb->pnn != vnn->pnn) {
3189                         continue;
3190                 }
3191                 /* We only send out the updates if we need to */
3192                 if (!vnn->tcp_update_needed) {
3193                         continue;
3194                 }
3195                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3196                                 TAKEOVER_TIMEOUT(),
3197                                 CTDB_BROADCAST_CONNECTED,
3198                                 &vnn->public_address,
3199                                 vnn->tcp_array);
3200                 if (ret != 0) {
3201                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3202                                 ctdb_addr_to_str(&vnn->public_address)));
3203                 }
3204         }
3205
3206         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3207                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3208                              ctdb_update_tcp_tickles, ctdb);
3209 }               
3210         
3211
3212 /*
3213   start periodic update of tcp tickles
3214  */
3215 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3216 {
3217         ctdb->tickle_update_context = talloc_new(ctdb);
3218
3219         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3220                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3221                              ctdb_update_tcp_tickles, ctdb);
3222 }
3223
3224
3225
3226
3227 struct control_gratious_arp {
3228         struct ctdb_context *ctdb;
3229         ctdb_sock_addr addr;
3230         const char *iface;
3231         int count;
3232 };
3233
3234 /*
3235   send a control_gratuitous arp
3236  */
3237 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3238                                   struct timeval t, void *private_data)
3239 {
3240         int ret;
3241         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3242                                                         struct control_gratious_arp);
3243
3244         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3245         if (ret != 0) {
3246                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3247                                  arp->iface, strerror(errno)));
3248         }
3249
3250
3251         arp->count++;
3252         if (arp->count == CTDB_ARP_REPEAT) {
3253                 talloc_free(arp);
3254                 return;
3255         }
3256
3257         event_add_timed(arp->ctdb->ev, arp, 
3258                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3259                         send_gratious_arp, arp);
3260 }
3261
3262
3263 /*
3264   send a gratious arp 
3265  */
3266 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3267 {
3268         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3269         struct control_gratious_arp *arp;
3270
3271         /* verify the size of indata */
3272         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3273                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3274                                  (unsigned)indata.dsize, 
3275                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3276                 return -1;
3277         }
3278         if (indata.dsize != 
3279                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3280                 + gratious_arp->len ) ){
3281
3282                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3283                         "but should be %u bytes\n", 
3284                          (unsigned)indata.dsize, 
3285                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3286                 return -1;
3287         }
3288
3289
3290         arp = talloc(ctdb, struct control_gratious_arp);
3291         CTDB_NO_MEMORY(ctdb, arp);
3292
3293         arp->ctdb  = ctdb;
3294         arp->addr   = gratious_arp->addr;
3295         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3296         CTDB_NO_MEMORY(ctdb, arp->iface);
3297         arp->count = 0;
3298         
3299         event_add_timed(arp->ctdb->ev, arp, 
3300                         timeval_zero(), send_gratious_arp, arp);
3301
3302         return 0;
3303 }
3304
3305 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3306 {
3307         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3308         int ret;
3309
3310         /* verify the size of indata */
3311         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3312                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3313                 return -1;
3314         }
3315         if (indata.dsize != 
3316                 ( offsetof(struct ctdb_control_ip_iface, iface)
3317                 + pub->len ) ){
3318
3319                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3320                         "but should be %u bytes\n", 
3321                          (unsigned)indata.dsize, 
3322                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3323                 return -1;
3324         }
3325
3326         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
3327
3328         if (ret != 0) {
3329                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3330                 return -1;
3331         }
3332
3333         return 0;
3334 }
3335
/*
  completion callback for the releaseip event started by
  ctdb_control_del_public_address(); private_data is the talloc context
  created there, and freeing it is all that is left to do. The event
  status is ignored.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status,
				void *private_data)
{
	void *mem_ctx = private_data;

	talloc_free(mem_ctx);
}
3344
3345 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3346 {
3347         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3348         struct ctdb_vnn *vnn;
3349         int ret;
3350
3351         /* verify the size of indata */
3352         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3353                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3354                 return -1;
3355         }
3356         if (indata.dsize != 
3357                 ( offsetof(struct ctdb_control_ip_iface, iface)
3358                 + pub->len ) ){
3359
3360                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3361                         "but should be %u bytes\n", 
3362                          (unsigned)indata.dsize, 
3363                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3364                 return -1;
3365         }
3366
3367         /* walk over all public addresses until we find a match */
3368         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3369                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3370                         TALLOC_CTX *mem_ctx;
3371
3372                         DLIST_REMOVE(ctdb->vnn, vnn);
3373                         if (vnn->iface != NULL) {
3374                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3375                         }
3376                         if (vnn->pnn != ctdb->pnn) {
3377                                 talloc_free(vnn);
3378                                 return 0;
3379                         }
3380
3381                         mem_ctx = talloc_new(ctdb);
3382                         talloc_steal(mem_ctx, vnn);
3383                         ret = ctdb_event_script_callback(ctdb, 
3384                                          mem_ctx, delete_ip_callback, mem_ctx,
3385                                          false,
3386                                          CTDB_EVENT_RELEASE_IP,
3387                                          "%s %s %u",
3388                                          ctdb_vnn_iface_string(vnn),
3389                                          ctdb_addr_to_str(&vnn->public_address),
3390                                          vnn->public_netmask_bits);
3391                         if (ret != 0) {
3392                                 return -1;
3393                         }
3394                         return 0;
3395                 }
3396         }
3397
3398         return -1;
3399 }
3400
3401 /* This function is called from the recovery daemon to verify that a remote
3402    node has the expected ip allocation.
3403    This is verified against ctdb->ip_tree
3404 */
3405 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3406 {
3407         struct ctdb_public_ip_list *tmp_ip; 
3408         int i;
3409
3410         if (ctdb->ip_tree == NULL) {
3411                 /* dont know the expected allocation yet, assume remote node
3412                    is correct. */
3413                 return 0;
3414         }
3415
3416         if (ips == NULL) {
3417                 return 0;
3418         }
3419
3420         for (i=0; i<ips->num; i++) {
3421                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3422                 if (tmp_ip == NULL) {
3423                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3424                         return -1;
3425                 }
3426
3427                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3428                         continue;
3429                 }
3430
3431                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3432                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3433                         return -1;
3434                 }
3435         }
3436
3437         return 0;
3438 }
3439
3440 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3441 {
3442         struct ctdb_public_ip_list *tmp_ip; 
3443
3444         if (ctdb->ip_tree == NULL) {
3445                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3446                 return -1;
3447         }
3448
3449         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3450         if (tmp_ip == NULL) {
3451                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3452                 return -1;
3453         }
3454
3455         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3456         tmp_ip->pnn = ip->pnn;
3457
3458         return 0;
3459 }