recoverd: Track the nodes that fail takeover run and set culprit count
[ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tevent/tevent.h"
23 #include "lib/tdb/include/tdb.h"
24 #include "lib/util/dlinklist.h"
25 #include "system/network.h"
26 #include "system/filesys.h"
27 #include "system/wait.h"
28 #include "../include/ctdb_private.h"
29 #include "../common/rb_tree.h"
30
31
/* Timeout value built by timeval_current_ofs() from the takeover_timeout
   tunable.  The expansion refers to a variable named "ctdb", so one must
   be in scope wherever this macro is used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* first argument given to timeval_current_ofs() when the gratuitous ARP
   timer is re-armed */
#define CTDB_ARP_INTERVAL 1
/* number of gratuitous ARP rounds sent for one announcement */
#define CTDB_ARP_REPEAT   3
36
/*
  a local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
	/* linkage for the ctdb->ifaces list */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name; lookups compare it with strcmp() */
	const char *name;
	/* interfaces without link are never chosen for an address */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
43
44 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
45 {
46         if (vnn->iface) {
47                 return vnn->iface->name;
48         }
49
50         return "__none__";
51 }
52
53 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
54 {
55         struct ctdb_iface *i;
56
57         /* Verify that we dont have an entry for this ip yet */
58         for (i=ctdb->ifaces;i;i=i->next) {
59                 if (strcmp(i->name, iface) == 0) {
60                         return 0;
61                 }
62         }
63
64         /* create a new structure for this interface */
65         i = talloc_zero(ctdb, struct ctdb_iface);
66         CTDB_NO_MEMORY_FATAL(ctdb, i);
67         i->name = talloc_strdup(i, iface);
68         CTDB_NO_MEMORY(ctdb, i->name);
69         i->link_up = false;
70
71         DLIST_ADD(ctdb->ifaces, i);
72
73         return 0;
74 }
75
76 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
77                                           const char *iface)
78 {
79         struct ctdb_iface *i;
80
81         /* Verify that we dont have an entry for this ip yet */
82         for (i=ctdb->ifaces;i;i=i->next) {
83                 if (strcmp(i->name, iface) == 0) {
84                         return i;
85                 }
86         }
87
88         return NULL;
89 }
90
91 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
92                                               struct ctdb_vnn *vnn)
93 {
94         int i;
95         struct ctdb_iface *cur = NULL;
96         struct ctdb_iface *best = NULL;
97
98         for (i=0; vnn->ifaces[i]; i++) {
99
100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
101                 if (cur == NULL) {
102                         continue;
103                 }
104
105                 if (!cur->link_up) {
106                         continue;
107                 }
108
109                 if (best == NULL) {
110                         best = cur;
111                         continue;
112                 }
113
114                 if (cur->references < best->references) {
115                         best = cur;
116                         continue;
117                 }
118         }
119
120         return best;
121 }
122
123 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
124                                      struct ctdb_vnn *vnn)
125 {
126         struct ctdb_iface *best = NULL;
127
128         if (vnn->iface) {
129                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
130                                    "still assigned to iface '%s'\n",
131                                    ctdb_addr_to_str(&vnn->public_address),
132                                    ctdb_vnn_iface_string(vnn)));
133                 return 0;
134         }
135
136         best = ctdb_vnn_best_iface(ctdb, vnn);
137         if (best == NULL) {
138                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
139                                   "cannot assign to iface any iface\n",
140                                   ctdb_addr_to_str(&vnn->public_address)));
141                 return -1;
142         }
143
144         vnn->iface = best;
145         best->references++;
146         vnn->pnn = ctdb->pnn;
147
148         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
149                            "now assigned to iface '%s' refs[%d]\n",
150                            ctdb_addr_to_str(&vnn->public_address),
151                            ctdb_vnn_iface_string(vnn),
152                            best->references));
153         return 0;
154 }
155
156 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
157                                     struct ctdb_vnn *vnn)
158 {
159         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
160                            "now unassigned (old iface '%s' refs[%d])\n",
161                            ctdb_addr_to_str(&vnn->public_address),
162                            ctdb_vnn_iface_string(vnn),
163                            vnn->iface?vnn->iface->references:0));
164         if (vnn->iface) {
165                 vnn->iface->references--;
166         }
167         vnn->iface = NULL;
168         if (vnn->pnn == ctdb->pnn) {
169                 vnn->pnn = -1;
170         }
171 }
172
173 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
174                                struct ctdb_vnn *vnn)
175 {
176         int i;
177
178         if (vnn->iface && vnn->iface->link_up) {
179                 return true;
180         }
181
182         for (i=0; vnn->ifaces[i]; i++) {
183                 struct ctdb_iface *cur;
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (cur->link_up) {
191                         return true;
192                 }
193         }
194
195         return false;
196 }
197
198 struct ctdb_takeover_arp {
199         struct ctdb_context *ctdb;
200         uint32_t count;
201         ctdb_sock_addr addr;
202         struct ctdb_tcp_array *tcparray;
203         struct ctdb_vnn *vnn;
204 };
205
206
207 /*
208   lists of tcp endpoints
209  */
210 struct ctdb_tcp_list {
211         struct ctdb_tcp_list *prev, *next;
212         struct ctdb_tcp_connection connection;
213 };
214
215 /*
216   list of clients to kill on IP release
217  */
218 struct ctdb_client_ip {
219         struct ctdb_client_ip *prev, *next;
220         struct ctdb_context *ctdb;
221         ctdb_sock_addr addr;
222         uint32_t client_id;
223 };
224
225
226 /*
227   send a gratuitous arp
228  */
229 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
230                                   struct timeval t, void *private_data)
231 {
232         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
233                                                         struct ctdb_takeover_arp);
234         int i, ret;
235         struct ctdb_tcp_array *tcparray;
236         const char *iface = ctdb_vnn_iface_string(arp->vnn);
237
238         ret = ctdb_sys_send_arp(&arp->addr, iface);
239         if (ret != 0) {
240                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
241                                   iface, strerror(errno)));
242         }
243
244         tcparray = arp->tcparray;
245         if (tcparray) {
246                 for (i=0;i<tcparray->num;i++) {
247                         struct ctdb_tcp_connection *tcon;
248
249                         tcon = &tcparray->connections[i];
250                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
251                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
252                                 ctdb_addr_to_str(&tcon->src_addr),
253                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
254                         ret = ctdb_sys_send_tcp(
255                                 &tcon->src_addr, 
256                                 &tcon->dst_addr,
257                                 0, 0, 0);
258                         if (ret != 0) {
259                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
260                                         ctdb_addr_to_str(&tcon->src_addr)));
261                         }
262                 }
263         }
264
265         arp->count++;
266
267         if (arp->count == CTDB_ARP_REPEAT) {
268                 talloc_free(arp);
269                 return;
270         }
271
272         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
273                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
274                         ctdb_control_send_arp, arp);
275 }
276
277 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
278                                        struct ctdb_vnn *vnn)
279 {
280         struct ctdb_takeover_arp *arp;
281         struct ctdb_tcp_array *tcparray;
282
283         if (!vnn->takeover_ctx) {
284                 vnn->takeover_ctx = talloc_new(vnn);
285                 if (!vnn->takeover_ctx) {
286                         return -1;
287                 }
288         }
289
290         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
291         if (!arp) {
292                 return -1;
293         }
294
295         arp->ctdb = ctdb;
296         arp->addr = vnn->public_address;
297         arp->vnn  = vnn;
298
299         tcparray = vnn->tcp_array;
300         if (tcparray) {
301                 /* add all of the known tcp connections for this IP to the
302                    list of tcp connections to send tickle acks for */
303                 arp->tcparray = talloc_steal(arp, tcparray);
304
305                 vnn->tcp_array = NULL;
306                 vnn->tcp_update_needed = true;
307         }
308
309         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
310                         timeval_zero(), ctdb_control_send_arp, arp);
311
312         return 0;
313 }
314
315 struct takeover_callback_state {
316         struct ctdb_req_control *c;
317         ctdb_sock_addr *addr;
318         struct ctdb_vnn *vnn;
319 };
320
/*
  state handed to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* the control request to reply to once the event has finished */
	struct ctdb_req_control *c;
	/* public address entry being taken over */
	struct ctdb_vnn *vnn;
};
325
326 /*
327   called when takeip event finishes
328  */
329 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
330                                     void *private_data)
331 {
332         struct ctdb_do_takeip_state *state =
333                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
334         int32_t ret;
335         TDB_DATA data;
336
337         if (status != 0) {
338                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339         
340                 if (status == -ETIME) {
341                         ctdb_ban_self(ctdb);
342                 }
343                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
344                                  ctdb_addr_to_str(&state->vnn->public_address),
345                                  ctdb_vnn_iface_string(state->vnn)));
346                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347
348                 node->flags |= NODE_FLAGS_UNHEALTHY;
349                 talloc_free(state);
350                 return;
351         }
352
353         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
354         if (ret != 0) {
355                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
356                 talloc_free(state);
357                 return;
358         }
359
360         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
361         data.dsize = strlen((char *)data.dptr) + 1;
362         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
363
364         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
365
366
367         /* the control succeeded */
368         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
369         talloc_free(state);
370         return;
371 }
372
373 /*
374   take over an ip address
375  */
376 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
377                               struct ctdb_req_control *c,
378                               struct ctdb_vnn *vnn)
379 {
380         int ret;
381         struct ctdb_do_takeip_state *state;
382
383         ret = ctdb_vnn_assign_iface(ctdb, vnn);
384         if (ret != 0) {
385                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
386                                  "assin a usable interface\n",
387                                  ctdb_addr_to_str(&vnn->public_address),
388                                  vnn->public_netmask_bits));
389                 return -1;
390         }
391
392         state = talloc(vnn, struct ctdb_do_takeip_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(ctdb, c);
396         state->vnn   = vnn;
397
398         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
399                             ctdb_addr_to_str(&vnn->public_address),
400                             vnn->public_netmask_bits,
401                             ctdb_vnn_iface_string(vnn)));
402
403         ret = ctdb_event_script_callback(ctdb,
404                                          state,
405                                          ctdb_do_takeip_callback,
406                                          state,
407                                          false,
408                                          CTDB_EVENT_TAKE_IP,
409                                          "%s %s %u",
410                                          ctdb_vnn_iface_string(vnn),
411                                          ctdb_addr_to_str(&vnn->public_address),
412                                          vnn->public_netmask_bits);
413
414         if (ret != 0) {
415                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
416                         ctdb_addr_to_str(&vnn->public_address),
417                         ctdb_vnn_iface_string(vnn)));
418                 talloc_free(state);
419                 return -1;
420         }
421
422         return 0;
423 }
424
/*
  state handed to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* the control request to reply to once the event has finished */
	struct ctdb_req_control *c;
	/* interface the address lived on before the move */
	struct ctdb_iface *old;
	/* public address entry being moved */
	struct ctdb_vnn *vnn;
};
430
431 /*
432   called when updateip event finishes
433  */
434 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
435                                       void *private_data)
436 {
437         struct ctdb_do_updateip_state *state =
438                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
439         int32_t ret;
440
441         if (status != 0) {
442                 if (status == -ETIME) {
443                         ctdb_ban_self(ctdb);
444                 }
445                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
446                         ctdb_addr_to_str(&state->vnn->public_address),
447                         state->old->name,
448                         ctdb_vnn_iface_string(state->vnn)));
449
450                 /*
451                  * All we can do is reset the old interface
452                  * and let the next run fix it
453                  */
454                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
455                 state->vnn->iface = state->old;
456                 state->vnn->iface->references++;
457
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459                 talloc_free(state);
460                 return;
461         }
462
463         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
464         if (ret != 0) {
465                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
466                 talloc_free(state);
467                 return;
468         }
469
470         /* the control succeeded */
471         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
472         talloc_free(state);
473         return;
474 }
475
476 /*
477   update (move) an ip address
478  */
479 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
480                                 struct ctdb_req_control *c,
481                                 struct ctdb_vnn *vnn)
482 {
483         int ret;
484         struct ctdb_do_updateip_state *state;
485         struct ctdb_iface *old = vnn->iface;
486         char *new_name;
487
488         ctdb_vnn_unassign_iface(ctdb, vnn);
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
492                                  "assin a usable interface (old iface '%s')\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits,
495                                  old->name));
496                 return -1;
497         }
498
499         new_name = ctdb_vnn_iface_string(vnn);
500         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
501                 /* A benign update from one interface onto itself.
502                  * no need to run the eventscripts in this case, just return
503                  * success.
504                  */
505                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
506                 return 0;
507         }
508
509         state = talloc(vnn, struct ctdb_do_updateip_state);
510         CTDB_NO_MEMORY(ctdb, state);
511
512         state->c = talloc_steal(ctdb, c);
513         state->old = old;
514         state->vnn = vnn;
515
516         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
517                             "interface %s to %s\n",
518                             ctdb_addr_to_str(&vnn->public_address),
519                             vnn->public_netmask_bits,
520                             old->name,
521                             new_name));
522
523         ret = ctdb_event_script_callback(ctdb,
524                                          state,
525                                          ctdb_do_updateip_callback,
526                                          state,
527                                          false,
528                                          CTDB_EVENT_UPDATE_IP,
529                                          "%s %s %s %u",
530                                          state->old->name,
531                                          new_name,
532                                          ctdb_addr_to_str(&vnn->public_address),
533                                          vnn->public_netmask_bits);
534         if (ret != 0) {
535                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
536                                  ctdb_addr_to_str(&vnn->public_address),
537                                  old->name, new_name));
538                 talloc_free(state);
539                 return -1;
540         }
541
542         return 0;
543 }
544
545 /*
546   Find the vnn of the node that has a public ip address
547   returns -1 if the address is not known as a public address
548  */
549 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
550 {
551         struct ctdb_vnn *vnn;
552
553         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
554                 if (ctdb_same_ip(&vnn->public_address, addr)) {
555                         return vnn;
556                 }
557         }
558
559         return NULL;
560 }
561
562 /*
563   take over an ip address
564  */
565 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
566                                  struct ctdb_req_control *c,
567                                  TDB_DATA indata,
568                                  bool *async_reply)
569 {
570         int ret;
571         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
572         struct ctdb_vnn *vnn;
573         bool have_ip = false;
574         bool do_updateip = false;
575         bool do_takeip = false;
576         struct ctdb_iface *best_iface = NULL;
577
578         if (pip->pnn != ctdb->pnn) {
579                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
580                                  "with pnn %d, but we're node %d\n",
581                                  ctdb_addr_to_str(&pip->addr),
582                                  pip->pnn, ctdb->pnn));
583                 return -1;
584         }
585
586         /* update out vnn list */
587         vnn = find_public_ip_vnn(ctdb, &pip->addr);
588         if (vnn == NULL) {
589                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
590                         ctdb_addr_to_str(&pip->addr)));
591                 return 0;
592         }
593
594         have_ip = ctdb_sys_have_ip(&pip->addr);
595         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
596         if (best_iface == NULL) {
597                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
598                                  "a usable interface (old %s, have_ip %d)\n",
599                                  ctdb_addr_to_str(&vnn->public_address),
600                                  vnn->public_netmask_bits,
601                                  ctdb_vnn_iface_string(vnn),
602                                  have_ip));
603                 return -1;
604         }
605
606         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
607                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
608                 have_ip = false;
609         }
610
611         if (vnn->iface == NULL && have_ip) {
612                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
613                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
614                                  ctdb_addr_to_str(&vnn->public_address)));
615                 return 0;
616         }
617
618         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
619                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
620                                   "and we have it on iface[%s], but it was assigned to node %d"
621                                   "and we are node %d, banning ourself\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
624                 ctdb_ban_self(ctdb);
625                 return -1;
626         }
627
628         if (vnn->pnn == -1 && have_ip) {
629                 vnn->pnn = ctdb->pnn;
630                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
631                                   "and we already have it on iface[%s], update local daemon\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                   ctdb_vnn_iface_string(vnn)));
634                 return 0;
635         }
636
637         if (vnn->iface) {
638                 if (vnn->iface->link_up) {
639                         /* only move when the rebalance gains something */
640                         if (vnn->iface->references > (best_iface->references + 1)) {
641                                 do_updateip = true;
642                         }
643                 } else if (vnn->iface != best_iface) {
644                         do_updateip = true;
645                 }
646         }
647
648         if (!have_ip) {
649                 if (do_updateip) {
650                         ctdb_vnn_unassign_iface(ctdb, vnn);
651                         do_updateip = false;
652                 }
653                 do_takeip = true;
654         }
655
656         if (do_takeip) {
657                 ret = ctdb_do_takeip(ctdb, c, vnn);
658                 if (ret != 0) {
659                         return -1;
660                 }
661         } else if (do_updateip) {
662                 ret = ctdb_do_updateip(ctdb, c, vnn);
663                 if (ret != 0) {
664                         return -1;
665                 }
666         } else {
667                 /*
668                  * The interface is up and the kernel known the ip
669                  * => do nothing
670                  */
671                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
672                         ctdb_addr_to_str(&pip->addr),
673                         vnn->public_netmask_bits,
674                         ctdb_vnn_iface_string(vnn)));
675                 return 0;
676         }
677
678         /* tell ctdb_control.c that we will be replying asynchronously */
679         *async_reply = true;
680
681         return 0;
682 }
683
684 /*
685   takeover an ip address old v4 style
686  */
687 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
688                                 struct ctdb_req_control *c,
689                                 TDB_DATA indata, 
690                                 bool *async_reply)
691 {
692         TDB_DATA data;
693         
694         data.dsize = sizeof(struct ctdb_public_ip);
695         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
696         CTDB_NO_MEMORY(ctdb, data.dptr);
697         
698         memcpy(data.dptr, indata.dptr, indata.dsize);
699         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
700 }
701
702 /*
703   kill any clients that are registered with a IP that is being released
704  */
705 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
706 {
707         struct ctdb_client_ip *ip;
708
709         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
710                 ctdb_addr_to_str(addr)));
711
712         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
713                 ctdb_sock_addr tmp_addr;
714
715                 tmp_addr = ip->addr;
716                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
717                         ip->client_id,
718                         ctdb_addr_to_str(&ip->addr)));
719
720                 if (ctdb_same_ip(&tmp_addr, addr)) {
721                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
722                                                                      ip->client_id, 
723                                                                      struct ctdb_client);
724                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
725                                 ip->client_id,
726                                 ctdb_addr_to_str(&ip->addr),
727                                 client->pid));
728
729                         if (client->pid != 0) {
730                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
731                                         (unsigned)client->pid,
732                                         ctdb_addr_to_str(addr),
733                                         ip->client_id));
734                                 kill(client->pid, SIGKILL);
735                         }
736                 }
737         }
738 }
739
740 /*
741   called when releaseip event finishes
742  */
743 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
744                                 void *private_data)
745 {
746         struct takeover_callback_state *state = 
747                 talloc_get_type(private_data, struct takeover_callback_state);
748         TDB_DATA data;
749
750         if (status == -ETIME) {
751                 ctdb_ban_self(ctdb);
752         }
753
754         /* send a message to all clients of this node telling them
755            that the cluster has been reconfigured and they should
756            release any sockets on this IP */
757         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
758         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
759         data.dsize = strlen((char *)data.dptr)+1;
760
761         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
762
763         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
764
765         /* kill clients that have registered with this IP */
766         release_kill_clients(ctdb, state->addr);
767
768         ctdb_vnn_unassign_iface(ctdb, state->vnn);
769
770         /* the control succeeded */
771         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
772         talloc_free(state);
773 }
774
775 /*
776   release an ip address
777  */
778 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
779                                 struct ctdb_req_control *c,
780                                 TDB_DATA indata, 
781                                 bool *async_reply)
782 {
783         int ret;
784         struct takeover_callback_state *state;
785         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
786         struct ctdb_vnn *vnn;
787
788         /* update our vnn list */
789         vnn = find_public_ip_vnn(ctdb, &pip->addr);
790         if (vnn == NULL) {
791                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
792                         ctdb_addr_to_str(&pip->addr)));
793                 return 0;
794         }
795         vnn->pnn = pip->pnn;
796
797         /* stop any previous arps */
798         talloc_free(vnn->takeover_ctx);
799         vnn->takeover_ctx = NULL;
800
801         if (!ctdb_sys_have_ip(&pip->addr)) {
802                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits, 
805                         ctdb_vnn_iface_string(vnn)));
806                 ctdb_vnn_unassign_iface(ctdb, vnn);
807                 return 0;
808         }
809
810         if (vnn->iface == NULL) {
811                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
812                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
813                                  ctdb_addr_to_str(&vnn->public_address)));
814                 return 0;
815         }
816
817         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
818                 ctdb_addr_to_str(&pip->addr),
819                 vnn->public_netmask_bits, 
820                 ctdb_vnn_iface_string(vnn),
821                 pip->pnn));
822
823         state = talloc(ctdb, struct takeover_callback_state);
824         CTDB_NO_MEMORY(ctdb, state);
825
826         state->c = talloc_steal(state, c);
827         state->addr = talloc(state, ctdb_sock_addr);       
828         CTDB_NO_MEMORY(ctdb, state->addr);
829         *state->addr = pip->addr;
830         state->vnn   = vnn;
831
832         ret = ctdb_event_script_callback(ctdb, 
833                                          state, release_ip_callback, state,
834                                          false,
835                                          CTDB_EVENT_RELEASE_IP,
836                                          "%s %s %u",
837                                          ctdb_vnn_iface_string(vnn),
838                                          ctdb_addr_to_str(&pip->addr),
839                                          vnn->public_netmask_bits);
840         if (ret != 0) {
841                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
842                         ctdb_addr_to_str(&pip->addr),
843                         ctdb_vnn_iface_string(vnn)));
844                 talloc_free(state);
845                 return -1;
846         }
847
848         /* tell the control that we will be reply asynchronously */
849         *async_reply = true;
850         return 0;
851 }
852
853 /*
854   release an ip address old v4 style
855  */
856 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
857                                 struct ctdb_req_control *c,
858                                 TDB_DATA indata, 
859                                 bool *async_reply)
860 {
861         TDB_DATA data;
862         
863         data.dsize = sizeof(struct ctdb_public_ip);
864         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
865         CTDB_NO_MEMORY(ctdb, data.dptr);
866         
867         memcpy(data.dptr, indata.dptr, indata.dsize);
868         return ctdb_control_release_ip(ctdb, c, data, async_reply);
869 }
870
871
872 static int ctdb_add_public_address(struct ctdb_context *ctdb,
873                                    ctdb_sock_addr *addr,
874                                    unsigned mask, const char *ifaces)
875 {
876         struct ctdb_vnn      *vnn;
877         uint32_t num = 0;
878         char *tmp;
879         const char *iface;
880         int i;
881         int ret;
882
883         /* Verify that we dont have an entry for this ip yet */
884         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
885                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
886                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
887                                 ctdb_addr_to_str(addr)));
888                         return -1;
889                 }               
890         }
891
892         /* create a new vnn structure for this ip address */
893         vnn = talloc_zero(ctdb, struct ctdb_vnn);
894         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
895         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
896         tmp = talloc_strdup(vnn, ifaces);
897         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
898         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
899                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
900                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
901                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
902                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
903                 num++;
904         }
905         talloc_free(tmp);
906         vnn->ifaces[num] = NULL;
907         vnn->public_address      = *addr;
908         vnn->public_netmask_bits = mask;
909         vnn->pnn                 = -1;
910         if (ctdb_sys_have_ip(addr)) {
911                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
912                 vnn->pnn = ctdb->pnn;
913         }
914
915         for (i=0; vnn->ifaces[i]; i++) {
916                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
917                 if (ret != 0) {
918                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
919                                            "for public_address[%s]\n",
920                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
921                         talloc_free(vnn);
922                         return -1;
923                 }
924                 if (i == 0) {
925                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
926                 }
927         }
928
929         DLIST_ADD(ctdb->vnn, vnn);
930
931         return 0;
932 }
933
934 /*
935   setup the event script directory
936 */
937 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
938 {
939         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
940         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
941         return 0;
942 }
943
944 /*
945   setup the public address lists from a file
946 */
947 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
948 {
949         char **lines;
950         int nlines;
951         int i;
952
953         lines = file_lines_load(alist, &nlines, ctdb);
954         if (lines == NULL) {
955                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
956                 return -1;
957         }
958         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
959                 nlines--;
960         }
961
962         for (i=0;i<nlines;i++) {
963                 unsigned mask;
964                 ctdb_sock_addr addr;
965                 const char *addrstr;
966                 const char *ifaces;
967                 char *tok, *line;
968
969                 line = lines[i];
970                 while ((*line == ' ') || (*line == '\t')) {
971                         line++;
972                 }
973                 if (*line == '#') {
974                         continue;
975                 }
976                 if (strcmp(line, "") == 0) {
977                         continue;
978                 }
979                 tok = strtok(line, " \t");
980                 addrstr = tok;
981                 tok = strtok(NULL, " \t");
982                 if (tok == NULL) {
983                         if (NULL == ctdb->default_public_interface) {
984                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
985                                          i+1));
986                                 talloc_free(lines);
987                                 return -1;
988                         }
989                         ifaces = ctdb->default_public_interface;
990                 } else {
991                         ifaces = tok;
992                 }
993
994                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
995                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
996                         talloc_free(lines);
997                         return -1;
998                 }
999                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
1000                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1001                         talloc_free(lines);
1002                         return -1;
1003                 }
1004         }
1005
1006         talloc_free(lines);
1007         return 0;
1008 }
1009
1010 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1011                               const char *iface,
1012                               const char *ip)
1013 {
1014         struct ctdb_vnn *svnn;
1015         struct ctdb_iface *cur = NULL;
1016         bool ok;
1017         int ret;
1018
1019         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1020         CTDB_NO_MEMORY(ctdb, svnn);
1021
1022         svnn->ifaces = talloc_array(svnn, const char *, 2);
1023         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1024         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1025         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1026         svnn->ifaces[1] = NULL;
1027
1028         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1029         if (!ok) {
1030                 talloc_free(svnn);
1031                 return -1;
1032         }
1033
1034         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1035         if (ret != 0) {
1036                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1037                                    "for single_ip[%s]\n",
1038                                    svnn->ifaces[0],
1039                                    ctdb_addr_to_str(&svnn->public_address)));
1040                 talloc_free(svnn);
1041                 return -1;
1042         }
1043
1044         /* assume the single public ip interface is initially "good" */
1045         cur = ctdb_find_iface(ctdb, iface);
1046         if (cur == NULL) {
1047                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1048                 return -1;
1049         }
1050         cur->link_up = true;
1051
1052         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1053         if (ret != 0) {
1054                 talloc_free(svnn);
1055                 return -1;
1056         }
1057
1058         ctdb->single_ip_vnn = svnn;
1059         return 0;
1060 }
1061
1062 /* Given a physical node, return the number of
1063    public addresses that is currently assigned to this node.
1064 */
1065 static int node_ip_coverage(struct ctdb_context *ctdb, 
1066         int32_t pnn,
1067         struct ctdb_public_ip_list *ips)
1068 {
1069         int num=0;
1070
1071         for (;ips;ips=ips->next) {
1072                 if (ips->pnn == pnn) {
1073                         num++;
1074                 }
1075         }
1076         return num;
1077 }
1078
1079
1080 /* Check if this is a public ip known to the node, i.e. can that
1081    node takeover this ip ?
1082 */
1083 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1084                 struct ctdb_public_ip_list *ip)
1085 {
1086         struct ctdb_all_public_ips *public_ips;
1087         int i;
1088
1089         public_ips = ctdb->nodes[pnn]->available_public_ips;
1090
1091         if (public_ips == NULL) {
1092                 return -1;
1093         }
1094
1095         for (i=0;i<public_ips->num;i++) {
1096                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1097                         /* yes, this node can serve this public ip */
1098                         return 0;
1099                 }
1100         }
1101
1102         return -1;
1103 }
1104
1105
1106 /* search the node lists list for a node to takeover this ip.
1107    pick the node that currently are serving the least number of ips
1108    so that the ips get spread out evenly.
1109 */
1110 static int find_takeover_node(struct ctdb_context *ctdb, 
1111                 struct ctdb_node_map *nodemap, uint32_t mask, 
1112                 struct ctdb_public_ip_list *ip,
1113                 struct ctdb_public_ip_list *all_ips)
1114 {
1115         int pnn, min=0, num;
1116         int i;
1117
1118         pnn    = -1;
1119         for (i=0;i<nodemap->num;i++) {
1120                 if (nodemap->nodes[i].flags & mask) {
1121                         /* This node is not healty and can not be used to serve
1122                            a public address 
1123                         */
1124                         continue;
1125                 }
1126
1127                 /* verify that this node can serve this ip */
1128                 if (can_node_serve_ip(ctdb, i, ip)) {
1129                         /* no it couldnt   so skip to the next node */
1130                         continue;
1131                 }
1132
1133                 num = node_ip_coverage(ctdb, i, all_ips);
1134                 /* was this the first node we checked ? */
1135                 if (pnn == -1) {
1136                         pnn = i;
1137                         min  = num;
1138                 } else {
1139                         if (num < min) {
1140                                 pnn = i;
1141                                 min  = num;
1142                         }
1143                 }
1144         }       
1145         if (pnn == -1) {
1146                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1147                         ctdb_addr_to_str(&ip->addr)));
1148
1149                 return -1;
1150         }
1151
1152         ip->pnn = pnn;
1153         return 0;
1154 }
1155
1156 #define IP_KEYLEN       4
1157 static uint32_t *ip_key(ctdb_sock_addr *ip)
1158 {
1159         static uint32_t key[IP_KEYLEN];
1160
1161         bzero(key, sizeof(key));
1162
1163         switch (ip->sa.sa_family) {
1164         case AF_INET:
1165                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1166                 break;
1167         case AF_INET6:
1168                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1169                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1170                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1171                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1172                 break;
1173         default:
1174                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1175                 return key;
1176         }
1177
1178         return key;
1179 }
1180
1181 static void *add_ip_callback(void *parm, void *data)
1182 {
1183         struct ctdb_public_ip_list *this_ip = parm; 
1184         struct ctdb_public_ip_list *prev_ip = data; 
1185
1186         if (prev_ip == NULL) {
1187                 return parm;
1188         }
1189         if (this_ip->pnn == -1) {
1190                 this_ip->pnn = prev_ip->pnn;
1191         }
1192
1193         return parm;
1194 }
1195
1196 void getips_count_callback(void *param, void *data)
1197 {
1198         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1199         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1200
1201         new_ip->next = *ip_list;
1202         *ip_list     = new_ip;
1203 }
1204
1205 static struct ctdb_public_ip_list *
1206 create_merged_ip_list(struct ctdb_context *ctdb)
1207 {
1208         int i, j;
1209         struct ctdb_public_ip_list *ip_list;
1210         struct ctdb_all_public_ips *public_ips;
1211
1212         if (ctdb->ip_tree != NULL) {
1213                 talloc_free(ctdb->ip_tree);
1214                 ctdb->ip_tree = NULL;
1215         }
1216         ctdb->ip_tree = trbt_create(ctdb, 0);
1217
1218         for (i=0;i<ctdb->num_nodes;i++) {
1219                 public_ips = ctdb->nodes[i]->known_public_ips;
1220
1221                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1222                         continue;
1223                 }
1224
1225                 /* there were no public ips for this node */
1226                 if (public_ips == NULL) {
1227                         continue;
1228                 }               
1229
1230                 for (j=0;j<public_ips->num;j++) {
1231                         struct ctdb_public_ip_list *tmp_ip; 
1232
1233                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1234                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1235                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1236                         tmp_ip->addr = public_ips->ips[j].addr;
1237                         tmp_ip->next = NULL;
1238
1239                         trbt_insertarray32_callback(ctdb->ip_tree,
1240                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1241                                 add_ip_callback,
1242                                 tmp_ip);
1243                 }
1244         }
1245
1246         ip_list = NULL;
1247         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1248
1249         return ip_list;
1250 }
1251
1252 /* 
1253  * This is the length of the longtest common prefix between the IPs.
1254  * It is calculated by XOR-ing the 2 IPs together and counting the
1255  * number of leading zeroes.  The implementation means that all
1256  * addresses end up being 128 bits long.
1257  * Not static, so we can easily link it into a unit test.
1258  *
1259  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1260  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1261  * lots of nodes and IP addresses?
1262  */
1263 uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1264 {
1265         uint32_t ip1_k[IP_KEYLEN];
1266         uint32_t *t;
1267         int i;
1268         uint32_t x;
1269
1270         uint32_t distance = 0;
1271
1272         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1273         t = ip_key(ip2);
1274         for (i=0; i<IP_KEYLEN; i++) {
1275                 x = ip1_k[i] ^ t[i];
1276                 if (x == 0) {
1277                         distance += 32;
1278                 } else {
1279                         /* Count number of leading zeroes. 
1280                          * FIXME? This could be optimised...
1281                          */
1282                         while ((x & (1 << 31)) == 0) {
1283                                 x <<= 1;
1284                                 distance += 1;
1285                         }
1286                 }
1287         }
1288
1289         return distance;
1290 }
1291
1292 /* Calculate the IP distance for the given IP relative to IPs on the
1293    given node.  The ips argument is generally the all_ips variable
1294    used in the main part of the algorithm.
1295  * Not static, so we can easily link it into a unit test.
1296  */
1297 uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1298                            struct ctdb_public_ip_list *ips,
1299                            int pnn)
1300 {
1301         struct ctdb_public_ip_list *t;
1302         uint32_t d;
1303
1304         uint32_t sum = 0;
1305
1306         for (t=ips; t != NULL; t=t->next) {
1307                 if (t->pnn != pnn) {
1308                         continue;
1309                 }
1310
1311                 /* Optimisation: We never calculate the distance
1312                  * between an address and itself.  This allows us to
1313                  * calculate the effect of removing an address from a
1314                  * node by simply calculating the distance between
1315                  * that address and all of the exitsing addresses.
1316                  * Moreover, we assume that we're only ever dealing
1317                  * with addresses from all_ips so we can identify an
1318                  * address via a pointer rather than doing a more
1319                  * expensive address comparison. */
1320                 if (&(t->addr) == ip) {
1321                         continue;
1322                 }
1323
1324                 d = ip_distance(ip, &(t->addr));
1325                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1326         }
1327
1328         return sum;
1329 }
1330
1331 /* Return the LCP2 imbalance metric for addresses currently assigned
1332    to the given node.
1333  * Not static, so we can easily link it into a unit test.
1334  */
1335 uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1336 {
1337         struct ctdb_public_ip_list *t;
1338
1339         uint32_t imbalance = 0;
1340
1341         for (t=all_ips; t!=NULL; t=t->next) {
1342                 if (t->pnn != pnn) {
1343                         continue;
1344                 }
1345                 /* Pass the rest of the IPs rather than the whole
1346                    all_ips input list.
1347                 */
1348                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1349         }
1350
1351         return imbalance;
1352 }
1353
1354 /* Allocate any unassigned IPs just by looping through the IPs and
1355  * finding the best node for each.
1356  * Not static, so we can easily link it into a unit test.
1357  */
1358 void basic_allocate_unassigned(struct ctdb_context *ctdb,
1359                                struct ctdb_node_map *nodemap,
1360                                uint32_t mask,
1361                                struct ctdb_public_ip_list *all_ips)
1362 {
1363         struct ctdb_public_ip_list *tmp_ip;
1364
1365         /* loop over all ip's and find a physical node to cover for 
1366            each unassigned ip.
1367         */
1368         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1369                 if (tmp_ip->pnn == -1) {
1370                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1371                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1372                                         ctdb_addr_to_str(&tmp_ip->addr)));
1373                         }
1374                 }
1375         }
1376 }
1377
1378 /* Basic non-deterministic rebalancing algorithm.
1379  * Not static, so we can easily link it into a unit test.
1380  */
1381 bool basic_failback(struct ctdb_context *ctdb,
1382                     struct ctdb_node_map *nodemap,
1383                     uint32_t mask,
1384                     struct ctdb_public_ip_list *all_ips,
1385                     int num_ips,
1386                     int *retries)
1387 {
1388         int i;
1389         int maxnode, maxnum=0, minnode, minnum=0, num;
1390         struct ctdb_public_ip_list *tmp_ip;
1391
1392         /* for each ip address, loop over all nodes that can serve
1393            this ip and make sure that the difference between the node
1394            serving the most and the node serving the least ip's are
1395            not greater than 1.
1396         */
1397         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1398                 if (tmp_ip->pnn == -1) {
1399                         continue;
1400                 }
1401
1402                 /* Get the highest and lowest number of ips's served by any 
1403                    valid node which can serve this ip.
1404                 */
1405                 maxnode = -1;
1406                 minnode = -1;
1407                 for (i=0;i<nodemap->num;i++) {
1408                         if (nodemap->nodes[i].flags & mask) {
1409                                 continue;
1410                         }
1411
1412                         /* only check nodes that can actually serve this ip */
1413                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1414                                 /* no it couldnt   so skip to the next node */
1415                                 continue;
1416                         }
1417
1418                         num = node_ip_coverage(ctdb, i, all_ips);
1419                         if (maxnode == -1) {
1420                                 maxnode = i;
1421                                 maxnum  = num;
1422                         } else {
1423                                 if (num > maxnum) {
1424                                         maxnode = i;
1425                                         maxnum  = num;
1426                                 }
1427                         }
1428                         if (minnode == -1) {
1429                                 minnode = i;
1430                                 minnum  = num;
1431                         } else {
1432                                 if (num < minnum) {
1433                                         minnode = i;
1434                                         minnum  = num;
1435                                 }
1436                         }
1437                 }
1438                 if (maxnode == -1) {
1439                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1440                                 ctdb_addr_to_str(&tmp_ip->addr)));
1441
1442                         continue;
1443                 }
1444
1445                 /* If we want deterministic IPs then dont try to reallocate 
1446                    them to spread out the load.
1447                 */
1448                 if (1 == ctdb->tunable.deterministic_public_ips) {
1449                         continue;
1450                 }
1451
1452                 /* if the spread between the smallest and largest coverage by
1453                    a node is >=2 we steal one of the ips from the node with
1454                    most coverage to even things out a bit.
1455                    try to do this a limited number of times since we dont
1456                    want to spend too much time balancing the ip coverage.
1457                 */
1458                 if ( (maxnum > minnum+1)
1459                      && (*retries < (num_ips + 5)) ){
1460                         struct ctdb_public_ip_list *tmp;
1461
1462                         /* mark one of maxnode's vnn's as unassigned and try
1463                            again
1464                         */
1465                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1466                                 if (tmp->pnn == maxnode) {
1467                                         tmp->pnn = -1;
1468                                         (*retries)++;
1469                                         return true;
1470                                 }
1471                         }
1472                 }
1473         }
1474
1475         return false;
1476 }
1477
1478 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1479  * that we can unit test it.
1480  * Not static, so we can easily link it into a unit test.
1481  */
1482 void lcp2_init(struct ctdb_context * tmp_ctx,
1483                struct ctdb_node_map * nodemap,
1484                uint32_t mask,
1485                struct ctdb_public_ip_list *all_ips,
1486                uint32_t **lcp2_imbalances,
1487                bool **newly_healthy)
1488 {
1489         int i;
1490         struct ctdb_public_ip_list *tmp_ip;
1491
1492         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1493         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1494         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1495         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1496
1497         for (i=0;i<nodemap->num;i++) {
1498                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1499                 /* First step: is the node "healthy"? */
1500                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1501         }
1502
1503         /* 2nd step: if a ndoe has IPs assigned then it must have been
1504          * healthy before, so we remove it from consideration... */
1505         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1506                 if (tmp_ip->pnn != -1) {
1507                         (*newly_healthy)[tmp_ip->pnn] = false;
1508                 }
1509         }
1510 }
1511
1512 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1513  * the IP/node combination that will cost the least.
1514  * Not static, so we can easily link it into a unit test.
1515  */
1516 void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1517                               struct ctdb_node_map *nodemap,
1518                               uint32_t mask,
1519                               struct ctdb_public_ip_list *all_ips,
1520                               uint32_t *lcp2_imbalances)
1521 {
1522         struct ctdb_public_ip_list *tmp_ip;
1523         int dstnode;
1524
1525         int minnode;
1526         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1527         struct ctdb_public_ip_list *minip;
1528
1529         bool should_loop = true;
1530         bool have_unassigned = true;
1531
1532         while (have_unassigned && should_loop) {
1533                 should_loop = false;
1534
1535                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1536                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1537
1538                 minnode = -1;
1539                 mindsum = 0;
1540                 minip = NULL;
1541
1542                 /* loop over each unassigned ip. */
1543                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1544                         if (tmp_ip->pnn != -1) {
1545                                 continue;
1546                         }
1547
1548                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1549                                 /* only check nodes that can actually serve this ip */
1550                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1551                                         /* no it couldnt   so skip to the next node */
1552                                         continue;
1553                                 }
1554                                 if (nodemap->nodes[dstnode].flags & mask) {
1555                                         continue;
1556                                 }
1557
1558                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1559                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1560                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1561                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1562                                                    dstnode,
1563                                                    dstimbl - lcp2_imbalances[dstnode]));
1564
1565
1566                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1567                                         minnode = dstnode;
1568                                         minimbl = dstimbl;
1569                                         mindsum = dstdsum;
1570                                         minip = tmp_ip;
1571                                         should_loop = true;
1572                                 }
1573                         }
1574                 }
1575
1576                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1577
1578                 /* If we found one then assign it to the given node. */
1579                 if (minnode != -1) {
1580                         minip->pnn = minnode;
1581                         lcp2_imbalances[minnode] = minimbl;
1582                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1583                                           ctdb_addr_to_str(&(minip->addr)),
1584                                           minnode,
1585                                           mindsum));
1586                 }
1587
1588                 /* There might be a better way but at least this is clear. */
1589                 have_unassigned = false;
1590                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1591                         if (tmp_ip->pnn == -1) {
1592                                 have_unassigned = true;
1593                         }
1594                 }
1595         }
1596
1597         /* We know if we have an unassigned addresses so we might as
1598          * well optimise.
1599          */
1600         if (have_unassigned) {
1601                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1602                         if (tmp_ip->pnn == -1) {
1603                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1604                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1605                         }
1606                 }
1607         }
1608 }
1609
1610 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1611  * to move IPs from, determines the best IP/destination node
1612  * combination to move from the source node.
1613  *
1614  * Not static, so we can easily link it into a unit test.
1615  */
1616 bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1617                              struct ctdb_node_map *nodemap,
1618                              struct ctdb_public_ip_list *all_ips,
1619                              int srcnode,
1620                              uint32_t candimbl,
1621                              uint32_t *lcp2_imbalances,
1622                              bool *newly_healthy)
1623 {
1624         int dstnode, mindstnode;
1625         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1626         uint32_t minsrcimbl, mindstimbl;
1627         struct ctdb_public_ip_list *minip;
1628         struct ctdb_public_ip_list *tmp_ip;
1629
1630         /* Find an IP and destination node that best reduces imbalance. */
1631         minip = NULL;
1632         minsrcimbl = 0;
1633         mindstnode = -1;
1634         mindstimbl = 0;
1635
1636         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1637         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1638
1639         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1640                 /* Only consider addresses on srcnode. */
1641                 if (tmp_ip->pnn != srcnode) {
1642                         continue;
1643                 }
1644
1645                 /* What is this IP address costing the source node? */
1646                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1647                 srcimbl = candimbl - srcdsum;
1648
1649                 /* Consider this IP address would cost each potential
1650                  * destination node.  Destination nodes are limited to
1651                  * those that are newly healthy, since we don't want
1652                  * to do gratuitous failover of IPs just to make minor
1653                  * balance improvements.
1654                  */
1655                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1656                         if (! newly_healthy[dstnode]) {
1657                                 continue;
1658                         }
1659                         /* only check nodes that can actually serve this ip */
1660                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1661                                 /* no it couldnt   so skip to the next node */
1662                                 continue;
1663                         }
1664
1665                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1666                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1667                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1668                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1669                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1670                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1671
1672                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1673                             ((mindstnode == -1) ||                              \
1674                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1675
1676                                 minip = tmp_ip;
1677                                 minsrcimbl = srcimbl;
1678                                 mindstnode = dstnode;
1679                                 mindstimbl = dstimbl;
1680                         }
1681                 }
1682         }
1683         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1684
1685         if (mindstnode != -1) {
1686                 /* We found a move that makes things better... */
1687                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1688                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1689                                   ctdb_addr_to_str(&(minip->addr)),
1690                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1691
1692
1693                 lcp2_imbalances[srcnode] = srcimbl;
1694                 lcp2_imbalances[mindstnode] = mindstimbl;
1695                 minip->pnn = mindstnode;
1696
1697                 return true;
1698         }
1699
1700         return false;
1701         
1702 }
1703
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparison function for struct lcp2_imbalance_pnn.  Orders
 * entries by DESCENDING imbalance: returns -1 if a has the larger
 * imbalance, 1 if b has, and 0 if they are equal.
 */
int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn * lhs = a;
        const struct lcp2_imbalance_pnn * rhs = b;

        /* Compare rather than subtract: the imbalances are unsigned
         * so a difference could not be returned safely as an int.
         */
        return (rhs->imbalance > lhs->imbalance) -
               (rhs->imbalance < lhs->imbalance);
}
1722
1723 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1724  * node with the highest LCP2 imbalance, and then determines the best
1725  * IP/destination node combination to move from the source node.
1726  *
1727  * Not static, so we can easily link it into a unit test.
1728  */
1729 bool lcp2_failback(struct ctdb_context *ctdb,
1730                    struct ctdb_node_map *nodemap,
1731                    uint32_t mask,
1732                    struct ctdb_public_ip_list *all_ips,
1733                    uint32_t *lcp2_imbalances,
1734                    bool *newly_healthy)
1735 {
1736         int i, num_newly_healthy;
1737         struct lcp2_imbalance_pnn * lips;
1738         bool ret;
1739
1740         /* It is only worth continuing if we have suitable target
1741          * nodes to transfer IPs to.  This check is much cheaper than
1742          * continuing on...
1743          */
1744         num_newly_healthy = 0;
1745         for (i = 0; i < nodemap->num; i++) {
1746                 if (newly_healthy[i]) {
1747                         num_newly_healthy++;
1748                 }
1749         }
1750         if (num_newly_healthy == 0) {
1751                 return false;
1752         }
1753
1754         /* Put the imbalances and nodes into an array, sort them and
1755          * iterate through candidates.  Usually the 1st one will be
1756          * used, so this doesn't cost much...
1757          */
1758         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
1759         for (i = 0; i < nodemap->num; i++) {
1760                 lips[i].imbalance = lcp2_imbalances[i];
1761                 lips[i].pnn = i;
1762         }
1763         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
1764               lcp2_cmp_imbalance_pnn);
1765
1766         ret = false;
1767         for (i = 0; i < nodemap->num; i++) {
1768                 /* This means that all nodes had 0 or 1 addresses, so
1769                  * can't be imbalanced.
1770                  */
1771                 if (lips[i].imbalance == 0) {
1772                         break;
1773                 }
1774
1775                 if (lcp2_failback_candidate(ctdb,
1776                                             nodemap,
1777                                             all_ips,
1778                                             lips[i].pnn,
1779                                             lips[i].imbalance,
1780                                             lcp2_imbalances,
1781                                             newly_healthy)) {
1782                         ret = true;
1783                         break;
1784                 }
1785         }
1786
1787         talloc_free(lips);
1788         return ret;
1789 }
1790
1791 /* The calculation part of the IP allocation algorithm.
1792  * Not static, so we can easily link it into a unit test.
1793  */
1794 void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1795                             struct ctdb_node_map *nodemap,
1796                             struct ctdb_public_ip_list **all_ips_p)
1797 {
1798         int i, num_healthy, retries, num_ips;
1799         uint32_t mask;
1800         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1801         uint32_t *lcp2_imbalances;
1802         bool *newly_healthy;
1803
1804         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1805
1806         /* Count how many completely healthy nodes we have */
1807         num_healthy = 0;
1808         for (i=0;i<nodemap->num;i++) {
1809                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1810                         num_healthy++;
1811                 }
1812         }
1813
1814         if (num_healthy > 0) {
1815                 /* We have healthy nodes, so only consider them for 
1816                    serving public addresses
1817                 */
1818                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1819         } else {
1820                 /* We didnt have any completely healthy nodes so
1821                    use "disabled" nodes as a fallback
1822                 */
1823                 mask = NODE_FLAGS_INACTIVE;
1824         }
1825
1826         /* since nodes only know about those public addresses that
1827            can be served by that particular node, no single node has
1828            a full list of all public addresses that exist in the cluster.
1829            Walk over all node structures and create a merged list of
1830            all public addresses that exist in the cluster.
1831
1832            keep the tree of ips around as ctdb->ip_tree
1833         */
1834         all_ips = create_merged_ip_list(ctdb);
1835         *all_ips_p = all_ips; /* minimal code changes */
1836
1837         /* Count how many ips we have */
1838         num_ips = 0;
1839         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1840                 num_ips++;
1841         }
1842
1843         /* If we want deterministic ip allocations, i.e. that the ip addresses
1844            will always be allocated the same way for a specific set of
1845            available/unavailable nodes.
1846         */
1847         if (1 == ctdb->tunable.deterministic_public_ips) {              
1848                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1849                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1850                         tmp_ip->pnn = i%nodemap->num;
1851                 }
1852         }
1853
1854
1855         /* mark all public addresses with a masked node as being served by
1856            node -1
1857         */
1858         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1859                 if (tmp_ip->pnn == -1) {
1860                         continue;
1861                 }
1862                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1863                         tmp_ip->pnn = -1;
1864                 }
1865         }
1866
1867         /* verify that the assigned nodes can serve that public ip
1868            and set it to -1 if not
1869         */
1870         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1871                 if (tmp_ip->pnn == -1) {
1872                         continue;
1873                 }
1874                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1875                         /* this node can not serve this ip. */
1876                         tmp_ip->pnn = -1;
1877                 }
1878         }
1879
1880         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1881                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
1882         }
1883
1884         /* now we must redistribute all public addresses with takeover node
1885            -1 among the nodes available
1886         */
1887         retries = 0;
1888 try_again:
1889         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1890                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
1891         } else {
1892                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
1893         }
1894
1895         /* If we dont want ips to fail back after a node becomes healthy
1896            again, we wont even try to reallocat the ip addresses so that
1897            they are evenly spread out.
1898            This can NOT be used at the same time as DeterministicIPs !
1899         */
1900         if (1 == ctdb->tunable.no_ip_failback) {
1901                 if (1 == ctdb->tunable.deterministic_public_ips) {
1902                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1903                 }
1904                 goto finished;
1905         }
1906
1907
1908         /* now, try to make sure the ip adresses are evenly distributed
1909            across the node.
1910         */
1911         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1912                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
1913                         goto try_again;
1914                 }
1915         } else {
1916                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
1917                         goto try_again;
1918                 }
1919         }
1920
1921         /* finished distributing the public addresses, now just send the 
1922            info out to the nodes
1923         */
1924 finished:
1925
1926         /* at this point ->pnn is the node which will own each IP
1927            or -1 if there is no node that can cover this ip
1928         */
1929
1930         return;
1931 }
1932
1933 /*
1934   make any IP alias changes for public addresses that are necessary 
1935  */
1936 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
1937                       client_async_callback fail_callback, void *callback_data)
1938 {
1939         int i;
1940         struct ctdb_public_ip ip;
1941         struct ctdb_public_ipv4 ipv4;
1942         uint32_t *nodes;
1943         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1944         TDB_DATA data;
1945         struct timeval timeout;
1946         struct client_async_data *async_data;
1947         struct ctdb_client_control_state *state;
1948         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1949
1950         /*
1951          * ip failover is completely disabled, just send out the 
1952          * ipreallocated event.
1953          */
1954         if (ctdb->tunable.disable_ip_failover != 0) {
1955                 goto ipreallocated;
1956         }
1957
1958         ZERO_STRUCT(ip);
1959
1960         /* Do the IP reassignment calculations */
1961         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
1962
1963         /* now tell all nodes to delete any alias that they should not
1964            have.  This will be a NOOP on nodes that don't currently
1965            hold the given alias */
1966         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1967         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1968
1969         async_data->fail_callback = fail_callback;
1970         async_data->callback_data = callback_data;
1971
1972         for (i=0;i<nodemap->num;i++) {
1973                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1974                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1975                         continue;
1976                 }
1977
1978                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1979                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1980                                 /* This node should be serving this
1981                                    vnn so dont tell it to release the ip
1982                                 */
1983                                 continue;
1984                         }
1985                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1986                                 ipv4.pnn = tmp_ip->pnn;
1987                                 ipv4.sin = tmp_ip->addr.ip;
1988
1989                                 timeout = TAKEOVER_TIMEOUT();
1990                                 data.dsize = sizeof(ipv4);
1991                                 data.dptr  = (uint8_t *)&ipv4;
1992                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1993                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1994                                                 data, async_data,
1995                                                 &timeout, NULL);
1996                         } else {
1997                                 ip.pnn  = tmp_ip->pnn;
1998                                 ip.addr = tmp_ip->addr;
1999
2000                                 timeout = TAKEOVER_TIMEOUT();
2001                                 data.dsize = sizeof(ip);
2002                                 data.dptr  = (uint8_t *)&ip;
2003                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2004                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2005                                                 data, async_data,
2006                                                 &timeout, NULL);
2007                         }
2008
2009                         if (state == NULL) {
2010                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2011                                 talloc_free(tmp_ctx);
2012                                 return -1;
2013                         }
2014                 
2015                         ctdb_client_async_add(async_data, state);
2016                 }
2017         }
2018         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2019                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2020                 talloc_free(tmp_ctx);
2021                 return -1;
2022         }
2023         talloc_free(async_data);
2024
2025
2026         /* tell all nodes to get their own IPs */
2027         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2028         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2029
2030         async_data->fail_callback = fail_callback;
2031         async_data->callback_data = callback_data;
2032
2033         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2034                 if (tmp_ip->pnn == -1) {
2035                         /* this IP won't be taken over */
2036                         continue;
2037                 }
2038
2039                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2040                         ipv4.pnn = tmp_ip->pnn;
2041                         ipv4.sin = tmp_ip->addr.ip;
2042
2043                         timeout = TAKEOVER_TIMEOUT();
2044                         data.dsize = sizeof(ipv4);
2045                         data.dptr  = (uint8_t *)&ipv4;
2046                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2047                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2048                                         data, async_data,
2049                                         &timeout, NULL);
2050                 } else {
2051                         ip.pnn  = tmp_ip->pnn;
2052                         ip.addr = tmp_ip->addr;
2053
2054                         timeout = TAKEOVER_TIMEOUT();
2055                         data.dsize = sizeof(ip);
2056                         data.dptr  = (uint8_t *)&ip;
2057                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2058                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2059                                         data, async_data,
2060                                         &timeout, NULL);
2061                 }
2062                 if (state == NULL) {
2063                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2064                         talloc_free(tmp_ctx);
2065                         return -1;
2066                 }
2067                 
2068                 ctdb_client_async_add(async_data, state);
2069         }
2070         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2071                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2072                 talloc_free(tmp_ctx);
2073                 return -1;
2074         }
2075
2076 ipreallocated:
2077         /* tell all nodes to update natwg */
2078         /* send the flags update natgw on all connected nodes */
2079         data.dptr  = discard_const("ipreallocated");
2080         data.dsize = strlen((char *)data.dptr) + 1; 
2081         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2082         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2083                                       nodes, 0, TAKEOVER_TIMEOUT(),
2084                                       false, data,
2085                                       NULL, fail_callback,
2086                                       callback_data) != 0) {
2087                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2088         }
2089
2090         talloc_free(tmp_ctx);
2091         return 0;
2092 }
2093
2094
2095 /*
2096   destroy a ctdb_client_ip structure
2097  */
2098 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2099 {
2100         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2101                 ctdb_addr_to_str(&ip->addr),
2102                 ntohs(ip->addr.ip.sin_port),
2103                 ip->client_id));
2104
2105         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2106         return 0;
2107 }
2108
2109 /*
2110   called by a client to inform us of a TCP connection that it is managing
2111   that should tickled with an ACK when IP takeover is done
2112   we handle both the old ipv4 style of packets as well as the new ipv4/6
2113   pdus.
2114  */
2115 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2116                                 TDB_DATA indata)
2117 {
2118         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2119         struct ctdb_control_tcp *old_addr = NULL;
2120         struct ctdb_control_tcp_addr new_addr;
2121         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2122         struct ctdb_tcp_list *tcp;
2123         struct ctdb_tcp_connection t;
2124         int ret;
2125         TDB_DATA data;
2126         struct ctdb_client_ip *ip;
2127         struct ctdb_vnn *vnn;
2128         ctdb_sock_addr addr;
2129
2130         switch (indata.dsize) {
2131         case sizeof(struct ctdb_control_tcp):
2132                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2133                 ZERO_STRUCT(new_addr);
2134                 tcp_sock = &new_addr;
2135                 tcp_sock->src.ip  = old_addr->src;
2136                 tcp_sock->dest.ip = old_addr->dest;
2137                 break;
2138         case sizeof(struct ctdb_control_tcp_addr):
2139                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2140                 break;
2141         default:
2142                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2143                                  "to ctdb_control_tcp_client. size was %d but "
2144                                  "only allowed sizes are %lu and %lu\n",
2145                                  (int)indata.dsize,
2146                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2147                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2148                 return -1;
2149         }
2150
2151         addr = tcp_sock->src;
2152         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2153         addr = tcp_sock->dest;
2154         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2155
2156         ZERO_STRUCT(addr);
2157         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2158         vnn = find_public_ip_vnn(ctdb, &addr);
2159         if (vnn == NULL) {
2160                 switch (addr.sa.sa_family) {
2161                 case AF_INET:
2162                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2163                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2164                                         ctdb_addr_to_str(&addr)));
2165                         }
2166                         break;
2167                 case AF_INET6:
2168                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2169                                 ctdb_addr_to_str(&addr)));
2170                         break;
2171                 default:
2172                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2173                 }
2174
2175                 return 0;
2176         }
2177
2178         if (vnn->pnn != ctdb->pnn) {
2179                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2180                         ctdb_addr_to_str(&addr),
2181                         client_id, client->pid));
2182                 /* failing this call will tell smbd to die */
2183                 return -1;
2184         }
2185
2186         ip = talloc(client, struct ctdb_client_ip);
2187         CTDB_NO_MEMORY(ctdb, ip);
2188
2189         ip->ctdb      = ctdb;
2190         ip->addr      = addr;
2191         ip->client_id = client_id;
2192         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2193         DLIST_ADD(ctdb->client_ip_list, ip);
2194
2195         tcp = talloc(client, struct ctdb_tcp_list);
2196         CTDB_NO_MEMORY(ctdb, tcp);
2197
2198         tcp->connection.src_addr = tcp_sock->src;
2199         tcp->connection.dst_addr = tcp_sock->dest;
2200
2201         DLIST_ADD(client->tcp_list, tcp);
2202
2203         t.src_addr = tcp_sock->src;
2204         t.dst_addr = tcp_sock->dest;
2205
2206         data.dptr = (uint8_t *)&t;
2207         data.dsize = sizeof(t);
2208
2209         switch (addr.sa.sa_family) {
2210         case AF_INET:
2211                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2212                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2213                         ctdb_addr_to_str(&tcp_sock->src),
2214                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2215                 break;
2216         case AF_INET6:
2217                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2218                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2219                         ctdb_addr_to_str(&tcp_sock->src),
2220                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2221                 break;
2222         default:
2223                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2224         }
2225
2226
2227         /* tell all nodes about this tcp connection */
2228         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2229                                        CTDB_CONTROL_TCP_ADD,
2230                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2231         if (ret != 0) {
2232                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2233                 return -1;
2234         }
2235
2236         return 0;
2237 }
2238
2239 /*
2240   find a tcp address on a list
2241  */
2242 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2243                                            struct ctdb_tcp_connection *tcp)
2244 {
2245         int i;
2246
2247         if (array == NULL) {
2248                 return NULL;
2249         }
2250
2251         for (i=0;i<array->num;i++) {
2252                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2253                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2254                         return &array->connections[i];
2255                 }
2256         }
2257         return NULL;
2258 }
2259
2260
2261
2262 /*
2263   called by a daemon to inform us of a TCP connection that one of its
2264   clients managing that should tickled with an ACK when IP takeover is
2265   done
2266  */
2267 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2268 {
2269         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2270         struct ctdb_tcp_array *tcparray;
2271         struct ctdb_tcp_connection tcp;
2272         struct ctdb_vnn *vnn;
2273
2274         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2275         if (vnn == NULL) {
2276                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2277                         ctdb_addr_to_str(&p->dst_addr)));
2278
2279                 return -1;
2280         }
2281
2282
2283         tcparray = vnn->tcp_array;
2284
2285         /* If this is the first tickle */
2286         if (tcparray == NULL) {
2287                 tcparray = talloc_size(ctdb->nodes, 
2288                         offsetof(struct ctdb_tcp_array, connections) +
2289                         sizeof(struct ctdb_tcp_connection) * 1);
2290                 CTDB_NO_MEMORY(ctdb, tcparray);
2291                 vnn->tcp_array = tcparray;
2292
2293                 tcparray->num = 0;
2294                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2295                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2296
2297                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2298                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2299                 tcparray->num++;
2300
2301                 if (tcp_update_needed) {
2302                         vnn->tcp_update_needed = true;
2303                 }
2304                 return 0;
2305         }
2306
2307
2308         /* Do we already have this tickle ?*/
2309         tcp.src_addr = p->src_addr;
2310         tcp.dst_addr = p->dst_addr;
2311         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2312                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2313                         ctdb_addr_to_str(&tcp.dst_addr),
2314                         ntohs(tcp.dst_addr.ip.sin_port),
2315                         vnn->pnn));
2316                 return 0;
2317         }
2318
2319         /* A new tickle, we must add it to the array */
2320         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2321                                         struct ctdb_tcp_connection,
2322                                         tcparray->num+1);
2323         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2324
2325         vnn->tcp_array = tcparray;
2326         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2327         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2328         tcparray->num++;
2329                                 
2330         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2331                 ctdb_addr_to_str(&tcp.dst_addr),
2332                 ntohs(tcp.dst_addr.ip.sin_port),
2333                 vnn->pnn));
2334
2335         if (tcp_update_needed) {
2336                 vnn->tcp_update_needed = true;
2337         }
2338
2339         return 0;
2340 }
2341
2342
2343 /*
2344   called by a daemon to inform us of a TCP connection that one of its
2345   clients managing that should tickled with an ACK when IP takeover is
2346   done
2347  */
2348 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2349 {
2350         struct ctdb_tcp_connection *tcpp;
2351         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2352
2353         if (vnn == NULL) {
2354                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2355                         ctdb_addr_to_str(&conn->dst_addr)));
2356                 return;
2357         }
2358
2359         /* if the array is empty we cant remove it
2360            and we dont need to do anything
2361          */
2362         if (vnn->tcp_array == NULL) {
2363                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2364                         ctdb_addr_to_str(&conn->dst_addr),
2365                         ntohs(conn->dst_addr.ip.sin_port)));
2366                 return;
2367         }
2368
2369
2370         /* See if we know this connection
2371            if we dont know this connection  then we dont need to do anything
2372          */
2373         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2374         if (tcpp == NULL) {
2375                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2376                         ctdb_addr_to_str(&conn->dst_addr),
2377                         ntohs(conn->dst_addr.ip.sin_port)));
2378                 return;
2379         }
2380
2381
2382         /* We need to remove this entry from the array.
2383            Instead of allocating a new array and copying data to it
2384            we cheat and just copy the last entry in the existing array
2385            to the entry that is to be removed and just shring the 
2386            ->num field
2387          */
2388         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2389         vnn->tcp_array->num--;
2390
2391         /* If we deleted the last entry we also need to remove the entire array
2392          */
2393         if (vnn->tcp_array->num == 0) {
2394                 talloc_free(vnn->tcp_array);
2395                 vnn->tcp_array = NULL;
2396         }               
2397
2398         vnn->tcp_update_needed = true;
2399
2400         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2401                 ctdb_addr_to_str(&conn->src_addr),
2402                 ntohs(conn->src_addr.ip.sin_port)));
2403 }
2404
2405
2406 /*
2407   called by a daemon to inform us of a TCP connection that one of its
2408   clients used are no longer needed in the tickle database
2409  */
2410 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2411 {
2412         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2413
2414         ctdb_remove_tcp_connection(ctdb, conn);
2415
2416         return 0;
2417 }
2418
2419
/*
  called when a daemon restarts - intended to send all tickles for all
  public addresses we are serving immediately to the new node.

  Currently a stub: nothing is sent, the vnn argument is unused and
  0 is always returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
/*XXX here we should send all tickles we are serving to the new node */
        return 0;
}
2429
2430
2431 /*
2432   called when a client structure goes away - hook to remove
2433   elements from the tcp_list in all daemons
2434  */
2435 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2436 {
2437         while (client->tcp_list) {
2438                 struct ctdb_tcp_list *tcp = client->tcp_list;
2439                 DLIST_REMOVE(client->tcp_list, tcp);
2440                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2441         }
2442 }
2443
2444
2445 /*
2446   release all IPs on shutdown
2447  */
2448 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2449 {
2450         struct ctdb_vnn *vnn;
2451
2452         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2453                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2454                         ctdb_vnn_unassign_iface(ctdb, vnn);
2455                         continue;
2456                 }
2457                 if (!vnn->iface) {
2458                         continue;
2459                 }
2460                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2461                                   ctdb_vnn_iface_string(vnn),
2462                                   ctdb_addr_to_str(&vnn->public_address),
2463                                   vnn->public_netmask_bits);
2464                 release_kill_clients(ctdb, &vnn->public_address);
2465                 ctdb_vnn_unassign_iface(ctdb, vnn);
2466         }
2467 }
2468
2469
2470 /*
2471   get list of public IPs
2472  */
2473 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2474                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2475 {
2476         int i, num, len;
2477         struct ctdb_all_public_ips *ips;
2478         struct ctdb_vnn *vnn;
2479         bool only_available = false;
2480
2481         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2482                 only_available = true;
2483         }
2484
2485         /* count how many public ip structures we have */
2486         num = 0;
2487         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2488                 num++;
2489         }
2490
2491         len = offsetof(struct ctdb_all_public_ips, ips) + 
2492                 num*sizeof(struct ctdb_public_ip);
2493         ips = talloc_zero_size(outdata, len);
2494         CTDB_NO_MEMORY(ctdb, ips);
2495
2496         i = 0;
2497         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2498                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2499                         continue;
2500                 }
2501                 ips->ips[i].pnn  = vnn->pnn;
2502                 ips->ips[i].addr = vnn->public_address;
2503                 i++;
2504         }
2505         ips->num = i;
2506         len = offsetof(struct ctdb_all_public_ips, ips) +
2507                 i*sizeof(struct ctdb_public_ip);
2508
2509         outdata->dsize = len;
2510         outdata->dptr  = (uint8_t *)ips;
2511
2512         return 0;
2513 }
2514
2515
2516 /*
2517   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2518  */
2519 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2520                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2521 {
2522         int i, num, len;
2523         struct ctdb_all_public_ipsv4 *ips;
2524         struct ctdb_vnn *vnn;
2525
2526         /* count how many public ip structures we have */
2527         num = 0;
2528         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2529                 if (vnn->public_address.sa.sa_family != AF_INET) {
2530                         continue;
2531                 }
2532                 num++;
2533         }
2534
2535         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2536                 num*sizeof(struct ctdb_public_ipv4);
2537         ips = talloc_zero_size(outdata, len);
2538         CTDB_NO_MEMORY(ctdb, ips);
2539
2540         outdata->dsize = len;
2541         outdata->dptr  = (uint8_t *)ips;
2542
2543         ips->num = num;
2544         i = 0;
2545         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2546                 if (vnn->public_address.sa.sa_family != AF_INET) {
2547                         continue;
2548                 }
2549                 ips->ips[i].pnn = vnn->pnn;
2550                 ips->ips[i].sin = vnn->public_address.ip;
2551                 i++;
2552         }
2553
2554         return 0;
2555 }
2556
2557 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2558                                         struct ctdb_req_control *c,
2559                                         TDB_DATA indata,
2560                                         TDB_DATA *outdata)
2561 {
2562         int i, num, len;
2563         ctdb_sock_addr *addr;
2564         struct ctdb_control_public_ip_info *info;
2565         struct ctdb_vnn *vnn;
2566
2567         addr = (ctdb_sock_addr *)indata.dptr;
2568
2569         vnn = find_public_ip_vnn(ctdb, addr);
2570         if (vnn == NULL) {
2571                 /* if it is not a public ip   it could be our 'single ip' */
2572                 if (ctdb->single_ip_vnn) {
2573                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2574                                 vnn = ctdb->single_ip_vnn;
2575                         }
2576                 }
2577         }
2578         if (vnn == NULL) {
2579                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2580                                  "'%s'not a public address\n",
2581                                  ctdb_addr_to_str(addr)));
2582                 return -1;
2583         }
2584
2585         /* count how many public ip structures we have */
2586         num = 0;
2587         for (;vnn->ifaces[num];) {
2588                 num++;
2589         }
2590
2591         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2592                 num*sizeof(struct ctdb_control_iface_info);
2593         info = talloc_zero_size(outdata, len);
2594         CTDB_NO_MEMORY(ctdb, info);
2595
2596         info->ip.addr = vnn->public_address;
2597         info->ip.pnn = vnn->pnn;
2598         info->active_idx = 0xFFFFFFFF;
2599
2600         for (i=0; vnn->ifaces[i]; i++) {
2601                 struct ctdb_iface *cur;
2602
2603                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2604                 if (cur == NULL) {
2605                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2606                                            vnn->ifaces[i]));
2607                         return -1;
2608                 }
2609                 if (vnn->iface == cur) {
2610                         info->active_idx = i;
2611                 }
2612                 strcpy(info->ifaces[i].name, cur->name);
2613                 info->ifaces[i].link_state = cur->link_up;
2614                 info->ifaces[i].references = cur->references;
2615         }
2616         info->num = i;
2617         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2618                 i*sizeof(struct ctdb_control_iface_info);
2619
2620         outdata->dsize = len;
2621         outdata->dptr  = (uint8_t *)info;
2622
2623         return 0;
2624 }
2625
2626 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2627                                 struct ctdb_req_control *c,
2628                                 TDB_DATA *outdata)
2629 {
2630         int i, num, len;
2631         struct ctdb_control_get_ifaces *ifaces;
2632         struct ctdb_iface *cur;
2633
2634         /* count how many public ip structures we have */
2635         num = 0;
2636         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2637                 num++;
2638         }
2639
2640         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2641                 num*sizeof(struct ctdb_control_iface_info);
2642         ifaces = talloc_zero_size(outdata, len);
2643         CTDB_NO_MEMORY(ctdb, ifaces);
2644
2645         i = 0;
2646         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2647                 strcpy(ifaces->ifaces[i].name, cur->name);
2648                 ifaces->ifaces[i].link_state = cur->link_up;
2649                 ifaces->ifaces[i].references = cur->references;
2650                 i++;
2651         }
2652         ifaces->num = i;
2653         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2654                 i*sizeof(struct ctdb_control_iface_info);
2655
2656         outdata->dsize = len;
2657         outdata->dptr  = (uint8_t *)ifaces;
2658
2659         return 0;
2660 }
2661
2662 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2663                                     struct ctdb_req_control *c,
2664                                     TDB_DATA indata)
2665 {
2666         struct ctdb_control_iface_info *info;
2667         struct ctdb_iface *iface;
2668         bool link_up = false;
2669
2670         info = (struct ctdb_control_iface_info *)indata.dptr;
2671
2672         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2673                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2674                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2675                                   len, len, info->name));
2676                 return -1;
2677         }
2678
2679         switch (info->link_state) {
2680         case 0:
2681                 link_up = false;
2682                 break;
2683         case 1:
2684                 link_up = true;
2685                 break;
2686         default:
2687                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2688                                   (unsigned int)info->link_state));
2689                 return -1;
2690         }
2691
2692         if (info->references != 0) {
2693                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2694                                   (unsigned int)info->references));
2695                 return -1;
2696         }
2697
2698         iface = ctdb_find_iface(ctdb, info->name);
2699         if (iface == NULL) {
2700                 return -1;
2701         }
2702
2703         if (link_up == iface->link_up) {
2704                 return 0;
2705         }
2706
2707         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2708               ("iface[%s] has changed it's link status %s => %s\n",
2709                iface->name,
2710                iface->link_up?"up":"down",
2711                link_up?"up":"down"));
2712
2713         iface->link_up = link_up;
2714         return 0;
2715 }
2716
2717
/* 
   structure containing the listening socket and the list of tcp connections
   that the ctdb daemon is to kill

   One instance is attached to a vnn (vnn->killtcp) while there are
   connections on that public address waiting to be reset.
*/
struct ctdb_kill_tcp {
        /* the public address this state belongs to */
        struct ctdb_vnn *vnn;
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(); -1 until opened */
        int capture_fd;
        /* read event on capture_fd, handled by capture_tcp_handler() */
        struct fd_event *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque state filled in by ctdb_sys_open_capture_socket() and
           passed back to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
2730
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of tickles sent from the periodic timer; the connection
           is given up on once this reaches 5 */
        int count;
        /* the ctdb_kill_tcp this connection is queued on */
        struct ctdb_kill_tcp *killtcp;
};
2740
2741 /* this function is used to create a key to represent this socketpair
2742    in the killtcp tree.
2743    this key is used to insert and lookup matching socketpairs that are
2744    to be tickled and RST
2745 */
2746 #define KILLTCP_KEYLEN  10
2747 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2748 {
2749         static uint32_t key[KILLTCP_KEYLEN];
2750
2751         bzero(key, sizeof(key));
2752
2753         if (src->sa.sa_family != dst->sa.sa_family) {
2754                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2755                 return key;
2756         }
2757         
2758         switch (src->sa.sa_family) {
2759         case AF_INET:
2760                 key[0]  = dst->ip.sin_addr.s_addr;
2761                 key[1]  = src->ip.sin_addr.s_addr;
2762                 key[2]  = dst->ip.sin_port;
2763                 key[3]  = src->ip.sin_port;
2764                 break;
2765         case AF_INET6:
2766                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2767                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2768                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2769                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2770                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2771                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2772                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2773                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2774                 key[8]  = dst->ip6.sin6_port;
2775                 key[9]  = src->ip6.sin6_port;
2776                 break;
2777         default:
2778                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2779                 return key;
2780         }
2781
2782         return key;
2783 }
2784
2785 /*
2786   called when we get a read event on the raw socket
2787  */
2788 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2789                                 uint16_t flags, void *private_data)
2790 {
2791         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2792         struct ctdb_killtcp_con *con;
2793         ctdb_sock_addr src, dst;
2794         uint32_t ack_seq, seq;
2795
2796         if (!(flags & EVENT_FD_READ)) {
2797                 return;
2798         }
2799
2800         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2801                                 killtcp->private_data,
2802                                 &src, &dst,
2803                                 &ack_seq, &seq) != 0) {
2804                 /* probably a non-tcp ACK packet */
2805                 return;
2806         }
2807
2808         /* check if we have this guy in our list of connections
2809            to kill
2810         */
2811         con = trbt_lookuparray32(killtcp->connections, 
2812                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2813         if (con == NULL) {
2814                 /* no this was some other packet we can just ignore */
2815                 return;
2816         }
2817
2818         /* This one has been tickled !
2819            now reset him and remove him from the list.
2820          */
2821         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2822                 ntohs(con->dst_addr.ip.sin_port),
2823                 ctdb_addr_to_str(&con->src_addr),
2824                 ntohs(con->src_addr.ip.sin_port)));
2825
2826         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2827         talloc_free(con);
2828 }
2829
2830
2831 /* when traversing the list of all tcp connections to send tickle acks to
2832    (so that we can capture the ack coming back and kill the connection
2833     by a RST)
2834    this callback is called for each connection we are currently trying to kill
2835 */
2836 static void tickle_connection_traverse(void *param, void *data)
2837 {
2838         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2839
2840         /* have tried too many times, just give up */
2841         if (con->count >= 5) {
2842                 /* can't delete in traverse: reparent to delete_cons */
2843                 talloc_steal(param, con);
2844                 return;
2845         }
2846
2847         /* othervise, try tickling it again */
2848         con->count++;
2849         ctdb_sys_send_tcp(
2850                 (ctdb_sock_addr *)&con->dst_addr,
2851                 (ctdb_sock_addr *)&con->src_addr,
2852                 0, 0, 0);
2853 }
2854
2855
2856 /* 
2857    called every second until all sentenced connections have been reset
2858  */
2859 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2860                                               struct timeval t, void *private_data)
2861 {
2862         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2863         void *delete_cons = talloc_new(NULL);
2864
2865         /* loop over all connections sending tickle ACKs */
2866         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2867
2868         /* now we've finished traverse, it's safe to do deletion. */
2869         talloc_free(delete_cons);
2870
2871         /* If there are no more connections to kill we can remove the
2872            entire killtcp structure
2873          */
2874         if ( (killtcp->connections == NULL) || 
2875              (killtcp->connections->root == NULL) ) {
2876                 talloc_free(killtcp);
2877                 return;
2878         }
2879
2880         /* try tickling them again in a seconds time
2881          */
2882         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2883                         ctdb_tickle_sentenced_connections, killtcp);
2884 }
2885
2886 /*
2887   destroy the killtcp structure
2888  */
2889 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2890 {
2891         struct ctdb_vnn *tmpvnn;
2892
2893         /* verify that this vnn is still active */
2894         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
2895                 if (tmpvnn == killtcp->vnn) {
2896                         break;
2897                 }
2898         }
2899
2900         if (tmpvnn == NULL) {
2901                 return 0;
2902         }
2903
2904         if (killtcp->vnn->killtcp != killtcp) {
2905                 return 0;
2906         }
2907
2908         killtcp->vnn->killtcp = NULL;
2909
2910         return 0;
2911 }
2912
2913
/* insert callback for the killtcp tree.

   nothing fancy here, just unconditionally replace any existing
   connection structure with the new one: parm (the new structure) is
   returned and data (the old one, if any) is ignored.

   don't even free the old one if it did exist, that one is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new data 
   is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        return parm;
}
2925
/*
  add a tcp socket to the list of connections we want to RST

  s and d are the two endpoints of the connection; they are
  canonicalized before use. The connection is queued on the killtcp
  structure of the vnn that matches d, then s, then the 'single ip'.
  The capture socket, its read event and the periodic tickle timer are
  set up on first use, and one tickle is sent immediately.

  Returns 0 on success, -1 if neither address is a public address, on
  allocation failure, or if the capture socket cannot be opened.
 */
static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
                                       ctdb_sock_addr *s,
                                       ctdb_sock_addr *d)
{
        ctdb_sock_addr src, dst;
        struct ctdb_kill_tcp *killtcp;
        struct ctdb_killtcp_con *con;
        struct ctdb_vnn *vnn;

        ctdb_canonicalize_ip(s, &src);
        ctdb_canonicalize_ip(d, &dst);

        /* either end of the connection may be the public address */
        vnn = find_public_ip_vnn(ctdb, &dst);
        if (vnn == NULL) {
                vnn = find_public_ip_vnn(ctdb, &src);
        }
        if (vnn == NULL) {
                /* if it is not a public ip   it could be our 'single ip' */
                if (ctdb->single_ip_vnn) {
                        if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
                                vnn = ctdb->single_ip_vnn;
                        }
                }
        }
        if (vnn == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
                return -1;
        }

        killtcp = vnn->killtcp;
        
        /* If this is the first connection to kill we must allocate
           a new structure
         */
        if (killtcp == NULL) {
                killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
                CTDB_NO_MEMORY(ctdb, killtcp);

                killtcp->vnn         = vnn;
                killtcp->ctdb        = ctdb;
                killtcp->capture_fd  = -1;
                killtcp->connections = trbt_create(killtcp, 0);

                vnn->killtcp         = killtcp;
                /* the destructor clears vnn->killtcp again when this goes away */
                talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
        }



        /* create a structure that describes this connection we want to
           RST and store it in killtcp->connections
        */
        con = talloc(killtcp, struct ctdb_killtcp_con);
        CTDB_NO_MEMORY(ctdb, con);
        con->src_addr = src;
        con->dst_addr = dst;
        con->count    = 0;
        con->killtcp  = killtcp;


        /* NOTE(review): the key is built with dst and src swapped relative
           to killtcp_key()'s (src, dst) parameter order, while
           capture_tcp_handler() looks up with the captured packet's
           (src, dst) - presumably so that the reply coming back from the
           peer matches this entry; confirm before changing either side.
         */
        trbt_insertarray32_callback(killtcp->connections,
                        KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
                        add_killtcp_callback, con);

        /* 
           If we dont have a socket to listen on yet we must create it
         */
        if (killtcp->capture_fd == -1) {
                const char *iface = ctdb_vnn_iface_string(vnn);
                killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
                if (killtcp->capture_fd == -1) {
                        DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
                                          "socket on iface '%s' for killtcp (%s)\n",
                                          iface, strerror(errno)));
                        goto failed;
                }
        }


        /* first use of the capture socket: hook it into the event loop */
        if (killtcp->fde == NULL) {
                killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
                                            EVENT_FD_READ,
                                            capture_tcp_handler, killtcp);
                tevent_fd_set_auto_close(killtcp->fde);

                /* We also need to set up some events to tickle all these connections
                   until they are all reset
                */
                event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
                                ctdb_tickle_sentenced_connections, killtcp);
        }

        /* tickle him once now */
        ctdb_sys_send_tcp(
                &con->dst_addr,
                &con->src_addr,
                0, 0, 0);

        return 0;

failed:
        /* frees the whole killtcp structure hanging off the vnn, not just
           the connection that was being added */
        talloc_free(vnn->killtcp);
        vnn->killtcp = NULL;
        return -1;
}
3034
3035 /*
3036   kill a TCP connection.
3037  */
3038 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3039 {
3040         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3041
3042         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3043 }
3044
3045 /*
3046   called by a daemon to inform us of the entire list of TCP tickles for
3047   a particular public address.
3048   this control should only be sent by the node that is currently serving
3049   that public address.
3050  */
3051 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3052 {
3053         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3054         struct ctdb_tcp_array *tcparray;
3055         struct ctdb_vnn *vnn;
3056
3057         /* We must at least have tickles.num or else we cant verify the size
3058            of the received data blob
3059          */
3060         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3061                                         tickles.connections)) {
3062                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3063                 return -1;
3064         }
3065
3066         /* verify that the size of data matches what we expect */
3067         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3068                                 tickles.connections)
3069                          + sizeof(struct ctdb_tcp_connection)
3070                                  * list->tickles.num) {
3071                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3072                 return -1;
3073         }       
3074
3075         vnn = find_public_ip_vnn(ctdb, &list->addr);
3076         if (vnn == NULL) {
3077                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3078                         ctdb_addr_to_str(&list->addr)));
3079
3080                 return 1;
3081         }
3082
3083         /* remove any old ticklelist we might have */
3084         talloc_free(vnn->tcp_array);
3085         vnn->tcp_array = NULL;
3086
3087         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3088         CTDB_NO_MEMORY(ctdb, tcparray);
3089
3090         tcparray->num = list->tickles.num;
3091
3092         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3093         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3094
3095         memcpy(tcparray->connections, &list->tickles.connections[0], 
3096                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3097
3098         /* We now have a new fresh tickle list array for this vnn */
3099         vnn->tcp_array = talloc_steal(vnn, tcparray);
3100         
3101         return 0;
3102 }
3103
3104 /*
3105   called to return the full list of tickles for the puclic address associated 
3106   with the provided vnn
3107  */
3108 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3109 {
3110         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3111         struct ctdb_control_tcp_tickle_list *list;
3112         struct ctdb_tcp_array *tcparray;
3113         int num;
3114         struct ctdb_vnn *vnn;
3115
3116         vnn = find_public_ip_vnn(ctdb, addr);
3117         if (vnn == NULL) {
3118                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3119                         ctdb_addr_to_str(addr)));
3120
3121                 return 1;
3122         }
3123
3124         tcparray = vnn->tcp_array;
3125         if (tcparray) {
3126                 num = tcparray->num;
3127         } else {
3128                 num = 0;
3129         }
3130
3131         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3132                                 tickles.connections)
3133                         + sizeof(struct ctdb_tcp_connection) * num;
3134
3135         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3136         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3137         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3138
3139         list->addr = *addr;
3140         list->tickles.num = num;
3141         if (num) {
3142                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3143                         sizeof(struct ctdb_tcp_connection) * num);
3144         }
3145
3146         return 0;
3147 }
3148
3149
3150 /*
3151   set the list of all tcp tickles for a public address
3152  */
3153 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3154                               struct timeval timeout, uint32_t destnode, 
3155                               ctdb_sock_addr *addr,
3156                               struct ctdb_tcp_array *tcparray)
3157 {
3158         int ret, num;
3159         TDB_DATA data;
3160         struct ctdb_control_tcp_tickle_list *list;
3161
3162         if (tcparray) {
3163                 num = tcparray->num;
3164         } else {
3165                 num = 0;
3166         }
3167
3168         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3169                                 tickles.connections) +
3170                         sizeof(struct ctdb_tcp_connection) * num;
3171         data.dptr = talloc_size(ctdb, data.dsize);
3172         CTDB_NO_MEMORY(ctdb, data.dptr);
3173
3174         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3175         list->addr = *addr;
3176         list->tickles.num = num;
3177         if (tcparray) {
3178                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3179         }
3180
3181         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3182                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3183                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3184         if (ret != 0) {
3185                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3186                 return -1;
3187         }
3188
3189         talloc_free(data.dptr);
3190
3191         return ret;
3192 }
3193
3194
3195 /*
3196   perform tickle updates if required
3197  */
3198 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3199                                 struct timed_event *te, 
3200                                 struct timeval t, void *private_data)
3201 {
3202         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3203         int ret;
3204         struct ctdb_vnn *vnn;
3205
3206         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3207                 /* we only send out updates for public addresses that 
3208                    we have taken over
3209                  */
3210                 if (ctdb->pnn != vnn->pnn) {
3211                         continue;
3212                 }
3213                 /* We only send out the updates if we need to */
3214                 if (!vnn->tcp_update_needed) {
3215                         continue;
3216                 }
3217                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3218                                 TAKEOVER_TIMEOUT(),
3219                                 CTDB_BROADCAST_CONNECTED,
3220                                 &vnn->public_address,
3221                                 vnn->tcp_array);
3222                 if (ret != 0) {
3223                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3224                                 ctdb_addr_to_str(&vnn->public_address)));
3225                 }
3226         }
3227
3228         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3229                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3230                              ctdb_update_tcp_tickles, ctdb);
3231 }               
3232         
3233
3234 /*
3235   start periodic update of tcp tickles
3236  */
3237 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3238 {
3239         ctdb->tickle_update_context = talloc_new(ctdb);
3240
3241         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3242                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3243                              ctdb_update_tcp_tickles, ctdb);
3244 }
3245
3246
3247
3248
3249 struct control_gratious_arp {
3250         struct ctdb_context *ctdb;
3251         ctdb_sock_addr addr;
3252         const char *iface;
3253         int count;
3254 };
3255
3256 /*
3257   send a control_gratuitous arp
3258  */
3259 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3260                                   struct timeval t, void *private_data)
3261 {
3262         int ret;
3263         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3264                                                         struct control_gratious_arp);
3265
3266         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3267         if (ret != 0) {
3268                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3269                                  arp->iface, strerror(errno)));
3270         }
3271
3272
3273         arp->count++;
3274         if (arp->count == CTDB_ARP_REPEAT) {
3275                 talloc_free(arp);
3276                 return;
3277         }
3278
3279         event_add_timed(arp->ctdb->ev, arp, 
3280                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3281                         send_gratious_arp, arp);
3282 }
3283
3284
3285 /*
3286   send a gratious arp 
3287  */
3288 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3289 {
3290         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3291         struct control_gratious_arp *arp;
3292
3293         /* verify the size of indata */
3294         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3295                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3296                                  (unsigned)indata.dsize, 
3297                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3298                 return -1;
3299         }
3300         if (indata.dsize != 
3301                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3302                 + gratious_arp->len ) ){
3303
3304                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3305                         "but should be %u bytes\n", 
3306                          (unsigned)indata.dsize, 
3307                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3308                 return -1;
3309         }
3310
3311
3312         arp = talloc(ctdb, struct control_gratious_arp);
3313         CTDB_NO_MEMORY(ctdb, arp);
3314
3315         arp->ctdb  = ctdb;
3316         arp->addr   = gratious_arp->addr;
3317         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3318         CTDB_NO_MEMORY(ctdb, arp->iface);
3319         arp->count = 0;
3320         
3321         event_add_timed(arp->ctdb->ev, arp, 
3322                         timeval_zero(), send_gratious_arp, arp);
3323
3324         return 0;
3325 }
3326
3327 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3328 {
3329         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3330         int ret;
3331
3332         /* verify the size of indata */
3333         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3334                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3335                 return -1;
3336         }
3337         if (indata.dsize != 
3338                 ( offsetof(struct ctdb_control_ip_iface, iface)
3339                 + pub->len ) ){
3340
3341                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3342                         "but should be %u bytes\n", 
3343                          (unsigned)indata.dsize, 
3344                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3345                 return -1;
3346         }
3347
3348         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
3349
3350         if (ret != 0) {
3351                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3352                 return -1;
3353         }
3354
3355         return 0;
3356 }
3357
3358 /*
3359   called when releaseip event finishes for del_public_address
3360  */
3361 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
3362                                 void *private_data)
3363 {
3364         talloc_free(private_data);
3365 }
3366
3367 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3368 {
3369         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3370         struct ctdb_vnn *vnn;
3371         int ret;
3372
3373         /* verify the size of indata */
3374         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3375                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3376                 return -1;
3377         }
3378         if (indata.dsize != 
3379                 ( offsetof(struct ctdb_control_ip_iface, iface)
3380                 + pub->len ) ){
3381
3382                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3383                         "but should be %u bytes\n", 
3384                          (unsigned)indata.dsize, 
3385                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3386                 return -1;
3387         }
3388
3389         /* walk over all public addresses until we find a match */
3390         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3391                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3392                         TALLOC_CTX *mem_ctx;
3393
3394                         DLIST_REMOVE(ctdb->vnn, vnn);
3395                         if (vnn->pnn != ctdb->pnn) {
3396                                 if (vnn->iface != NULL) {
3397                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3398                                 }
3399                                 talloc_free(vnn);
3400                                 return 0;
3401                         }
3402                         vnn->pnn = -1;
3403
3404                         mem_ctx = talloc_new(ctdb);
3405                         talloc_steal(mem_ctx, vnn);
3406                         ret = ctdb_event_script_callback(ctdb, 
3407                                          mem_ctx, delete_ip_callback, mem_ctx,
3408                                          false,
3409                                          CTDB_EVENT_RELEASE_IP,
3410                                          "%s %s %u",
3411                                          ctdb_vnn_iface_string(vnn),
3412                                          ctdb_addr_to_str(&vnn->public_address),
3413                                          vnn->public_netmask_bits);
3414                         if (vnn->iface != NULL) {
3415                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3416                         }
3417                         if (ret != 0) {
3418                                 return -1;
3419                         }
3420                         return 0;
3421                 }
3422         }
3423
3424         return -1;
3425 }
3426
3427 /* This function is called from the recovery daemon to verify that a remote
3428    node has the expected ip allocation.
3429    This is verified against ctdb->ip_tree
3430 */
3431 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3432 {
3433         struct ctdb_public_ip_list *tmp_ip; 
3434         int i;
3435
3436         if (ctdb->ip_tree == NULL) {
3437                 /* dont know the expected allocation yet, assume remote node
3438                    is correct. */
3439                 return 0;
3440         }
3441
3442         if (ips == NULL) {
3443                 return 0;
3444         }
3445
3446         for (i=0; i<ips->num; i++) {
3447                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3448                 if (tmp_ip == NULL) {
3449                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3450                         return -1;
3451                 }
3452
3453                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3454                         continue;
3455                 }
3456
3457                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3458                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3459                         return -1;
3460                 }
3461         }
3462
3463         return 0;
3464 }
3465
3466 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3467 {
3468         struct ctdb_public_ip_list *tmp_ip; 
3469
3470         if (ctdb->ip_tree == NULL) {
3471                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3472                 return -1;
3473         }
3474
3475         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3476         if (tmp_ip == NULL) {
3477                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3478                 return -1;
3479         }
3480
3481         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3482         tmp_ip->pnn = ip->pnn;
3483
3484         return 0;
3485 }