c82443416192a8314db906b887f793e002eb63d7
[ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/tevent/tevent.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
  timeout value derived from the takeover_timeout tunable; expands to a
  timeval_current_ofs() call and needs a variable named 'ctdb' in scope
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/*
  gratuitous arp scheduling: CTDB_ARP_INTERVAL is passed as the first
  argument of timeval_current_ofs() when re-arming the arp timer, and
  CTDB_ARP_REPEAT is the send count at which the arp state is freed
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
  a local network interface, kept on the ctdb->ifaces list
 */
struct ctdb_iface {
        /* list linkage */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name; compared with strcmp() when searching the list */
        const char *name;
        /* only interfaces with link_up set are picked to host an address */
        bool link_up;
        /* number of public addresses currently assigned to this interface */
        uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         i->link_up = false;
69
70         DLIST_ADD(ctdb->ifaces, i);
71
72         return 0;
73 }
74
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
76                                           const char *iface)
77 {
78         struct ctdb_iface *i;
79
80         /* Verify that we dont have an entry for this ip yet */
81         for (i=ctdb->ifaces;i;i=i->next) {
82                 if (strcmp(i->name, iface) == 0) {
83                         return i;
84                 }
85         }
86
87         return NULL;
88 }
89
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
91                                               struct ctdb_vnn *vnn)
92 {
93         int i;
94         struct ctdb_iface *cur = NULL;
95         struct ctdb_iface *best = NULL;
96
97         for (i=0; vnn->ifaces[i]; i++) {
98
99                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
100                 if (cur == NULL) {
101                         continue;
102                 }
103
104                 if (!cur->link_up) {
105                         continue;
106                 }
107
108                 if (best == NULL) {
109                         best = cur;
110                         continue;
111                 }
112
113                 if (cur->references < best->references) {
114                         best = cur;
115                         continue;
116                 }
117         }
118
119         return best;
120 }
121
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123                                      struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *best = NULL;
126
127         if (vnn->iface) {
128                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129                                    "still assigned to iface '%s'\n",
130                                    ctdb_addr_to_str(&vnn->public_address),
131                                    ctdb_vnn_iface_string(vnn)));
132                 return 0;
133         }
134
135         best = ctdb_vnn_best_iface(ctdb, vnn);
136         if (best == NULL) {
137                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138                                   "cannot assign to iface any iface\n",
139                                   ctdb_addr_to_str(&vnn->public_address)));
140                 return -1;
141         }
142
143         vnn->iface = best;
144         best->references++;
145         vnn->pnn = ctdb->pnn;
146
147         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148                            "now assigned to iface '%s' refs[%d]\n",
149                            ctdb_addr_to_str(&vnn->public_address),
150                            ctdb_vnn_iface_string(vnn),
151                            best->references));
152         return 0;
153 }
154
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156                                     struct ctdb_vnn *vnn)
157 {
158         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159                            "now unassigned (old iface '%s' refs[%d])\n",
160                            ctdb_addr_to_str(&vnn->public_address),
161                            ctdb_vnn_iface_string(vnn),
162                            vnn->iface?vnn->iface->references:0));
163         if (vnn->iface) {
164                 vnn->iface->references--;
165         }
166         vnn->iface = NULL;
167         if (vnn->pnn == ctdb->pnn) {
168                 vnn->pnn = -1;
169         }
170 }
171
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173                                struct ctdb_vnn *vnn)
174 {
175         int i;
176
177         if (vnn->iface && vnn->iface->link_up) {
178                 return true;
179         }
180
181         for (i=0; vnn->ifaces[i]; i++) {
182                 struct ctdb_iface *cur;
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (cur->link_up) {
190                         return true;
191                 }
192         }
193
194         return false;
195 }
196
197 struct ctdb_takeover_arp {
198         struct ctdb_context *ctdb;
199         uint32_t count;
200         ctdb_sock_addr addr;
201         struct ctdb_tcp_array *tcparray;
202         struct ctdb_vnn *vnn;
203 };
204
205
206 /*
207   lists of tcp endpoints
208  */
209 struct ctdb_tcp_list {
210         struct ctdb_tcp_list *prev, *next;
211         struct ctdb_tcp_connection connection;
212 };
213
214 /*
215   list of clients to kill on IP release
216  */
217 struct ctdb_client_ip {
218         struct ctdb_client_ip *prev, *next;
219         struct ctdb_context *ctdb;
220         ctdb_sock_addr addr;
221         uint32_t client_id;
222 };
223
224
225 /*
226   send a gratuitous arp
227  */
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
229                                   struct timeval t, void *private_data)
230 {
231         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
232                                                         struct ctdb_takeover_arp);
233         int i, ret;
234         struct ctdb_tcp_array *tcparray;
235         const char *iface = ctdb_vnn_iface_string(arp->vnn);
236
237         ret = ctdb_sys_send_arp(&arp->addr, iface);
238         if (ret != 0) {
239                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240                                   iface, strerror(errno)));
241         }
242
243         tcparray = arp->tcparray;
244         if (tcparray) {
245                 for (i=0;i<tcparray->num;i++) {
246                         struct ctdb_tcp_connection *tcon;
247
248                         tcon = &tcparray->connections[i];
249                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
251                                 ctdb_addr_to_str(&tcon->src_addr),
252                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253                         ret = ctdb_sys_send_tcp(
254                                 &tcon->src_addr, 
255                                 &tcon->dst_addr,
256                                 0, 0, 0);
257                         if (ret != 0) {
258                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259                                         ctdb_addr_to_str(&tcon->src_addr)));
260                         }
261                 }
262         }
263
264         arp->count++;
265
266         if (arp->count == CTDB_ARP_REPEAT) {
267                 talloc_free(arp);
268                 return;
269         }
270
271         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
272                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
273                         ctdb_control_send_arp, arp);
274 }
275
276 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
277                                        struct ctdb_vnn *vnn)
278 {
279         struct ctdb_takeover_arp *arp;
280         struct ctdb_tcp_array *tcparray;
281
282         if (!vnn->takeover_ctx) {
283                 vnn->takeover_ctx = talloc_new(vnn);
284                 if (!vnn->takeover_ctx) {
285                         return -1;
286                 }
287         }
288
289         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
290         if (!arp) {
291                 return -1;
292         }
293
294         arp->ctdb = ctdb;
295         arp->addr = vnn->public_address;
296         arp->vnn  = vnn;
297
298         tcparray = vnn->tcp_array;
299         if (tcparray) {
300                 /* add all of the known tcp connections for this IP to the
301                    list of tcp connections to send tickle acks for */
302                 arp->tcparray = talloc_steal(arp, tcparray);
303
304                 vnn->tcp_array = NULL;
305                 vnn->tcp_update_needed = true;
306         }
307
308         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
309                         timeval_zero(), ctdb_control_send_arp, arp);
310
311         return 0;
312 }
313
314 struct takeover_callback_state {
315         struct ctdb_req_control *c;
316         ctdb_sock_addr *addr;
317         struct ctdb_vnn *vnn;
318 };
319
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
        /* the control request to reply to once the event has finished */
        struct ctdb_req_control *c;
        /* the vnn being taken over */
        struct ctdb_vnn *vnn;
};
324
325 /*
326   called when takeip event finishes
327  */
328 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
329                                     void *private_data)
330 {
331         struct ctdb_do_takeip_state *state =
332                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
333         int32_t ret;
334         TDB_DATA data;
335
336         if (status != 0) {
337                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
338         
339                 if (status == -ETIME) {
340                         ctdb_ban_self(ctdb);
341                 }
342                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
343                                  ctdb_addr_to_str(&state->vnn->public_address),
344                                  ctdb_vnn_iface_string(state->vnn)));
345                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
346
347                 node->flags |= NODE_FLAGS_UNHEALTHY;
348                 talloc_free(state);
349                 return;
350         }
351
352         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
353         if (ret != 0) {
354                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
355                 talloc_free(state);
356                 return;
357         }
358
359         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
360         data.dsize = strlen((char *)data.dptr) + 1;
361         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
362
363         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
364
365
366         /* the control succeeded */
367         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
368         talloc_free(state);
369         return;
370 }
371
372 /*
373   take over an ip address
374  */
375 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
376                               struct ctdb_req_control *c,
377                               struct ctdb_vnn *vnn)
378 {
379         int ret;
380         struct ctdb_do_takeip_state *state;
381
382         ret = ctdb_vnn_assign_iface(ctdb, vnn);
383         if (ret != 0) {
384                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
385                                  "assin a usable interface\n",
386                                  ctdb_addr_to_str(&vnn->public_address),
387                                  vnn->public_netmask_bits));
388                 return -1;
389         }
390
391         state = talloc(vnn, struct ctdb_do_takeip_state);
392         CTDB_NO_MEMORY(ctdb, state);
393
394         state->c = talloc_steal(ctdb, c);
395         state->vnn   = vnn;
396
397         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
398                             ctdb_addr_to_str(&vnn->public_address),
399                             vnn->public_netmask_bits,
400                             ctdb_vnn_iface_string(vnn)));
401
402         ret = ctdb_event_script_callback(ctdb,
403                                          state,
404                                          ctdb_do_takeip_callback,
405                                          state,
406                                          false,
407                                          CTDB_EVENT_TAKE_IP,
408                                          "%s %s %u",
409                                          ctdb_vnn_iface_string(vnn),
410                                          ctdb_addr_to_str(&vnn->public_address),
411                                          vnn->public_netmask_bits);
412
413         if (ret != 0) {
414                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
415                         ctdb_addr_to_str(&vnn->public_address),
416                         ctdb_vnn_iface_string(vnn)));
417                 talloc_free(state);
418                 return -1;
419         }
420
421         return 0;
422 }
423
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
        /* the control request to reply to once the event has finished */
        struct ctdb_req_control *c;
        /* the interface the address was on before the move */
        struct ctdb_iface *old;
        /* the vnn being moved */
        struct ctdb_vnn *vnn;
};
429
430 /*
431   called when updateip event finishes
432  */
433 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
434                                       void *private_data)
435 {
436         struct ctdb_do_updateip_state *state =
437                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
438         int32_t ret;
439
440         if (status != 0) {
441                 if (status == -ETIME) {
442                         ctdb_ban_self(ctdb);
443                 }
444                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
445                         ctdb_addr_to_str(&state->vnn->public_address),
446                         state->old->name,
447                         ctdb_vnn_iface_string(state->vnn)));
448
449                 /*
450                  * All we can do is reset the old interface
451                  * and let the next run fix it
452                  */
453                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
454                 state->vnn->iface = state->old;
455                 state->vnn->iface->references++;
456
457                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
458                 talloc_free(state);
459                 return;
460         }
461
462         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
463         if (ret != 0) {
464                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
465                 talloc_free(state);
466                 return;
467         }
468
469         /* the control succeeded */
470         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
471         talloc_free(state);
472         return;
473 }
474
475 /*
476   update (move) an ip address
477  */
478 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
479                                 struct ctdb_req_control *c,
480                                 struct ctdb_vnn *vnn)
481 {
482         int ret;
483         struct ctdb_do_updateip_state *state;
484         struct ctdb_iface *old = vnn->iface;
485         char *new_name;
486
487         ctdb_vnn_unassign_iface(ctdb, vnn);
488         ret = ctdb_vnn_assign_iface(ctdb, vnn);
489         if (ret != 0) {
490                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
491                                  "assin a usable interface (old iface '%s')\n",
492                                  ctdb_addr_to_str(&vnn->public_address),
493                                  vnn->public_netmask_bits,
494                                  old->name));
495                 return -1;
496         }
497
498         new_name = ctdb_vnn_iface_string(vnn);
499         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
500                 /* A benign update from one interface onto itself.
501                  * no need to run the eventscripts in this case, just return
502                  * success.
503                  */
504                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
505                 return 0;
506         }
507
508         state = talloc(vnn, struct ctdb_do_updateip_state);
509         CTDB_NO_MEMORY(ctdb, state);
510
511         state->c = talloc_steal(ctdb, c);
512         state->old = old;
513         state->vnn = vnn;
514
515         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
516                             "interface %s to %s\n",
517                             ctdb_addr_to_str(&vnn->public_address),
518                             vnn->public_netmask_bits,
519                             old->name,
520                             new_name));
521
522         ret = ctdb_event_script_callback(ctdb,
523                                          state,
524                                          ctdb_do_updateip_callback,
525                                          state,
526                                          false,
527                                          CTDB_EVENT_UPDATE_IP,
528                                          "%s %s %s %u",
529                                          state->old->name,
530                                          new_name,
531                                          ctdb_addr_to_str(&vnn->public_address),
532                                          vnn->public_netmask_bits);
533         if (ret != 0) {
534                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
535                                  ctdb_addr_to_str(&vnn->public_address),
536                                  old->name, new_name));
537                 talloc_free(state);
538                 return -1;
539         }
540
541         return 0;
542 }
543
544 /*
545   Find the vnn of the node that has a public ip address
546   returns -1 if the address is not known as a public address
547  */
548 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
549 {
550         struct ctdb_vnn *vnn;
551
552         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
553                 if (ctdb_same_ip(&vnn->public_address, addr)) {
554                         return vnn;
555                 }
556         }
557
558         return NULL;
559 }
560
561 /*
562   take over an ip address
563  */
564 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
565                                  struct ctdb_req_control *c,
566                                  TDB_DATA indata,
567                                  bool *async_reply)
568 {
569         int ret;
570         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
571         struct ctdb_vnn *vnn;
572         bool have_ip = false;
573         bool do_updateip = false;
574         bool do_takeip = false;
575         struct ctdb_iface *best_iface = NULL;
576
577         if (pip->pnn != ctdb->pnn) {
578                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
579                                  "with pnn %d, but we're node %d\n",
580                                  ctdb_addr_to_str(&pip->addr),
581                                  pip->pnn, ctdb->pnn));
582                 return -1;
583         }
584
585         /* update out vnn list */
586         vnn = find_public_ip_vnn(ctdb, &pip->addr);
587         if (vnn == NULL) {
588                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
589                         ctdb_addr_to_str(&pip->addr)));
590                 return 0;
591         }
592
593         have_ip = ctdb_sys_have_ip(&pip->addr);
594         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
595         if (best_iface == NULL) {
596                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
597                                  "a usable interface (old %s, have_ip %d)\n",
598                                  ctdb_addr_to_str(&vnn->public_address),
599                                  vnn->public_netmask_bits,
600                                  ctdb_vnn_iface_string(vnn),
601                                  have_ip));
602                 return -1;
603         }
604
605         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
606                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
607                 have_ip = false;
608         }
609
610         if (vnn->iface == NULL && have_ip) {
611                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
612                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
613                                  ctdb_addr_to_str(&vnn->public_address)));
614                 return 0;
615         }
616
617         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
618                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
619                                   "and we have it on iface[%s], but it was assigned to node %d"
620                                   "and we are node %d, banning ourself\n",
621                                  ctdb_addr_to_str(&vnn->public_address),
622                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
623                 ctdb_ban_self(ctdb);
624                 return -1;
625         }
626
627         if (vnn->pnn == -1 && have_ip) {
628                 vnn->pnn = ctdb->pnn;
629                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
630                                   "and we already have it on iface[%s], update local daemon\n",
631                                  ctdb_addr_to_str(&vnn->public_address),
632                                   ctdb_vnn_iface_string(vnn)));
633                 return 0;
634         }
635
636         if (vnn->iface) {
637                 if (vnn->iface->link_up) {
638                         /* only move when the rebalance gains something */
639                         if (vnn->iface->references > (best_iface->references + 1)) {
640                                 do_updateip = true;
641                         }
642                 } else if (vnn->iface != best_iface) {
643                         do_updateip = true;
644                 }
645         }
646
647         if (!have_ip) {
648                 if (do_updateip) {
649                         ctdb_vnn_unassign_iface(ctdb, vnn);
650                         do_updateip = false;
651                 }
652                 do_takeip = true;
653         }
654
655         if (do_takeip) {
656                 ret = ctdb_do_takeip(ctdb, c, vnn);
657                 if (ret != 0) {
658                         return -1;
659                 }
660         } else if (do_updateip) {
661                 ret = ctdb_do_updateip(ctdb, c, vnn);
662                 if (ret != 0) {
663                         return -1;
664                 }
665         } else {
666                 /*
667                  * The interface is up and the kernel known the ip
668                  * => do nothing
669                  */
670                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
671                         ctdb_addr_to_str(&pip->addr),
672                         vnn->public_netmask_bits,
673                         ctdb_vnn_iface_string(vnn)));
674                 return 0;
675         }
676
677         /* tell ctdb_control.c that we will be replying asynchronously */
678         *async_reply = true;
679
680         return 0;
681 }
682
683 /*
684   takeover an ip address old v4 style
685  */
686 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
687                                 struct ctdb_req_control *c,
688                                 TDB_DATA indata, 
689                                 bool *async_reply)
690 {
691         TDB_DATA data;
692         
693         data.dsize = sizeof(struct ctdb_public_ip);
694         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
695         CTDB_NO_MEMORY(ctdb, data.dptr);
696         
697         memcpy(data.dptr, indata.dptr, indata.dsize);
698         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
699 }
700
701 /*
702   kill any clients that are registered with a IP that is being released
703  */
704 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
705 {
706         struct ctdb_client_ip *ip;
707
708         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
709                 ctdb_addr_to_str(addr)));
710
711         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
712                 ctdb_sock_addr tmp_addr;
713
714                 tmp_addr = ip->addr;
715                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
716                         ip->client_id,
717                         ctdb_addr_to_str(&ip->addr)));
718
719                 if (ctdb_same_ip(&tmp_addr, addr)) {
720                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
721                                                                      ip->client_id, 
722                                                                      struct ctdb_client);
723                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
724                                 ip->client_id,
725                                 ctdb_addr_to_str(&ip->addr),
726                                 client->pid));
727
728                         if (client->pid != 0) {
729                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
730                                         (unsigned)client->pid,
731                                         ctdb_addr_to_str(addr),
732                                         ip->client_id));
733                                 kill(client->pid, SIGKILL);
734                         }
735                 }
736         }
737 }
738
739 /*
740   called when releaseip event finishes
741  */
742 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
743                                 void *private_data)
744 {
745         struct takeover_callback_state *state = 
746                 talloc_get_type(private_data, struct takeover_callback_state);
747         TDB_DATA data;
748
749         if (status == -ETIME) {
750                 ctdb_ban_self(ctdb);
751         }
752
753         /* send a message to all clients of this node telling them
754            that the cluster has been reconfigured and they should
755            release any sockets on this IP */
756         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
757         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
758         data.dsize = strlen((char *)data.dptr)+1;
759
760         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
761
762         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
763
764         /* kill clients that have registered with this IP */
765         release_kill_clients(ctdb, state->addr);
766
767         ctdb_vnn_unassign_iface(ctdb, state->vnn);
768
769         /* the control succeeded */
770         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
771         talloc_free(state);
772 }
773
774 /*
775   release an ip address
776  */
777 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
778                                 struct ctdb_req_control *c,
779                                 TDB_DATA indata, 
780                                 bool *async_reply)
781 {
782         int ret;
783         struct takeover_callback_state *state;
784         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
785         struct ctdb_vnn *vnn;
786
787         /* update our vnn list */
788         vnn = find_public_ip_vnn(ctdb, &pip->addr);
789         if (vnn == NULL) {
790                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
791                         ctdb_addr_to_str(&pip->addr)));
792                 return 0;
793         }
794         vnn->pnn = pip->pnn;
795
796         /* stop any previous arps */
797         talloc_free(vnn->takeover_ctx);
798         vnn->takeover_ctx = NULL;
799
800         if (!ctdb_sys_have_ip(&pip->addr)) {
801                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
802                         ctdb_addr_to_str(&pip->addr),
803                         vnn->public_netmask_bits, 
804                         ctdb_vnn_iface_string(vnn)));
805                 ctdb_vnn_unassign_iface(ctdb, vnn);
806                 return 0;
807         }
808
809         if (vnn->iface == NULL) {
810                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
811                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
812                                  ctdb_addr_to_str(&vnn->public_address)));
813                 return 0;
814         }
815
816         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
817                 ctdb_addr_to_str(&pip->addr),
818                 vnn->public_netmask_bits, 
819                 ctdb_vnn_iface_string(vnn),
820                 pip->pnn));
821
822         state = talloc(ctdb, struct takeover_callback_state);
823         CTDB_NO_MEMORY(ctdb, state);
824
825         state->c = talloc_steal(state, c);
826         state->addr = talloc(state, ctdb_sock_addr);       
827         CTDB_NO_MEMORY(ctdb, state->addr);
828         *state->addr = pip->addr;
829         state->vnn   = vnn;
830
831         ret = ctdb_event_script_callback(ctdb, 
832                                          state, release_ip_callback, state,
833                                          false,
834                                          CTDB_EVENT_RELEASE_IP,
835                                          "%s %s %u",
836                                          ctdb_vnn_iface_string(vnn),
837                                          ctdb_addr_to_str(&pip->addr),
838                                          vnn->public_netmask_bits);
839         if (ret != 0) {
840                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
841                         ctdb_addr_to_str(&pip->addr),
842                         ctdb_vnn_iface_string(vnn)));
843                 talloc_free(state);
844                 return -1;
845         }
846
847         /* tell the control that we will reply asynchronously */
848         *async_reply = true;
849         return 0;
850 }
851
852 /*
853   release an ip address old v4 style
854  */
855 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
856                                 struct ctdb_req_control *c,
857                                 TDB_DATA indata, 
858                                 bool *async_reply)
859 {
860         TDB_DATA data;
861         
862         data.dsize = sizeof(struct ctdb_public_ip);
863         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
864         CTDB_NO_MEMORY(ctdb, data.dptr);
865         
866         memcpy(data.dptr, indata.dptr, indata.dsize);
867         return ctdb_control_release_ip(ctdb, c, data, async_reply);
868 }
869
870
871 static int ctdb_add_public_address(struct ctdb_context *ctdb,
872                                    ctdb_sock_addr *addr,
873                                    unsigned mask, const char *ifaces)
874 {
875         struct ctdb_vnn      *vnn;
876         uint32_t num = 0;
877         char *tmp;
878         const char *iface;
879         int i;
880         int ret;
881
882         /* Verify that we dont have an entry for this ip yet */
883         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
884                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
885                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
886                                 ctdb_addr_to_str(addr)));
887                         return -1;
888                 }               
889         }
890
891         /* create a new vnn structure for this ip address */
892         vnn = talloc_zero(ctdb, struct ctdb_vnn);
893         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
894         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
895         tmp = talloc_strdup(vnn, ifaces);
896         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
897         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
898                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
899                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
900                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
901                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
902                 num++;
903         }
904         talloc_free(tmp);
905         vnn->ifaces[num] = NULL;
906         vnn->public_address      = *addr;
907         vnn->public_netmask_bits = mask;
908         vnn->pnn                 = -1;
909         if (ctdb_sys_have_ip(addr)) {
910                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
911                 vnn->pnn = ctdb->pnn;
912         }
913
914         for (i=0; vnn->ifaces[i]; i++) {
915                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
916                 if (ret != 0) {
917                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
918                                            "for public_address[%s]\n",
919                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
920                         talloc_free(vnn);
921                         return -1;
922                 }
923                 if (i == 0) {
924                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
925                 }
926         }
927
928         DLIST_ADD(ctdb->vnn, vnn);
929
930         return 0;
931 }
932
933 /*
934   setup the event script directory
935 */
936 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
937 {
938         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
939         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
940         return 0;
941 }
942
943 /*
944   setup the public address lists from a file
945 */
946 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
947 {
948         char **lines;
949         int nlines;
950         int i;
951
952         lines = file_lines_load(alist, &nlines, ctdb);
953         if (lines == NULL) {
954                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
955                 return -1;
956         }
957         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
958                 nlines--;
959         }
960
961         for (i=0;i<nlines;i++) {
962                 unsigned mask;
963                 ctdb_sock_addr addr;
964                 const char *addrstr;
965                 const char *ifaces;
966                 char *tok, *line;
967
968                 line = lines[i];
969                 while ((*line == ' ') || (*line == '\t')) {
970                         line++;
971                 }
972                 if (*line == '#') {
973                         continue;
974                 }
975                 if (strcmp(line, "") == 0) {
976                         continue;
977                 }
978                 tok = strtok(line, " \t");
979                 addrstr = tok;
980                 tok = strtok(NULL, " \t");
981                 if (tok == NULL) {
982                         if (NULL == ctdb->default_public_interface) {
983                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
984                                          i+1));
985                                 talloc_free(lines);
986                                 return -1;
987                         }
988                         ifaces = ctdb->default_public_interface;
989                 } else {
990                         ifaces = tok;
991                 }
992
993                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
994                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
995                         talloc_free(lines);
996                         return -1;
997                 }
998                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
999                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1000                         talloc_free(lines);
1001                         return -1;
1002                 }
1003         }
1004
1005         talloc_free(lines);
1006         return 0;
1007 }
1008
1009 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1010                               const char *iface,
1011                               const char *ip)
1012 {
1013         struct ctdb_vnn *svnn;
1014         struct ctdb_iface *cur = NULL;
1015         bool ok;
1016         int ret;
1017
1018         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1019         CTDB_NO_MEMORY(ctdb, svnn);
1020
1021         svnn->ifaces = talloc_array(svnn, const char *, 2);
1022         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1023         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1024         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1025         svnn->ifaces[1] = NULL;
1026
1027         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1028         if (!ok) {
1029                 talloc_free(svnn);
1030                 return -1;
1031         }
1032
1033         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1034         if (ret != 0) {
1035                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1036                                    "for single_ip[%s]\n",
1037                                    svnn->ifaces[0],
1038                                    ctdb_addr_to_str(&svnn->public_address)));
1039                 talloc_free(svnn);
1040                 return -1;
1041         }
1042
1043         /* assume the single public ip interface is initially "good" */
1044         cur = ctdb_find_iface(ctdb, iface);
1045         if (cur == NULL) {
1046                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1047                 return -1;
1048         }
1049         cur->link_up = true;
1050
1051         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1052         if (ret != 0) {
1053                 talloc_free(svnn);
1054                 return -1;
1055         }
1056
1057         ctdb->single_ip_vnn = svnn;
1058         return 0;
1059 }
1060
/* One entry of the singly linked list of public addresses that
   create_merged_ip_list() builds and ctdb_takeover_run() works on. */
1061 struct ctdb_public_ip_list {
1062         struct ctdb_public_ip_list *next;  /* following entry; NULL ends the list */
1063         uint32_t pnn;                      /* node assigned to this address; code in this file compares it with -1 for "no node" */
1064         ctdb_sock_addr addr;               /* the public address itself */
1065 };
1066
1067
1068 /* Given a physical node, return the number of
1069    public addresses that is currently assigned to this node.
1070 */
1071 static int node_ip_coverage(struct ctdb_context *ctdb, 
1072         int32_t pnn,
1073         struct ctdb_public_ip_list *ips)
1074 {
1075         int num=0;
1076
1077         for (;ips;ips=ips->next) {
1078                 if (ips->pnn == pnn) {
1079                         num++;
1080                 }
1081         }
1082         return num;
1083 }
1084
1085
1086 /* Check if this is a public ip known to the node, i.e. can that
1087    node takeover this ip ?
1088 */
1089 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1090                 struct ctdb_public_ip_list *ip)
1091 {
1092         struct ctdb_all_public_ips *public_ips;
1093         int i;
1094
1095         public_ips = ctdb->nodes[pnn]->available_public_ips;
1096
1097         if (public_ips == NULL) {
1098                 return -1;
1099         }
1100
1101         for (i=0;i<public_ips->num;i++) {
1102                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1103                         /* yes, this node can serve this public ip */
1104                         return 0;
1105                 }
1106         }
1107
1108         return -1;
1109 }
1110
1111
1112 /* search the node lists list for a node to takeover this ip.
1113    pick the node that currently are serving the least number of ips
1114    so that the ips get spread out evenly.
1115 */
1116 static int find_takeover_node(struct ctdb_context *ctdb, 
1117                 struct ctdb_node_map *nodemap, uint32_t mask, 
1118                 struct ctdb_public_ip_list *ip,
1119                 struct ctdb_public_ip_list *all_ips)
1120 {
1121         int pnn, min=0, num;
1122         int i;
1123
1124         pnn    = -1;
1125         for (i=0;i<nodemap->num;i++) {
1126                 if (nodemap->nodes[i].flags & mask) {
1127                         /* This node is not healty and can not be used to serve
1128                            a public address 
1129                         */
1130                         continue;
1131                 }
1132
1133                 /* verify that this node can serve this ip */
1134                 if (can_node_serve_ip(ctdb, i, ip)) {
1135                         /* no it couldnt   so skip to the next node */
1136                         continue;
1137                 }
1138
1139                 num = node_ip_coverage(ctdb, i, all_ips);
1140                 /* was this the first node we checked ? */
1141                 if (pnn == -1) {
1142                         pnn = i;
1143                         min  = num;
1144                 } else {
1145                         if (num < min) {
1146                                 pnn = i;
1147                                 min  = num;
1148                         }
1149                 }
1150         }       
1151         if (pnn == -1) {
1152                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1153                         ctdb_addr_to_str(&ip->addr)));
1154
1155                 return -1;
1156         }
1157
1158         ip->pnn = pnn;
1159         return 0;
1160 }
1161
1162 #define IP_KEYLEN       4
1163 static uint32_t *ip_key(ctdb_sock_addr *ip)
1164 {
1165         static uint32_t key[IP_KEYLEN];
1166
1167         bzero(key, sizeof(key));
1168
1169         switch (ip->sa.sa_family) {
1170         case AF_INET:
1171                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1172                 break;
1173         case AF_INET6:
1174                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1175                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1176                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1177                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1178                 break;
1179         default:
1180                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1181                 return key;
1182         }
1183
1184         return key;
1185 }
1186
1187 static void *add_ip_callback(void *parm, void *data)
1188 {
1189         struct ctdb_public_ip_list *this_ip = parm; 
1190         struct ctdb_public_ip_list *prev_ip = data; 
1191
1192         if (prev_ip == NULL) {
1193                 return parm;
1194         }
1195         if (this_ip->pnn == -1) {
1196                 this_ip->pnn = prev_ip->pnn;
1197         }
1198
1199         return parm;
1200 }
1201
1202 void getips_count_callback(void *param, void *data)
1203 {
1204         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1205         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1206
1207         new_ip->next = *ip_list;
1208         *ip_list     = new_ip;
1209 }
1210
1211 static struct ctdb_public_ip_list *
1212 create_merged_ip_list(struct ctdb_context *ctdb)
1213 {
1214         int i, j;
1215         struct ctdb_public_ip_list *ip_list;
1216         struct ctdb_all_public_ips *public_ips;
1217
1218         if (ctdb->ip_tree != NULL) {
1219                 talloc_free(ctdb->ip_tree);
1220                 ctdb->ip_tree = NULL;
1221         }
1222         ctdb->ip_tree = trbt_create(ctdb, 0);
1223
1224         for (i=0;i<ctdb->num_nodes;i++) {
1225                 public_ips = ctdb->nodes[i]->known_public_ips;
1226
1227                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1228                         continue;
1229                 }
1230
1231                 /* there were no public ips for this node */
1232                 if (public_ips == NULL) {
1233                         continue;
1234                 }               
1235
1236                 for (j=0;j<public_ips->num;j++) {
1237                         struct ctdb_public_ip_list *tmp_ip; 
1238
1239                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1240                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1241                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1242                         tmp_ip->addr = public_ips->ips[j].addr;
1243                         tmp_ip->next = NULL;
1244
1245                         trbt_insertarray32_callback(ctdb->ip_tree,
1246                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1247                                 add_ip_callback,
1248                                 tmp_ip);
1249                 }
1250         }
1251
1252         ip_list = NULL;
1253         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1254
1255         return ip_list;
1256 }
1257
1258 /*
1259   make any IP alias changes for public addresses that are necessary 

  ctdb    - daemon context (tunables, per node public ip lists, ip_tree)
  nodemap - node map; the flags of each node decide whether it may
            host public addresses and whether it is sent controls

  Outline:
   1. If the disable_ip_failover tunable is non-zero nothing is
      reassigned, only the "ipreallocated" event is run.
   2. Build the merged list of public addresses and clear the
      assignment of every address whose node is masked out or can not
      serve it.
   3. Assign all unassigned addresses with find_takeover_node(), then,
      unless no_ip_failback is 1, try to even out the distribution.
   4. Send release controls to every connected node for all addresses
      it should not hold, wait, then send takeover controls to the
      new owners and wait again.
   5. Run the "ipreallocated" event on the nodes returned by
      list_of_connected_nodes().

  Returns -1 if a release or takeover control can not be sent or if
  waiting for those controls fails.  Returns 0 otherwise; a failure of
  the "ipreallocated" step is only logged.
1260  */
1261 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1262 {
1263         int i, num_healthy, retries;
1264         struct ctdb_public_ip ip;
1265         struct ctdb_public_ipv4 ipv4;
1266         uint32_t mask, *nodes;
1267         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1268         int maxnode, maxnum=0, minnode, minnum=0, num;
1269         TDB_DATA data;
1270         struct timeval timeout;
1271         struct client_async_data *async_data;
1272         struct ctdb_client_control_state *state;
1273         TALLOC_CTX *tmp_ctx = talloc_new(ctdb); /* scratch context, freed before every return below */
1274
1275         /*
1276          * ip failover is completely disabled, just send out the 
1277          * ipreallocated event.
1278          */
1279         if (ctdb->tunable.disable_ip_failover != 0) {
1280                 goto ipreallocated;
1281         }
1282
1283         ZERO_STRUCT(ip);
1284
1285         /* Count how many completely healthy nodes we have */
1286         num_healthy = 0;
1287         for (i=0;i<nodemap->num;i++) {
1288                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1289                         num_healthy++;
1290                 }
1291         }
1292
1293         if (num_healthy > 0) {
1294                 /* We have healthy nodes, so only consider them for 
1295                    serving public addresses
1296                 */
1297                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1298         } else {
1299                 /* We didnt have any completely healthy nodes so
1300                    use "disabled" nodes as a fallback
1301                 */
1302                 mask = NODE_FLAGS_INACTIVE;
1303         }
1304
1305         /* since nodes only know about those public addresses that
1306            can be served by that particular node, no single node has
1307            a full list of all public addresses that exist in the cluster.
1308            Walk over all node structures and create a merged list of
1309            all public addresses that exist in the cluster.
1310
1311            keep the tree of ips around as ctdb->ip_tree
1312         */
1313         all_ips = create_merged_ip_list(ctdb);
1314
1315         /* If we want deterministic ip allocations, i.e. that the ip addresses
1316            will always be allocated the same way for a specific set of
1317            available/unavailable nodes.
           Every address is then given a node by its position in the
           list, modulo the number of nodes in the nodemap.
1318         */
1319         if (1 == ctdb->tunable.deterministic_public_ips) {              
1320                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1321                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1322                         tmp_ip->pnn = i%nodemap->num;
1323                 }
1324         }
1325
1326
1327         /* mark all public addresses with a masked node as being served by
1328            node -1
1329         */
1330         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1331                 if (tmp_ip->pnn == -1) {
1332                         continue;
1333                 }
1334                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1335                         tmp_ip->pnn = -1;
1336                 }
1337         }
1338
1339         /* verify that the assigned nodes can serve that public ip
1340            and set it to -1 if not
1341         */
1342         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1343                 if (tmp_ip->pnn == -1) {
1344                         continue;
1345                 }
1346                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1347                         /* this node can not serve this ip. */
1348                         tmp_ip->pnn = -1;
1349                 }
1350         }
1351
1352
1353         /* now we must redistribute all public addresses with takeover node
1354            -1 among the nodes available
1355         */
1356         retries = 0; /* counts rebalancing attempts; not reset when jumping back to try_again */
1357 try_again:
1358         /* loop over all ip's and find a physical node to cover for 
1359            each unassigned ip.
1360         */
1361         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1362                 if (tmp_ip->pnn == -1) {
1363                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1364                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1365                                         ctdb_addr_to_str(&tmp_ip->addr)));
1366                         }
1367                 }
1368         }
1369
1370         /* If we dont want ips to fail back after a node becomes healthy
1371            again, we wont even try to reallocate the ip addresses so that
1372            they are evenly spread out.
1373            This can NOT be used at the same time as DeterministicIPs !
           (if both are set an error is logged and rebalancing is
           still skipped)
1374         */
1375         if (1 == ctdb->tunable.no_ip_failback) {
1376                 if (1 == ctdb->tunable.deterministic_public_ips) {
1377                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1378                 }
1379                 goto finished;
1380         }
1381
1382
1383         /* now, try to make sure the ip addresses are evenly distributed
1384            across the node.
1385            for each ip address, loop over all nodes that can serve this
1386            ip and make sure that the difference between the node
1387            serving the most and the node serving the least ip's are not greater
1388            than 1.
1389         */
1390         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1391                 if (tmp_ip->pnn == -1) {
1392                         continue;
1393                 }
1394
1395                 /* Get the highest and lowest number of ips's served by any 
1396                    valid node which can serve this ip.
1397                 */
1398                 maxnode = -1;
1399                 minnode = -1;
1400                 for (i=0;i<nodemap->num;i++) {
1401                         if (nodemap->nodes[i].flags & mask) {
1402                                 continue;
1403                         }
1404
1405                         /* only check nodes that can actually serve this ip */
1406                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1407                                 /* no it couldnt   so skip to the next node */
1408                                 continue;
1409                         }
1410
1411                         num = node_ip_coverage(ctdb, i, all_ips);
1412                         if (maxnode == -1) {
1413                                 maxnode = i;
1414                                 maxnum  = num;
1415                         } else {
1416                                 if (num > maxnum) {
1417                                         maxnode = i;
1418                                         maxnum  = num;
1419                                 }
1420                         }
1421                         if (minnode == -1) {
1422                                 minnode = i;
1423                                 minnum  = num;
1424                         } else {
1425                                 if (num < minnum) {
1426                                         minnode = i;
1427                                         minnum  = num;
1428                                 }
1429                         }
1430                 }
1431                 if (maxnode == -1) {
1432                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1433                                 ctdb_addr_to_str(&tmp_ip->addr)));
1434
1435                         continue;
1436                 }
1437
1438                 /* If we want deterministic IPs then dont try to reallocate 
1439                    them to spread out the load.
1440                 */
1441                 if (1 == ctdb->tunable.deterministic_public_ips) {
1442                         continue;
1443                 }
1444
1445                 /* if the spread between the smallest and largest coverage by
1446                    a node is >=2 we steal one of the ips from the node with
1447                    most coverage to even things out a bit.
1448                    try to do this at most 5 times  since we dont want to spend
1449                    too much time balancing the ip coverage.
1450                 */
1451                 if ( (maxnum > minnum+1)
1452                   && (retries < 5) ){
1453                         struct ctdb_public_ip_list *tmp;
1454
1455                         /* mark one of maxnode's vnn's as unassigned and try
1456                            again
                           (the first entry in the list owned by
                           maxnode is the one that is picked)
1457                         */
1458                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1459                                 if (tmp->pnn == maxnode) {
1460                                         tmp->pnn = -1;
1461                                         retries++;
1462                                         goto try_again;
1463                                 }
1464                         }
1465                 }
1466         }
1467
1468
1469         /* finished distributing the public addresses, now just send the 
1470            info out to the nodes
1471         */
1472 finished:
1473
1474         /* at this point ->pnn is the node which will own each IP
1475            or -1 if there is no node that can cover this ip
1476         */
1477
1478         /* now tell all nodes to delete any alias that they should not
1479            have.  This will be a NOOP on nodes that don't currently
1480            hold the given alias */
1481         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1482         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1483
1484         for (i=0;i<nodemap->num;i++) {
1485                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1486                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1487                         continue;
1488                 }
1489
1490                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1491                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1492                                 /* This node should be serving this
1493                                    vnn so dont tell it to release the ip
1494                                 */
1495                                 continue;
1496                         }
                         /* AF_INET addresses go out with the IPv4-only
                            control and structure, all other addresses
                            with the generic one */
1497                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1498                                 ipv4.pnn = tmp_ip->pnn;
1499                                 ipv4.sin = tmp_ip->addr.ip;
1500
1501                                 timeout = TAKEOVER_TIMEOUT();
1502                                 data.dsize = sizeof(ipv4);
1503                                 data.dptr  = (uint8_t *)&ipv4;
1504                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1505                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1506                                                 data, async_data,
1507                                                 &timeout, NULL);
1508                         } else {
1509                                 ip.pnn  = tmp_ip->pnn;
1510                                 ip.addr = tmp_ip->addr;
1511
1512                                 timeout = TAKEOVER_TIMEOUT();
1513                                 data.dsize = sizeof(ip);
1514                                 data.dptr  = (uint8_t *)&ip;
1515                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1516                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1517                                                 data, async_data,
1518                                                 &timeout, NULL);
1519                         }
1520
1521                         if (state == NULL) {
1522                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1523                                 talloc_free(tmp_ctx);
1524                                 return -1;
1525                         }
1526                 
1527                         ctdb_client_async_add(async_data, state);
1528                 }
1529         }
         /* wait for the release controls before any takeover is sent */
1530         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1531                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1532                 talloc_free(tmp_ctx);
1533                 return -1;
1534         }
1535         talloc_free(async_data);
1536
1537
1538         /* tell all nodes to get their own IPs */
1539         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1540         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1541         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1542                 if (tmp_ip->pnn == -1) {
1543                         /* this IP won't be taken over */
1544                         continue;
1545                 }
1546
                 /* same split between the IPv4-only and the generic
                    control as for the release controls above */
1547                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1548                         ipv4.pnn = tmp_ip->pnn;
1549                         ipv4.sin = tmp_ip->addr.ip;
1550
1551                         timeout = TAKEOVER_TIMEOUT();
1552                         data.dsize = sizeof(ipv4);
1553                         data.dptr  = (uint8_t *)&ipv4;
1554                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1555                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1556                                         data, async_data,
1557                                         &timeout, NULL);
1558                 } else {
1559                         ip.pnn  = tmp_ip->pnn;
1560                         ip.addr = tmp_ip->addr;
1561
1562                         timeout = TAKEOVER_TIMEOUT();
1563                         data.dsize = sizeof(ip);
1564                         data.dptr  = (uint8_t *)&ip;
1565                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1566                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1567                                         data, async_data,
1568                                         &timeout, NULL);
1569                 }
1570                 if (state == NULL) {
1571                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1572                         talloc_free(tmp_ctx);
1573                         return -1;
1574                 }
1575                 
1576                 ctdb_client_async_add(async_data, state);
1577         }
1578         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1579                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1580                 talloc_free(tmp_ctx);
1581                 return -1;
1582         }
1583
1584 ipreallocated:
1585         /* run the "ipreallocated" event on the nodes returned by */
1586         /* list_of_connected_nodes(); a failure here is only logged */
1587         data.dptr  = discard_const("ipreallocated");
1588         data.dsize = strlen((char *)data.dptr) + 1; 
1589         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1590         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
1591                                       nodes, 0, TAKEOVER_TIMEOUT(),
1592                                       false, data,
1593                                       NULL, NULL,
1594                                       NULL) != 0) {
1595                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
1596         }
1597
1598         talloc_free(tmp_ctx);
1599         return 0;
1600 }
1601
1602
1603 /*
1604   destroy a ctdb_client_ip structure
1605  */
1606 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1607 {
1608         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1609                 ctdb_addr_to_str(&ip->addr),
1610                 ntohs(ip->addr.ip.sin_port),
1611                 ip->client_id));
1612
1613         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1614         return 0;
1615 }
1616
1617 /*
1618   called by a client to inform us of a TCP connection that it is managing
1619   that should tickled with an ACK when IP takeover is done
1620   we handle both the old ipv4 style of packets as well as the new ipv4/6
1621   pdus.
1622  */
1623 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1624                                 TDB_DATA indata)
1625 {
1626         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1627         struct ctdb_control_tcp *old_addr = NULL;
1628         struct ctdb_control_tcp_addr new_addr;
1629         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1630         struct ctdb_tcp_list *tcp;
1631         struct ctdb_tcp_connection t;
1632         int ret;
1633         TDB_DATA data;
1634         struct ctdb_client_ip *ip;
1635         struct ctdb_vnn *vnn;
1636         ctdb_sock_addr addr;
1637
1638         switch (indata.dsize) {
1639         case sizeof(struct ctdb_control_tcp):
1640                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1641                 ZERO_STRUCT(new_addr);
1642                 tcp_sock = &new_addr;
1643                 tcp_sock->src.ip  = old_addr->src;
1644                 tcp_sock->dest.ip = old_addr->dest;
1645                 break;
1646         case sizeof(struct ctdb_control_tcp_addr):
1647                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1648                 break;
1649         default:
1650                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1651                                  "to ctdb_control_tcp_client. size was %d but "
1652                                  "only allowed sizes are %lu and %lu\n",
1653                                  (int)indata.dsize,
1654                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1655                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1656                 return -1;
1657         }
1658
1659         addr = tcp_sock->src;
1660         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1661         addr = tcp_sock->dest;
1662         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1663
1664         ZERO_STRUCT(addr);
1665         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1666         vnn = find_public_ip_vnn(ctdb, &addr);
1667         if (vnn == NULL) {
1668                 switch (addr.sa.sa_family) {
1669                 case AF_INET:
1670                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1671                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1672                                         ctdb_addr_to_str(&addr)));
1673                         }
1674                         break;
1675                 case AF_INET6:
1676                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1677                                 ctdb_addr_to_str(&addr)));
1678                         break;
1679                 default:
1680                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1681                 }
1682
1683                 return 0;
1684         }
1685
1686         if (vnn->pnn != ctdb->pnn) {
1687                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1688                         ctdb_addr_to_str(&addr),
1689                         client_id, client->pid));
1690                 /* failing this call will tell smbd to die */
1691                 return -1;
1692         }
1693
1694         ip = talloc(client, struct ctdb_client_ip);
1695         CTDB_NO_MEMORY(ctdb, ip);
1696
1697         ip->ctdb      = ctdb;
1698         ip->addr      = addr;
1699         ip->client_id = client_id;
1700         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1701         DLIST_ADD(ctdb->client_ip_list, ip);
1702
1703         tcp = talloc(client, struct ctdb_tcp_list);
1704         CTDB_NO_MEMORY(ctdb, tcp);
1705
1706         tcp->connection.src_addr = tcp_sock->src;
1707         tcp->connection.dst_addr = tcp_sock->dest;
1708
1709         DLIST_ADD(client->tcp_list, tcp);
1710
1711         t.src_addr = tcp_sock->src;
1712         t.dst_addr = tcp_sock->dest;
1713
1714         data.dptr = (uint8_t *)&t;
1715         data.dsize = sizeof(t);
1716
1717         switch (addr.sa.sa_family) {
1718         case AF_INET:
1719                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1720                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1721                         ctdb_addr_to_str(&tcp_sock->src),
1722                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1723                 break;
1724         case AF_INET6:
1725                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1726                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1727                         ctdb_addr_to_str(&tcp_sock->src),
1728                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1729                 break;
1730         default:
1731                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1732         }
1733
1734
1735         /* tell all nodes about this tcp connection */
1736         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1737                                        CTDB_CONTROL_TCP_ADD,
1738                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1739         if (ret != 0) {
1740                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1741                 return -1;
1742         }
1743
1744         return 0;
1745 }
1746
1747 /*
1748   find a tcp address on a list
1749  */
1750 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1751                                            struct ctdb_tcp_connection *tcp)
1752 {
1753         int i;
1754
1755         if (array == NULL) {
1756                 return NULL;
1757         }
1758
1759         for (i=0;i<array->num;i++) {
1760                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1761                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1762                         return &array->connections[i];
1763                 }
1764         }
1765         return NULL;
1766 }
1767
1768
1769
1770 /*
1771   called by a daemon to inform us of a TCP connection that one of its
1772   clients managing that should tickled with an ACK when IP takeover is
1773   done
1774  */
1775 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
1776 {
1777         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
1778         struct ctdb_tcp_array *tcparray;
1779         struct ctdb_tcp_connection tcp;
1780         struct ctdb_vnn *vnn;
1781
1782         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
1783         if (vnn == NULL) {
1784                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1785                         ctdb_addr_to_str(&p->dst_addr)));
1786
1787                 return -1;
1788         }
1789
1790
1791         tcparray = vnn->tcp_array;
1792
1793         /* If this is the first tickle */
1794         if (tcparray == NULL) {
1795                 tcparray = talloc_size(ctdb->nodes, 
1796                         offsetof(struct ctdb_tcp_array, connections) +
1797                         sizeof(struct ctdb_tcp_connection) * 1);
1798                 CTDB_NO_MEMORY(ctdb, tcparray);
1799                 vnn->tcp_array = tcparray;
1800
1801                 tcparray->num = 0;
1802                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1803                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1804
1805                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
1806                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
1807                 tcparray->num++;
1808
1809                 if (tcp_update_needed) {
1810                         vnn->tcp_update_needed = true;
1811                 }
1812                 return 0;
1813         }
1814
1815
1816         /* Do we already have this tickle ?*/
1817         tcp.src_addr = p->src_addr;
1818         tcp.dst_addr = p->dst_addr;
1819         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1820                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1821                         ctdb_addr_to_str(&tcp.dst_addr),
1822                         ntohs(tcp.dst_addr.ip.sin_port),
1823                         vnn->pnn));
1824                 return 0;
1825         }
1826
1827         /* A new tickle, we must add it to the array */
1828         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1829                                         struct ctdb_tcp_connection,
1830                                         tcparray->num+1);
1831         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1832
1833         vnn->tcp_array = tcparray;
1834         tcparray->connections[tcparray->num].src_addr = p->src_addr;
1835         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
1836         tcparray->num++;
1837                                 
1838         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1839                 ctdb_addr_to_str(&tcp.dst_addr),
1840                 ntohs(tcp.dst_addr.ip.sin_port),
1841                 vnn->pnn));
1842
1843         if (tcp_update_needed) {
1844                 vnn->tcp_update_needed = true;
1845         }
1846
1847         return 0;
1848 }
1849
1850
1851 /*
1852   called by a daemon to inform us of a TCP connection that one of its
1853   clients managing that should tickled with an ACK when IP takeover is
1854   done
1855  */
1856 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1857 {
1858         struct ctdb_tcp_connection *tcpp;
1859         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1860
1861         if (vnn == NULL) {
1862                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1863                         ctdb_addr_to_str(&conn->dst_addr)));
1864                 return;
1865         }
1866
1867         /* if the array is empty we cant remove it
1868            and we dont need to do anything
1869          */
1870         if (vnn->tcp_array == NULL) {
1871                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1872                         ctdb_addr_to_str(&conn->dst_addr),
1873                         ntohs(conn->dst_addr.ip.sin_port)));
1874                 return;
1875         }
1876
1877
1878         /* See if we know this connection
1879            if we dont know this connection  then we dont need to do anything
1880          */
1881         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1882         if (tcpp == NULL) {
1883                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1884                         ctdb_addr_to_str(&conn->dst_addr),
1885                         ntohs(conn->dst_addr.ip.sin_port)));
1886                 return;
1887         }
1888
1889
1890         /* We need to remove this entry from the array.
1891            Instead of allocating a new array and copying data to it
1892            we cheat and just copy the last entry in the existing array
1893            to the entry that is to be removed and just shring the 
1894            ->num field
1895          */
1896         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1897         vnn->tcp_array->num--;
1898
1899         /* If we deleted the last entry we also need to remove the entire array
1900          */
1901         if (vnn->tcp_array->num == 0) {
1902                 talloc_free(vnn->tcp_array);
1903                 vnn->tcp_array = NULL;
1904         }               
1905
1906         vnn->tcp_update_needed = true;
1907
1908         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1909                 ctdb_addr_to_str(&conn->src_addr),
1910                 ntohs(conn->src_addr.ip.sin_port)));
1911 }
1912
1913
1914 /*
1915   called by a daemon to inform us of a TCP connection that one of its
1916   clients used are no longer needed in the tickle database
1917  */
1918 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
1919 {
1920         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
1921
1922         ctdb_remove_tcp_connection(ctdb, conn);
1923
1924         return 0;
1925 }
1926
1927
1928 /*
1929   called when a daemon restarts - meant to send all tickles for all public
1930   addresses we serve to the new node.  Still a stub: vnn is unused, returns 0.
1931  */
1932 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
1933 {
1934 /*XXX here we should send all tickles we are serving to the new node */
1935         return 0;
1936 }
1937
1938
1939 /*
1940   called when a client structure goes away - hook to remove
1941   elements from the tcp_list in all daemons
1942  */
1943 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1944 {
1945         while (client->tcp_list) {
1946                 struct ctdb_tcp_list *tcp = client->tcp_list;
1947                 DLIST_REMOVE(client->tcp_list, tcp);
1948                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1949         }
1950 }
1951
1952
1953 /*
1954   release all IPs on shutdown
1955  */
1956 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1957 {
1958         struct ctdb_vnn *vnn;
1959
1960         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1961                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1962                         ctdb_vnn_unassign_iface(ctdb, vnn);
1963                         continue;
1964                 }
1965                 if (!vnn->iface) {
1966                         continue;
1967                 }
1968                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1969                                   ctdb_vnn_iface_string(vnn),
1970                                   ctdb_addr_to_str(&vnn->public_address),
1971                                   vnn->public_netmask_bits);
1972                 release_kill_clients(ctdb, &vnn->public_address);
1973                 ctdb_vnn_unassign_iface(ctdb, vnn);
1974         }
1975 }
1976
1977
1978 /*
1979   get list of public IPs
1980  */
1981 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1982                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1983 {
1984         int i, num, len;
1985         struct ctdb_all_public_ips *ips;
1986         struct ctdb_vnn *vnn;
1987         bool only_available = false;
1988
1989         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
1990                 only_available = true;
1991         }
1992
1993         /* count how many public ip structures we have */
1994         num = 0;
1995         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1996                 num++;
1997         }
1998
1999         len = offsetof(struct ctdb_all_public_ips, ips) + 
2000                 num*sizeof(struct ctdb_public_ip);
2001         ips = talloc_zero_size(outdata, len);
2002         CTDB_NO_MEMORY(ctdb, ips);
2003
2004         i = 0;
2005         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2006                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2007                         continue;
2008                 }
2009                 ips->ips[i].pnn  = vnn->pnn;
2010                 ips->ips[i].addr = vnn->public_address;
2011                 i++;
2012         }
2013         ips->num = i;
2014         len = offsetof(struct ctdb_all_public_ips, ips) +
2015                 i*sizeof(struct ctdb_public_ip);
2016
2017         outdata->dsize = len;
2018         outdata->dptr  = (uint8_t *)ips;
2019
2020         return 0;
2021 }
2022
2023
2024 /*
2025   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2026  */
2027 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2028                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2029 {
2030         int i, num, len;
2031         struct ctdb_all_public_ipsv4 *ips;
2032         struct ctdb_vnn *vnn;
2033
2034         /* count how many public ip structures we have */
2035         num = 0;
2036         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2037                 if (vnn->public_address.sa.sa_family != AF_INET) {
2038                         continue;
2039                 }
2040                 num++;
2041         }
2042
2043         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2044                 num*sizeof(struct ctdb_public_ipv4);
2045         ips = talloc_zero_size(outdata, len);
2046         CTDB_NO_MEMORY(ctdb, ips);
2047
2048         outdata->dsize = len;
2049         outdata->dptr  = (uint8_t *)ips;
2050
2051         ips->num = num;
2052         i = 0;
2053         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2054                 if (vnn->public_address.sa.sa_family != AF_INET) {
2055                         continue;
2056                 }
2057                 ips->ips[i].pnn = vnn->pnn;
2058                 ips->ips[i].sin = vnn->public_address.ip;
2059                 i++;
2060         }
2061
2062         return 0;
2063 }
2064
2065 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2066                                         struct ctdb_req_control *c,
2067                                         TDB_DATA indata,
2068                                         TDB_DATA *outdata)
2069 {
2070         int i, num, len;
2071         ctdb_sock_addr *addr;
2072         struct ctdb_control_public_ip_info *info;
2073         struct ctdb_vnn *vnn;
2074
2075         addr = (ctdb_sock_addr *)indata.dptr;
2076
2077         vnn = find_public_ip_vnn(ctdb, addr);
2078         if (vnn == NULL) {
2079                 /* if it is not a public ip   it could be our 'single ip' */
2080                 if (ctdb->single_ip_vnn) {
2081                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2082                                 vnn = ctdb->single_ip_vnn;
2083                         }
2084                 }
2085         }
2086         if (vnn == NULL) {
2087                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2088                                  "'%s'not a public address\n",
2089                                  ctdb_addr_to_str(addr)));
2090                 return -1;
2091         }
2092
2093         /* count how many public ip structures we have */
2094         num = 0;
2095         for (;vnn->ifaces[num];) {
2096                 num++;
2097         }
2098
2099         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2100                 num*sizeof(struct ctdb_control_iface_info);
2101         info = talloc_zero_size(outdata, len);
2102         CTDB_NO_MEMORY(ctdb, info);
2103
2104         info->ip.addr = vnn->public_address;
2105         info->ip.pnn = vnn->pnn;
2106         info->active_idx = 0xFFFFFFFF;
2107
2108         for (i=0; vnn->ifaces[i]; i++) {
2109                 struct ctdb_iface *cur;
2110
2111                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2112                 if (cur == NULL) {
2113                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2114                                            vnn->ifaces[i]));
2115                         return -1;
2116                 }
2117                 if (vnn->iface == cur) {
2118                         info->active_idx = i;
2119                 }
2120                 strcpy(info->ifaces[i].name, cur->name);
2121                 info->ifaces[i].link_state = cur->link_up;
2122                 info->ifaces[i].references = cur->references;
2123         }
2124         info->num = i;
2125         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2126                 i*sizeof(struct ctdb_control_iface_info);
2127
2128         outdata->dsize = len;
2129         outdata->dptr  = (uint8_t *)info;
2130
2131         return 0;
2132 }
2133
2134 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2135                                 struct ctdb_req_control *c,
2136                                 TDB_DATA *outdata)
2137 {
2138         int i, num, len;
2139         struct ctdb_control_get_ifaces *ifaces;
2140         struct ctdb_iface *cur;
2141
2142         /* count how many public ip structures we have */
2143         num = 0;
2144         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2145                 num++;
2146         }
2147
2148         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2149                 num*sizeof(struct ctdb_control_iface_info);
2150         ifaces = talloc_zero_size(outdata, len);
2151         CTDB_NO_MEMORY(ctdb, ifaces);
2152
2153         i = 0;
2154         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2155                 strcpy(ifaces->ifaces[i].name, cur->name);
2156                 ifaces->ifaces[i].link_state = cur->link_up;
2157                 ifaces->ifaces[i].references = cur->references;
2158                 i++;
2159         }
2160         ifaces->num = i;
2161         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2162                 i*sizeof(struct ctdb_control_iface_info);
2163
2164         outdata->dsize = len;
2165         outdata->dptr  = (uint8_t *)ifaces;
2166
2167         return 0;
2168 }
2169
2170 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2171                                     struct ctdb_req_control *c,
2172                                     TDB_DATA indata)
2173 {
2174         struct ctdb_control_iface_info *info;
2175         struct ctdb_iface *iface;
2176         bool link_up = false;
2177
2178         info = (struct ctdb_control_iface_info *)indata.dptr;
2179
2180         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2181                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2182                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2183                                   len, len, info->name));
2184                 return -1;
2185         }
2186
2187         switch (info->link_state) {
2188         case 0:
2189                 link_up = false;
2190                 break;
2191         case 1:
2192                 link_up = true;
2193                 break;
2194         default:
2195                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2196                                   (unsigned int)info->link_state));
2197                 return -1;
2198         }
2199
2200         if (info->references != 0) {
2201                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2202                                   (unsigned int)info->references));
2203                 return -1;
2204         }
2205
2206         iface = ctdb_find_iface(ctdb, info->name);
2207         if (iface == NULL) {
2208                 DEBUG(DEBUG_ERR, (__location__ "iface[%s] is unknown\n",
2209                                   info->name));
2210                 return -1;
2211         }
2212
2213         if (link_up == iface->link_up) {
2214                 return 0;
2215         }
2216
2217         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2218               ("iface[%s] has changed it's link status %s => %s\n",
2219                iface->name,
2220                iface->link_up?"up":"down",
2221                link_up?"up":"down"));
2222
2223         iface->link_up = link_up;
2224         return 0;
2225 }
2226
2227
2228 /* 
2229    structure containing the capture socket and the set of tcp connections
2230    that the ctdb daemon is to kill; a vnn refers to it through vnn->killtcp
2231 */
2232 struct ctdb_kill_tcp {
2233         struct ctdb_vnn *vnn; /* vnn whose ->killtcp is reset to NULL by ctdb_killtcp_destructor */
2234         struct ctdb_context *ctdb; /* ctdb context; its ->ev is used to schedule the tickle timer */
2235         int capture_fd; /* fd handed to ctdb_sys_read_tcp_packet(); initialised to -1 */
2236         struct fd_event *fde; /* presumably the fd event watching capture_fd - TODO confirm */
2237         trbt_tree_t *connections; /* struct ctdb_killtcp_con entries, looked up with killtcp_key() */
2238         void *private_data; /* opaque pointer handed to ctdb_sys_read_tcp_packet() */
2239 };
2240
2241 /*
2242   a tcp connection that is to be killed (one entry in ctdb_kill_tcp->connections)
2243  */
2244 struct ctdb_killtcp_con {
2245         ctdb_sock_addr src_addr; /* the two endpoints of the connection */
2246         ctdb_sock_addr dst_addr;
2247         int count; /* tickles sent so far; tickle_connection_traverse gives up at 5 */
2248         struct ctdb_kill_tcp *killtcp; /* presumably the owning kill structure - TODO confirm */
2249 };
2250
2251 /* this function is used to create a key to represent this socketpair
2252    in the killtcp tree.
2253    this key is used to insert and lookup matching socketpairs that are
2254    to be tickled and RST
2255 */
2256 #define KILLTCP_KEYLEN  10
2257 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2258 {
2259         static uint32_t key[KILLTCP_KEYLEN];
2260
2261         bzero(key, sizeof(key));
2262
2263         if (src->sa.sa_family != dst->sa.sa_family) {
2264                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2265                 return key;
2266         }
2267         
2268         switch (src->sa.sa_family) {
2269         case AF_INET:
2270                 key[0]  = dst->ip.sin_addr.s_addr;
2271                 key[1]  = src->ip.sin_addr.s_addr;
2272                 key[2]  = dst->ip.sin_port;
2273                 key[3]  = src->ip.sin_port;
2274                 break;
2275         case AF_INET6:
2276                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2277                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2278                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2279                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2280                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2281                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2282                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2283                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2284                 key[8]  = dst->ip6.sin6_port;
2285                 key[9]  = src->ip6.sin6_port;
2286                 break;
2287         default:
2288                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2289                 return key;
2290         }
2291
2292         return key;
2293 }
2294
2295 /*
2296   called when we get a read event on the raw socket
2297  */
2298 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2299                                 uint16_t flags, void *private_data)
2300 {
2301         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2302         struct ctdb_killtcp_con *con;
2303         ctdb_sock_addr src, dst;
2304         uint32_t ack_seq, seq;
2305
2306         if (!(flags & EVENT_FD_READ)) {
2307                 return;
2308         }
2309
2310         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2311                                 killtcp->private_data,
2312                                 &src, &dst,
2313                                 &ack_seq, &seq) != 0) {
2314                 /* probably a non-tcp ACK packet */
2315                 return;
2316         }
2317
2318         /* check if we have this guy in our list of connections
2319            to kill
2320         */
2321         con = trbt_lookuparray32(killtcp->connections, 
2322                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2323         if (con == NULL) {
2324                 /* no this was some other packet we can just ignore */
2325                 return;
2326         }
2327
2328         /* This one has been tickled !
2329            now reset him and remove him from the list.
2330          */
2331         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2332                 ntohs(con->dst_addr.ip.sin_port),
2333                 ctdb_addr_to_str(&con->src_addr),
2334                 ntohs(con->src_addr.ip.sin_port)));
2335
2336         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2337         talloc_free(con);
2338 }
2339
2340
2341 /* when traversing the list of all tcp connections to send tickle acks to
2342    (so that we can capture the ack coming back and kill the connection
2343     by a RST)
2344    this callback is called for each connection we are currently trying to kill
2345 */
2346 static void tickle_connection_traverse(void *param, void *data)
2347 {
2348         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2349
2350         /* have tried too many times, just give up */
2351         if (con->count >= 5) {
2352                 /* can't delete in traverse: reparent to delete_cons */
2353                 talloc_steal(param, con);
2354                 return;
2355         }
2356
2357         /* othervise, try tickling it again */
2358         con->count++;
2359         ctdb_sys_send_tcp(
2360                 (ctdb_sock_addr *)&con->dst_addr,
2361                 (ctdb_sock_addr *)&con->src_addr,
2362                 0, 0, 0);
2363 }
2364
2365
2366 /* 
2367    called every second until all sentenced connections have been reset
2368  */
2369 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2370                                               struct timeval t, void *private_data)
2371 {
2372         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2373         void *delete_cons = talloc_new(NULL);
2374
2375         /* loop over all connections sending tickle ACKs */
2376         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2377
2378         /* now we've finished traverse, it's safe to do deletion. */
2379         talloc_free(delete_cons);
2380
2381         /* If there are no more connections to kill we can remove the
2382            entire killtcp structure
2383          */
2384         if ( (killtcp->connections == NULL) || 
2385              (killtcp->connections->root == NULL) ) {
2386                 talloc_free(killtcp);
2387                 return;
2388         }
2389
2390         /* try tickling them again in a seconds time
2391          */
2392         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2393                         ctdb_tickle_sentenced_connections, killtcp);
2394 }
2395
2396 /*
2397   destroy the killtcp structure
2398  */
2399 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2400 {
2401         if (killtcp->vnn) {
2402                 killtcp->vnn->killtcp = NULL;
2403         }
2404         return 0;
2405 }
2406
2407
2408 /* nothing fancy here, just unconditionally replace any existing
2409    connection structure with the new one: parm is returned, data is ignored.
2410
2411    don't even free the old one if it did exist; that one is talloc_stolen
2412    by the same node in the tree anyway and will be deleted when the new data
2413    is deleted
2414 */
2415 static void *add_killtcp_callback(void *parm, void *data)
2416 {
2417         return parm;
2418 }
2419
2420 /*
2421   add a tcp socket to the list of connections we want to RST
2422  */
2423 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2424                                        ctdb_sock_addr *s,
2425                                        ctdb_sock_addr *d)
2426 {
2427         ctdb_sock_addr src, dst;
2428         struct ctdb_kill_tcp *killtcp;
2429         struct ctdb_killtcp_con *con;
2430         struct ctdb_vnn *vnn;
2431
2432         ctdb_canonicalize_ip(s, &src);
2433         ctdb_canonicalize_ip(d, &dst);
2434
2435         vnn = find_public_ip_vnn(ctdb, &dst);
2436         if (vnn == NULL) {
2437                 vnn = find_public_ip_vnn(ctdb, &src);
2438         }
2439         if (vnn == NULL) {
2440                 /* if it is not a public ip   it could be our 'single ip' */
2441                 if (ctdb->single_ip_vnn) {
2442                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2443                                 vnn = ctdb->single_ip_vnn;
2444                         }
2445                 }
2446         }
2447         if (vnn == NULL) {
2448                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2449                 return -1;
2450         }
2451
2452         killtcp = vnn->killtcp;
2453         
2454         /* If this is the first connection to kill we must allocate
2455            a new structure
2456          */
2457         if (killtcp == NULL) {
2458                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2459                 CTDB_NO_MEMORY(ctdb, killtcp);
2460
2461                 killtcp->vnn         = vnn;
2462                 killtcp->ctdb        = ctdb;
2463                 killtcp->capture_fd  = -1;
2464                 killtcp->connections = trbt_create(killtcp, 0);
2465
2466                 vnn->killtcp         = killtcp;
2467                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2468         }
2469
2470
2471
2472         /* create a structure that describes this connection we want to
2473            RST and store it in killtcp->connections
2474         */
2475         con = talloc(killtcp, struct ctdb_killtcp_con);
2476         CTDB_NO_MEMORY(ctdb, con);
2477         con->src_addr = src;
2478         con->dst_addr = dst;
2479         con->count    = 0;
2480         con->killtcp  = killtcp;
2481
2482
2483         trbt_insertarray32_callback(killtcp->connections,
2484                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2485                         add_killtcp_callback, con);
2486
2487         /* 
2488            If we dont have a socket to listen on yet we must create it
2489          */
2490         if (killtcp->capture_fd == -1) {
2491                 const char *iface = ctdb_vnn_iface_string(vnn);
2492                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2493                 if (killtcp->capture_fd == -1) {
2494                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2495                                           "socket on iface '%s' for killtcp (%s)\n",
2496                                           iface, strerror(errno)));
2497                         goto failed;
2498                 }
2499         }
2500
2501
2502         if (killtcp->fde == NULL) {
2503                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2504                                             EVENT_FD_READ,
2505                                             capture_tcp_handler, killtcp);
2506                 tevent_fd_set_auto_close(killtcp->fde);
2507
2508                 /* We also need to set up some events to tickle all these connections
2509                    until they are all reset
2510                 */
2511                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2512                                 ctdb_tickle_sentenced_connections, killtcp);
2513         }
2514
2515         /* tickle him once now */
2516         ctdb_sys_send_tcp(
2517                 &con->dst_addr,
2518                 &con->src_addr,
2519                 0, 0, 0);
2520
2521         return 0;
2522
2523 failed:
2524         talloc_free(vnn->killtcp);
2525         vnn->killtcp = NULL;
2526         return -1;
2527 }
2528
2529 /*
2530   kill a TCP connection.
2531  */
2532 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2533 {
2534         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2535
2536         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2537 }
2538
2539 /*
2540   called by a daemon to inform us of the entire list of TCP tickles for
2541   a particular public address.
2542   this control should only be sent by the node that is currently serving
2543   that public address.
2544  */
2545 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2546 {
2547         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2548         struct ctdb_tcp_array *tcparray;
2549         struct ctdb_vnn *vnn;
2550
2551         /* We must at least have tickles.num or else we cant verify the size
2552            of the received data blob
2553          */
2554         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2555                                         tickles.connections)) {
2556                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2557                 return -1;
2558         }
2559
2560         /* verify that the size of data matches what we expect */
2561         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2562                                 tickles.connections)
2563                          + sizeof(struct ctdb_tcp_connection)
2564                                  * list->tickles.num) {
2565                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2566                 return -1;
2567         }       
2568
2569         vnn = find_public_ip_vnn(ctdb, &list->addr);
2570         if (vnn == NULL) {
2571                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
2572                         ctdb_addr_to_str(&list->addr)));
2573
2574                 return 1;
2575         }
2576
2577         /* remove any old ticklelist we might have */
2578         talloc_free(vnn->tcp_array);
2579         vnn->tcp_array = NULL;
2580
2581         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2582         CTDB_NO_MEMORY(ctdb, tcparray);
2583
2584         tcparray->num = list->tickles.num;
2585
2586         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2587         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2588
2589         memcpy(tcparray->connections, &list->tickles.connections[0], 
2590                sizeof(struct ctdb_tcp_connection)*tcparray->num);
2591
2592         /* We now have a new fresh tickle list array for this vnn */
2593         vnn->tcp_array = talloc_steal(vnn, tcparray);
2594         
2595         return 0;
2596 }
2597
2598 /*
2599   called to return the full list of tickles for the puclic address associated 
2600   with the provided vnn
2601  */
2602 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2603 {
2604         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2605         struct ctdb_control_tcp_tickle_list *list;
2606         struct ctdb_tcp_array *tcparray;
2607         int num;
2608         struct ctdb_vnn *vnn;
2609
2610         vnn = find_public_ip_vnn(ctdb, addr);
2611         if (vnn == NULL) {
2612                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
2613                         ctdb_addr_to_str(addr)));
2614
2615                 return 1;
2616         }
2617
2618         tcparray = vnn->tcp_array;
2619         if (tcparray) {
2620                 num = tcparray->num;
2621         } else {
2622                 num = 0;
2623         }
2624
2625         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2626                                 tickles.connections)
2627                         + sizeof(struct ctdb_tcp_connection) * num;
2628
2629         outdata->dptr  = talloc_size(outdata, outdata->dsize);
2630         CTDB_NO_MEMORY(ctdb, outdata->dptr);
2631         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2632
2633         list->addr = *addr;
2634         list->tickles.num = num;
2635         if (num) {
2636                 memcpy(&list->tickles.connections[0], tcparray->connections, 
2637                         sizeof(struct ctdb_tcp_connection) * num);
2638         }
2639
2640         return 0;
2641 }
2642
2643
2644 /*
2645   set the list of all tcp tickles for a public address
2646  */
2647 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
2648                               struct timeval timeout, uint32_t destnode, 
2649                               ctdb_sock_addr *addr,
2650                               struct ctdb_tcp_array *tcparray)
2651 {
2652         int ret, num;
2653         TDB_DATA data;
2654         struct ctdb_control_tcp_tickle_list *list;
2655
2656         if (tcparray) {
2657                 num = tcparray->num;
2658         } else {
2659                 num = 0;
2660         }
2661
2662         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2663                                 tickles.connections) +
2664                         sizeof(struct ctdb_tcp_connection) * num;
2665         data.dptr = talloc_size(ctdb, data.dsize);
2666         CTDB_NO_MEMORY(ctdb, data.dptr);
2667
2668         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2669         list->addr = *addr;
2670         list->tickles.num = num;
2671         if (tcparray) {
2672                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2673         }
2674
2675         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2676                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2677                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2678         if (ret != 0) {
2679                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2680                 return -1;
2681         }
2682
2683         talloc_free(data.dptr);
2684
2685         return ret;
2686 }
2687
2688
2689 /*
2690   perform tickle updates if required
2691  */
2692 static void ctdb_update_tcp_tickles(struct event_context *ev, 
2693                                 struct timed_event *te, 
2694                                 struct timeval t, void *private_data)
2695 {
2696         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2697         int ret;
2698         struct ctdb_vnn *vnn;
2699
2700         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2701                 /* we only send out updates for public addresses that 
2702                    we have taken over
2703                  */
2704                 if (ctdb->pnn != vnn->pnn) {
2705                         continue;
2706                 }
2707                 /* We only send out the updates if we need to */
2708                 if (!vnn->tcp_update_needed) {
2709                         continue;
2710                 }
2711                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
2712                                 TAKEOVER_TIMEOUT(),
2713                                 CTDB_BROADCAST_CONNECTED,
2714                                 &vnn->public_address,
2715                                 vnn->tcp_array);
2716                 if (ret != 0) {
2717                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2718                                 ctdb_addr_to_str(&vnn->public_address)));
2719                 }
2720         }
2721
2722         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2723                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2724                              ctdb_update_tcp_tickles, ctdb);
2725 }               
2726         
2727
2728 /*
2729   start periodic update of tcp tickles
2730  */
2731 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2732 {
2733         ctdb->tickle_update_context = talloc_new(ctdb);
2734
2735         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2736                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2737                              ctdb_update_tcp_tickles, ctdb);
2738 }
2739
2740
2741
2742
2743 struct control_gratious_arp {
2744         struct ctdb_context *ctdb;
2745         ctdb_sock_addr addr;
2746         const char *iface;
2747         int count;
2748 };
2749
2750 /*
2751   send a control_gratuitous arp
2752  */
2753 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2754                                   struct timeval t, void *private_data)
2755 {
2756         int ret;
2757         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2758                                                         struct control_gratious_arp);
2759
2760         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2761         if (ret != 0) {
2762                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2763                                  arp->iface, strerror(errno)));
2764         }
2765
2766
2767         arp->count++;
2768         if (arp->count == CTDB_ARP_REPEAT) {
2769                 talloc_free(arp);
2770                 return;
2771         }
2772
2773         event_add_timed(arp->ctdb->ev, arp, 
2774                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2775                         send_gratious_arp, arp);
2776 }
2777
2778
2779 /*
2780   send a gratious arp 
2781  */
2782 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2783 {
2784         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2785         struct control_gratious_arp *arp;
2786
2787         /* verify the size of indata */
2788         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2789                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2790                                  (unsigned)indata.dsize, 
2791                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2792                 return -1;
2793         }
2794         if (indata.dsize != 
2795                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2796                 + gratious_arp->len ) ){
2797
2798                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2799                         "but should be %u bytes\n", 
2800                          (unsigned)indata.dsize, 
2801                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2802                 return -1;
2803         }
2804
2805
2806         arp = talloc(ctdb, struct control_gratious_arp);
2807         CTDB_NO_MEMORY(ctdb, arp);
2808
2809         arp->ctdb  = ctdb;
2810         arp->addr   = gratious_arp->addr;
2811         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2812         CTDB_NO_MEMORY(ctdb, arp->iface);
2813         arp->count = 0;
2814         
2815         event_add_timed(arp->ctdb->ev, arp, 
2816                         timeval_zero(), send_gratious_arp, arp);
2817
2818         return 0;
2819 }
2820
2821 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2822 {
2823         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2824         int ret;
2825
2826         /* verify the size of indata */
2827         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2828                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2829                 return -1;
2830         }
2831         if (indata.dsize != 
2832                 ( offsetof(struct ctdb_control_ip_iface, iface)
2833                 + pub->len ) ){
2834
2835                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2836                         "but should be %u bytes\n", 
2837                          (unsigned)indata.dsize, 
2838                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2839                 return -1;
2840         }
2841
2842         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2843
2844         if (ret != 0) {
2845                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2846                 return -1;
2847         }
2848
2849         return 0;
2850 }
2851
/*
  completion callback for the releaseip event started by
  del_public_address: all it has to do is free the talloc context that
  was passed along as private_data.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        void *event_mem_ctx = private_data;

        talloc_free(event_mem_ctx);
}
2860
2861 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2862 {
2863         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2864         struct ctdb_vnn *vnn;
2865         int ret;
2866
2867         /* verify the size of indata */
2868         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2869                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2870                 return -1;
2871         }
2872         if (indata.dsize != 
2873                 ( offsetof(struct ctdb_control_ip_iface, iface)
2874                 + pub->len ) ){
2875
2876                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2877                         "but should be %u bytes\n", 
2878                          (unsigned)indata.dsize, 
2879                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2880                 return -1;
2881         }
2882
2883         /* walk over all public addresses until we find a match */
2884         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2885                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2886                         TALLOC_CTX *mem_ctx;
2887
2888                         DLIST_REMOVE(ctdb->vnn, vnn);
2889                         if (vnn->iface == NULL) {
2890                                 talloc_free(vnn);
2891                                 return 0;
2892                         }
2893
2894                         mem_ctx = talloc_new(ctdb);
2895                         ret = ctdb_event_script_callback(ctdb, 
2896                                          mem_ctx, delete_ip_callback, mem_ctx,
2897                                          false,
2898                                          CTDB_EVENT_RELEASE_IP,
2899                                          "%s %s %u",
2900                                          ctdb_vnn_iface_string(vnn),
2901                                          ctdb_addr_to_str(&vnn->public_address),
2902                                          vnn->public_netmask_bits);
2903                         ctdb_vnn_unassign_iface(ctdb, vnn);
2904                         talloc_free(vnn);
2905                         if (ret != 0) {
2906                                 return -1;
2907                         }
2908                         return 0;
2909                 }
2910         }
2911
2912         return -1;
2913 }
2914
2915 /* This function is called from the recovery daemon to verify that a remote
2916    node has the expected ip allocation.
2917    This is verified against ctdb->ip_tree
2918 */
2919 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
2920 {
2921         struct ctdb_public_ip_list *tmp_ip; 
2922         int i;
2923
2924         if (ctdb->ip_tree == NULL) {
2925                 /* dont know the expected allocation yet, assume remote node
2926                    is correct. */
2927                 return 0;
2928         }
2929
2930         if (ips == NULL) {
2931                 return 0;
2932         }
2933
2934         for (i=0; i<ips->num; i++) {
2935                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
2936                 if (tmp_ip == NULL) {
2937                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
2938                         return -1;
2939                 }
2940
2941                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
2942                         continue;
2943                 }
2944
2945                 if (tmp_ip->pnn != ips->ips[i].pnn) {
2946                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
2947                         return -1;
2948                 }
2949         }
2950
2951         return 0;
2952 }
2953
2954 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
2955 {
2956         struct ctdb_public_ip_list *tmp_ip; 
2957
2958         if (ctdb->ip_tree == NULL) {
2959                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
2960                 return -1;
2961         }
2962
2963         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
2964         if (tmp_ip == NULL) {
2965                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
2966                 return -1;
2967         }
2968
2969         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
2970         tmp_ip->pnn = ip->pnn;
2971
2972         return 0;
2973 }