d4958079d7c41c1edec98cf142f8f295ab82bbbd
[sahlberg/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/tevent/tevent.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
  timeval_current_ofs() of the takeover_timeout tunable
  expects a variable called 'ctdb' to be in scope where it is expanded
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/*
  CTDB_ARP_INTERVAL is the first argument given to timeval_current_ofs()
  when ctdb_control_send_arp() reschedules itself, CTDB_ARP_REPEAT is the
  number of announcement rounds it runs before freeing its state
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
  one local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
	/* linkage for the ctdb->ifaces list */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, allocated as a talloc child of this structure */
	const char *name;
	/* link state; entries are created with this set to false */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         i->link_up = false;
69
70         DLIST_ADD(ctdb->ifaces, i);
71
72         return 0;
73 }
74
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
76                                           const char *iface)
77 {
78         struct ctdb_iface *i;
79
80         /* Verify that we dont have an entry for this ip yet */
81         for (i=ctdb->ifaces;i;i=i->next) {
82                 if (strcmp(i->name, iface) == 0) {
83                         return i;
84                 }
85         }
86
87         return NULL;
88 }
89
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
91                                               struct ctdb_vnn *vnn)
92 {
93         int i;
94         struct ctdb_iface *cur = NULL;
95         struct ctdb_iface *best = NULL;
96
97         for (i=0; vnn->ifaces[i]; i++) {
98
99                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
100                 if (cur == NULL) {
101                         continue;
102                 }
103
104                 if (!cur->link_up) {
105                         continue;
106                 }
107
108                 if (best == NULL) {
109                         best = cur;
110                         continue;
111                 }
112
113                 if (cur->references < best->references) {
114                         best = cur;
115                         continue;
116                 }
117         }
118
119         return best;
120 }
121
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123                                      struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *best = NULL;
126
127         if (vnn->iface) {
128                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129                                    "still assigned to iface '%s'\n",
130                                    ctdb_addr_to_str(&vnn->public_address),
131                                    ctdb_vnn_iface_string(vnn)));
132                 return 0;
133         }
134
135         best = ctdb_vnn_best_iface(ctdb, vnn);
136         if (best == NULL) {
137                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138                                   "cannot assign to iface any iface\n",
139                                   ctdb_addr_to_str(&vnn->public_address)));
140                 return -1;
141         }
142
143         vnn->iface = best;
144         best->references++;
145         vnn->pnn = ctdb->pnn;
146
147         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148                            "now assigned to iface '%s' refs[%d]\n",
149                            ctdb_addr_to_str(&vnn->public_address),
150                            ctdb_vnn_iface_string(vnn),
151                            best->references));
152         return 0;
153 }
154
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156                                     struct ctdb_vnn *vnn)
157 {
158         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159                            "now unassigned (old iface '%s' refs[%d])\n",
160                            ctdb_addr_to_str(&vnn->public_address),
161                            ctdb_vnn_iface_string(vnn),
162                            vnn->iface?vnn->iface->references:0));
163         if (vnn->iface) {
164                 vnn->iface->references--;
165         }
166         vnn->iface = NULL;
167         if (vnn->pnn == ctdb->pnn) {
168                 vnn->pnn = -1;
169         }
170 }
171
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173                                struct ctdb_vnn *vnn)
174 {
175         int i;
176
177         if (vnn->iface && vnn->iface->link_up) {
178                 return true;
179         }
180
181         for (i=0; vnn->ifaces[i]; i++) {
182                 struct ctdb_iface *cur;
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (cur->link_up) {
190                         return true;
191                 }
192         }
193
194         return false;
195 }
196
197 struct ctdb_takeover_arp {
198         struct ctdb_context *ctdb;
199         uint32_t count;
200         ctdb_sock_addr addr;
201         struct ctdb_tcp_array *tcparray;
202         struct ctdb_vnn *vnn;
203 };
204
205
206 /*
207   lists of tcp endpoints
208  */
209 struct ctdb_tcp_list {
210         struct ctdb_tcp_list *prev, *next;
211         struct ctdb_tcp_connection connection;
212 };
213
214 /*
215   list of clients to kill on IP release
216  */
217 struct ctdb_client_ip {
218         struct ctdb_client_ip *prev, *next;
219         struct ctdb_context *ctdb;
220         ctdb_sock_addr addr;
221         uint32_t client_id;
222 };
223
224
225 /*
226   send a gratuitous arp
227  */
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
229                                   struct timeval t, void *private_data)
230 {
231         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
232                                                         struct ctdb_takeover_arp);
233         int i, ret;
234         struct ctdb_tcp_array *tcparray;
235         const char *iface = ctdb_vnn_iface_string(arp->vnn);
236
237         ret = ctdb_sys_send_arp(&arp->addr, iface);
238         if (ret != 0) {
239                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240                                   iface, strerror(errno)));
241         }
242
243         tcparray = arp->tcparray;
244         if (tcparray) {
245                 for (i=0;i<tcparray->num;i++) {
246                         struct ctdb_tcp_connection *tcon;
247
248                         tcon = &tcparray->connections[i];
249                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
251                                 ctdb_addr_to_str(&tcon->src_addr),
252                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253                         ret = ctdb_sys_send_tcp(
254                                 &tcon->src_addr, 
255                                 &tcon->dst_addr,
256                                 0, 0, 0);
257                         if (ret != 0) {
258                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259                                         ctdb_addr_to_str(&tcon->src_addr)));
260                         }
261                 }
262         }
263
264         arp->count++;
265
266         if (arp->count == CTDB_ARP_REPEAT) {
267                 talloc_free(arp);
268                 return;
269         }
270
271         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
272                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
273                         ctdb_control_send_arp, arp);
274 }
275
276 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
277                                        struct ctdb_vnn *vnn)
278 {
279         struct ctdb_takeover_arp *arp;
280         struct ctdb_tcp_array *tcparray;
281
282         if (!vnn->takeover_ctx) {
283                 vnn->takeover_ctx = talloc_new(vnn);
284                 if (!vnn->takeover_ctx) {
285                         return -1;
286                 }
287         }
288
289         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
290         if (!arp) {
291                 return -1;
292         }
293
294         arp->ctdb = ctdb;
295         arp->addr = vnn->public_address;
296         arp->vnn  = vnn;
297
298         tcparray = vnn->tcp_array;
299         if (tcparray) {
300                 /* add all of the known tcp connections for this IP to the
301                    list of tcp connections to send tickle acks for */
302                 arp->tcparray = talloc_steal(arp, tcparray);
303
304                 vnn->tcp_array = NULL;
305                 vnn->tcp_update_needed = true;
306         }
307
308         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
309                         timeval_zero(), ctdb_control_send_arp, arp);
310
311         return 0;
312 }
313
314 struct takeover_callback_state {
315         struct ctdb_req_control *c;
316         ctdb_sock_addr *addr;
317         struct ctdb_vnn *vnn;
318 };
319
/*
  state handed to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* the control request that is answered from the callback */
	struct ctdb_req_control *c;
	/* the public address entry being taken over */
	struct ctdb_vnn *vnn;
};
324
325 /*
326   called when takeip event finishes
327  */
328 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
329                                     void *private_data)
330 {
331         struct ctdb_do_takeip_state *state =
332                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
333         int32_t ret;
334         TDB_DATA data;
335
336         if (status != 0) {
337                 if (status == -ETIME) {
338                         ctdb_ban_self(ctdb);
339                 }
340                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
341                                  ctdb_addr_to_str(&state->vnn->public_address),
342                                  ctdb_vnn_iface_string(state->vnn)));
343                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
344                 talloc_free(state);
345                 return;
346         }
347
348         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
349         if (ret != 0) {
350                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
351                 talloc_free(state);
352                 return;
353         }
354
355         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
356         data.dsize = strlen((char *)data.dptr) + 1;
357         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
358
359         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
360
361
362         /* the control succeeded */
363         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
364         talloc_free(state);
365         return;
366 }
367
368 /*
369   take over an ip address
370  */
371 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
372                               struct ctdb_req_control *c,
373                               struct ctdb_vnn *vnn)
374 {
375         int ret;
376         struct ctdb_do_takeip_state *state;
377
378         ret = ctdb_vnn_assign_iface(ctdb, vnn);
379         if (ret != 0) {
380                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
381                                  "assin a usable interface\n",
382                                  ctdb_addr_to_str(&vnn->public_address),
383                                  vnn->public_netmask_bits));
384                 return -1;
385         }
386
387         state = talloc(vnn, struct ctdb_do_takeip_state);
388         CTDB_NO_MEMORY(ctdb, state);
389
390         state->c = talloc_steal(ctdb, c);
391         state->vnn   = vnn;
392
393         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
394                             ctdb_addr_to_str(&vnn->public_address),
395                             vnn->public_netmask_bits,
396                             ctdb_vnn_iface_string(vnn)));
397
398         ret = ctdb_event_script_callback(ctdb,
399                                          state,
400                                          ctdb_do_takeip_callback,
401                                          state,
402                                          false,
403                                          CTDB_EVENT_TAKE_IP,
404                                          "%s %s %u",
405                                          ctdb_vnn_iface_string(vnn),
406                                          ctdb_addr_to_str(&vnn->public_address),
407                                          vnn->public_netmask_bits);
408
409         if (ret != 0) {
410                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
411                         ctdb_addr_to_str(&vnn->public_address),
412                         ctdb_vnn_iface_string(vnn)));
413                 talloc_free(state);
414                 return -1;
415         }
416
417         return 0;
418 }
419
/*
  state handed to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* the control request that is answered from the callback */
	struct ctdb_req_control *c;
	/* the interface the address was assigned to before the move */
	struct ctdb_iface *old;
	/* the public address entry being moved */
	struct ctdb_vnn *vnn;
};
425
426 /*
427   called when updateip event finishes
428  */
429 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
430                                       void *private_data)
431 {
432         struct ctdb_do_updateip_state *state =
433                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
434         int32_t ret;
435
436         if (status != 0) {
437                 if (status == -ETIME) {
438                         ctdb_ban_self(ctdb);
439                 }
440                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
441                         ctdb_addr_to_str(&state->vnn->public_address),
442                         state->old->name,
443                         ctdb_vnn_iface_string(state->vnn)));
444
445                 /*
446                  * All we can do is reset the old interface
447                  * and let the next run fix it
448                  */
449                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
450                 state->vnn->iface = state->old;
451                 state->vnn->iface->references++;
452
453                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
454                 talloc_free(state);
455                 return;
456         }
457
458         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
459         if (ret != 0) {
460                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
461                 talloc_free(state);
462                 return;
463         }
464
465         /* the control succeeded */
466         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
467         talloc_free(state);
468         return;
469 }
470
471 /*
472   update (move) an ip address
473  */
474 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
475                                 struct ctdb_req_control *c,
476                                 struct ctdb_vnn *vnn)
477 {
478         int ret;
479         struct ctdb_do_updateip_state *state;
480         struct ctdb_iface *old = vnn->iface;
481
482         ctdb_vnn_unassign_iface(ctdb, vnn);
483         ret = ctdb_vnn_assign_iface(ctdb, vnn);
484         if (ret != 0) {
485                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
486                                  "assin a usable interface (old iface '%s')\n",
487                                  ctdb_addr_to_str(&vnn->public_address),
488                                  vnn->public_netmask_bits,
489                                  old->name));
490                 return -1;
491         }
492
493         state = talloc(vnn, struct ctdb_do_updateip_state);
494         CTDB_NO_MEMORY(ctdb, state);
495
496         state->c = talloc_steal(ctdb, c);
497         state->old = old;
498         state->vnn = vnn;
499
500         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
501                             "interface %s to %s\n",
502                             ctdb_addr_to_str(&vnn->public_address),
503                             vnn->public_netmask_bits,
504                             old->name,
505                             ctdb_vnn_iface_string(vnn)));
506
507         ret = ctdb_event_script_callback(ctdb,
508                                          state,
509                                          ctdb_do_updateip_callback,
510                                          state,
511                                          false,
512                                          CTDB_EVENT_UPDATE_IP,
513                                          "%s %s %s %u",
514                                          state->old->name,
515                                          ctdb_vnn_iface_string(vnn),
516                                          ctdb_addr_to_str(&vnn->public_address),
517                                          vnn->public_netmask_bits);
518         if (ret != 0) {
519                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
520                                  ctdb_addr_to_str(&vnn->public_address),
521                                  old->name, ctdb_vnn_iface_string(vnn)));
522                 talloc_free(state);
523                 return -1;
524         }
525
526         return 0;
527 }
528
529 /*
530   Find the vnn of the node that has a public ip address
531   returns -1 if the address is not known as a public address
532  */
533 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
534 {
535         struct ctdb_vnn *vnn;
536
537         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
538                 if (ctdb_same_ip(&vnn->public_address, addr)) {
539                         return vnn;
540                 }
541         }
542
543         return NULL;
544 }
545
546 /*
547   take over an ip address
548  */
549 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
550                                  struct ctdb_req_control *c,
551                                  TDB_DATA indata,
552                                  bool *async_reply)
553 {
554         int ret;
555         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
556         struct ctdb_vnn *vnn;
557         bool have_ip = false;
558         bool do_updateip = false;
559         bool do_takeip = false;
560         struct ctdb_iface *best_iface = NULL;
561
562         if (pip->pnn != ctdb->pnn) {
563                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
564                                  "with pnn %d, but we're node %d\n",
565                                  ctdb_addr_to_str(&pip->addr),
566                                  pip->pnn, ctdb->pnn));
567                 return -1;
568         }
569
570         /* update out vnn list */
571         vnn = find_public_ip_vnn(ctdb, &pip->addr);
572         if (vnn == NULL) {
573                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
574                         ctdb_addr_to_str(&pip->addr)));
575                 return 0;
576         }
577
578         have_ip = ctdb_sys_have_ip(&pip->addr);
579         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
580         if (best_iface == NULL) {
581                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
582                                  "a usable interface (old %s, have_ip %d)\n",
583                                  ctdb_addr_to_str(&vnn->public_address),
584                                  vnn->public_netmask_bits,
585                                  ctdb_vnn_iface_string(vnn),
586                                  have_ip));
587                 return -1;
588         }
589
590         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
591                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
592                 have_ip = false;
593         }
594
595         if (vnn->iface == NULL && have_ip) {
596                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
597                                   "but we have no interface assigned, has someone manually configured it?"
598                                   "banning ourself\n",
599                                  ctdb_addr_to_str(&vnn->public_address)));
600                 ctdb_ban_self(ctdb);
601                 return -1;
602         }
603
604         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
605                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
606                                   "and we have it on iface[%s], but it was assigned to node %d"
607                                   "and we are node %d, banning ourself\n",
608                                  ctdb_addr_to_str(&vnn->public_address),
609                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
610                 ctdb_ban_self(ctdb);
611                 return -1;
612         }
613
614         if (vnn->iface) {
615                 if (vnn->iface->link_up) {
616                         /* only move when the rebalance gains something */
617                         if (vnn->iface->references > (best_iface->references + 1)) {
618                                 do_updateip = true;
619                         }
620                 } else if (vnn->iface != best_iface) {
621                         do_updateip = true;
622                 }
623         }
624
625         if (!have_ip) {
626                 if (do_updateip) {
627                         ctdb_vnn_unassign_iface(ctdb, vnn);
628                         do_updateip = false;
629                 }
630                 do_takeip = true;
631         }
632
633         if (do_takeip) {
634                 ret = ctdb_do_takeip(ctdb, c, vnn);
635                 if (ret != 0) {
636                         return -1;
637                 }
638         } else if (do_updateip) {
639                 ret = ctdb_do_updateip(ctdb, c, vnn);
640                 if (ret != 0) {
641                         return -1;
642                 }
643         } else {
644                 /*
645                  * The interface is up and the kernel known the ip
646                  * => do nothing
647                  */
648                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
649                         ctdb_addr_to_str(&pip->addr),
650                         vnn->public_netmask_bits,
651                         ctdb_vnn_iface_string(vnn)));
652                 return 0;
653         }
654
655         /* tell ctdb_control.c that we will be replying asynchronously */
656         *async_reply = true;
657
658         return 0;
659 }
660
661 /*
662   takeover an ip address old v4 style
663  */
664 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
665                                 struct ctdb_req_control *c,
666                                 TDB_DATA indata, 
667                                 bool *async_reply)
668 {
669         TDB_DATA data;
670         
671         data.dsize = sizeof(struct ctdb_public_ip);
672         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
673         CTDB_NO_MEMORY(ctdb, data.dptr);
674         
675         memcpy(data.dptr, indata.dptr, indata.dsize);
676         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
677 }
678
679 /*
680   kill any clients that are registered with a IP that is being released
681  */
682 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
683 {
684         struct ctdb_client_ip *ip;
685
686         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
687                 ctdb_addr_to_str(addr)));
688
689         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
690                 ctdb_sock_addr tmp_addr;
691
692                 tmp_addr = ip->addr;
693                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
694                         ip->client_id,
695                         ctdb_addr_to_str(&ip->addr)));
696
697                 if (ctdb_same_ip(&tmp_addr, addr)) {
698                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
699                                                                      ip->client_id, 
700                                                                      struct ctdb_client);
701                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
702                                 ip->client_id,
703                                 ctdb_addr_to_str(&ip->addr),
704                                 client->pid));
705
706                         if (client->pid != 0) {
707                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
708                                         (unsigned)client->pid,
709                                         ctdb_addr_to_str(addr),
710                                         ip->client_id));
711                                 kill(client->pid, SIGKILL);
712                         }
713                 }
714         }
715 }
716
717 /*
718   called when releaseip event finishes
719  */
720 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
721                                 void *private_data)
722 {
723         struct takeover_callback_state *state = 
724                 talloc_get_type(private_data, struct takeover_callback_state);
725         TDB_DATA data;
726
727         if (status == -ETIME) {
728                 ctdb_ban_self(ctdb);
729         }
730
731         /* send a message to all clients of this node telling them
732            that the cluster has been reconfigured and they should
733            release any sockets on this IP */
734         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
735         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
736         data.dsize = strlen((char *)data.dptr)+1;
737
738         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
739
740         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
741
742         /* kill clients that have registered with this IP */
743         release_kill_clients(ctdb, state->addr);
744
745         ctdb_vnn_unassign_iface(ctdb, state->vnn);
746
747         /* the control succeeded */
748         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
749         talloc_free(state);
750 }
751
752 /*
753   release an ip address
754  */
755 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
756                                 struct ctdb_req_control *c,
757                                 TDB_DATA indata, 
758                                 bool *async_reply)
759 {
760         int ret;
761         struct takeover_callback_state *state;
762         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
763         struct ctdb_vnn *vnn;
764
765         /* update our vnn list */
766         vnn = find_public_ip_vnn(ctdb, &pip->addr);
767         if (vnn == NULL) {
768                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
769                         ctdb_addr_to_str(&pip->addr)));
770                 return 0;
771         }
772         vnn->pnn = pip->pnn;
773
774         /* stop any previous arps */
775         talloc_free(vnn->takeover_ctx);
776         vnn->takeover_ctx = NULL;
777
778         if (!ctdb_sys_have_ip(&pip->addr)) {
779                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
780                         ctdb_addr_to_str(&pip->addr),
781                         vnn->public_netmask_bits, 
782                         ctdb_vnn_iface_string(vnn)));
783                 ctdb_vnn_unassign_iface(ctdb, vnn);
784                 return 0;
785         }
786
787         if (vnn->iface == NULL) {
788                 DEBUG(DEBUG_CRIT,(__location__ " release_ip of IP %s is known to the kernel, "
789                                   "but we have no interface assigned, has someone manually configured it?"
790                                   "banning ourself\n",
791                                  ctdb_addr_to_str(&vnn->public_address)));
792                 ctdb_ban_self(ctdb);
793                 return -1;
794         }
795
796         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
797                 ctdb_addr_to_str(&pip->addr),
798                 vnn->public_netmask_bits, 
799                 ctdb_vnn_iface_string(vnn),
800                 pip->pnn));
801
802         state = talloc(ctdb, struct takeover_callback_state);
803         CTDB_NO_MEMORY(ctdb, state);
804
805         state->c = talloc_steal(state, c);
806         state->addr = talloc(state, ctdb_sock_addr);       
807         CTDB_NO_MEMORY(ctdb, state->addr);
808         *state->addr = pip->addr;
809         state->vnn   = vnn;
810
811         ret = ctdb_event_script_callback(ctdb, 
812                                          state, release_ip_callback, state,
813                                          false,
814                                          CTDB_EVENT_RELEASE_IP,
815                                          "%s %s %u",
816                                          ctdb_vnn_iface_string(vnn),
817                                          ctdb_addr_to_str(&pip->addr),
818                                          vnn->public_netmask_bits);
819         if (ret != 0) {
820                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
821                         ctdb_addr_to_str(&pip->addr),
822                         ctdb_vnn_iface_string(vnn)));
823                 talloc_free(state);
824                 return -1;
825         }
826
827         /* tell the control that we will be reply asynchronously */
828         *async_reply = true;
829         return 0;
830 }
831
832 /*
833   release an ip address old v4 style
834  */
835 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
836                                 struct ctdb_req_control *c,
837                                 TDB_DATA indata, 
838                                 bool *async_reply)
839 {
840         TDB_DATA data;
841         
842         data.dsize = sizeof(struct ctdb_public_ip);
843         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
844         CTDB_NO_MEMORY(ctdb, data.dptr);
845         
846         memcpy(data.dptr, indata.dptr, indata.dsize);
847         return ctdb_control_release_ip(ctdb, c, data, async_reply);
848 }
849
850
851 static int ctdb_add_public_address(struct ctdb_context *ctdb,
852                                    ctdb_sock_addr *addr,
853                                    unsigned mask, const char *ifaces)
854 {
855         struct ctdb_vnn      *vnn;
856         uint32_t num = 0;
857         char *tmp;
858         const char *iface;
859         int i;
860         int ret;
861
862         /* Verify that we dont have an entry for this ip yet */
863         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
864                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
865                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
866                                 ctdb_addr_to_str(addr)));
867                         return -1;
868                 }               
869         }
870
871         /* create a new vnn structure for this ip address */
872         vnn = talloc_zero(ctdb, struct ctdb_vnn);
873         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
874         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
875         tmp = talloc_strdup(vnn, ifaces);
876         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
877         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
878                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
879                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
880                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
881                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
882                 num++;
883         }
884         talloc_free(tmp);
885         vnn->ifaces[num] = NULL;
886         vnn->public_address      = *addr;
887         vnn->public_netmask_bits = mask;
888         vnn->pnn                 = -1;
889         if (ctdb_sys_have_ip(addr)) {
890                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
891                 vnn->pnn = ctdb->pnn;
892         }
893
894         for (i=0; vnn->ifaces[i]; i++) {
895                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
896                 if (ret != 0) {
897                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
898                                            "for public_address[%s]\n",
899                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
900                         talloc_free(vnn);
901                         return -1;
902                 }
903                 if (i == 0) {
904                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
905                 }
906         }
907
908         DLIST_ADD(ctdb->vnn, vnn);
909
910         return 0;
911 }
912
913 /*
914   setup the event script directory
915 */
916 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
917 {
918         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
919         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
920         return 0;
921 }
922
923 /*
924   setup the public address lists from a file
925 */
926 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
927 {
928         char **lines;
929         int nlines;
930         int i;
931
932         lines = file_lines_load(alist, &nlines, ctdb);
933         if (lines == NULL) {
934                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
935                 return -1;
936         }
937         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
938                 nlines--;
939         }
940
941         for (i=0;i<nlines;i++) {
942                 unsigned mask;
943                 ctdb_sock_addr addr;
944                 const char *addrstr;
945                 const char *ifaces;
946                 char *tok, *line;
947
948                 line = lines[i];
949                 while ((*line == ' ') || (*line == '\t')) {
950                         line++;
951                 }
952                 if (*line == '#') {
953                         continue;
954                 }
955                 if (strcmp(line, "") == 0) {
956                         continue;
957                 }
958                 tok = strtok(line, " \t");
959                 addrstr = tok;
960                 tok = strtok(NULL, " \t");
961                 if (tok == NULL) {
962                         if (NULL == ctdb->default_public_interface) {
963                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
964                                          i+1));
965                                 talloc_free(lines);
966                                 return -1;
967                         }
968                         ifaces = ctdb->default_public_interface;
969                 } else {
970                         ifaces = tok;
971                 }
972
973                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
974                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
975                         talloc_free(lines);
976                         return -1;
977                 }
978                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
979                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
980                         talloc_free(lines);
981                         return -1;
982                 }
983         }
984
985         talloc_free(lines);
986         return 0;
987 }
988
989 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
990                               const char *iface,
991                               const char *ip)
992 {
993         struct ctdb_vnn *svnn;
994         bool ok;
995         int ret;
996
997         svnn = talloc_zero(ctdb, struct ctdb_vnn);
998         CTDB_NO_MEMORY(ctdb, svnn);
999
1000         svnn->ifaces = talloc_array(svnn, const char *, 2);
1001         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1002         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1003         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1004         svnn->ifaces[1] = NULL;
1005
1006         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1007         if (!ok) {
1008                 talloc_free(svnn);
1009                 return -1;
1010         }
1011
1012         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1013         if (ret != 0) {
1014                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1015                                    "for single_ip[%s]\n",
1016                                    svnn->ifaces[0],
1017                                    ctdb_addr_to_str(&svnn->public_address)));
1018                 talloc_free(svnn);
1019                 return -1;
1020         }
1021
1022         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1023         if (ret != 0) {
1024                 talloc_free(svnn);
1025                 return -1;
1026         }
1027
1028         ctdb->single_ip_vnn = svnn;
1029         return 0;
1030 }
1031
1032 struct ctdb_public_ip_list {
1033         struct ctdb_public_ip_list *next;
1034         uint32_t pnn;
1035         ctdb_sock_addr addr;
1036 };
1037
1038
1039 /* Given a physical node, return the number of
1040    public addresses that is currently assigned to this node.
1041 */
1042 static int node_ip_coverage(struct ctdb_context *ctdb, 
1043         int32_t pnn,
1044         struct ctdb_public_ip_list *ips)
1045 {
1046         int num=0;
1047
1048         for (;ips;ips=ips->next) {
1049                 if (ips->pnn == pnn) {
1050                         num++;
1051                 }
1052         }
1053         return num;
1054 }
1055
1056
1057 /* Check if this is a public ip known to the node, i.e. can that
1058    node takeover this ip ?
1059 */
1060 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1061                 struct ctdb_public_ip_list *ip)
1062 {
1063         struct ctdb_all_public_ips *public_ips;
1064         int i;
1065
1066         public_ips = ctdb->nodes[pnn]->available_public_ips;
1067
1068         if (public_ips == NULL) {
1069                 return -1;
1070         }
1071
1072         for (i=0;i<public_ips->num;i++) {
1073                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1074                         /* yes, this node can serve this public ip */
1075                         return 0;
1076                 }
1077         }
1078
1079         return -1;
1080 }
1081
1082
1083 /* search the node lists list for a node to takeover this ip.
1084    pick the node that currently are serving the least number of ips
1085    so that the ips get spread out evenly.
1086 */
1087 static int find_takeover_node(struct ctdb_context *ctdb, 
1088                 struct ctdb_node_map *nodemap, uint32_t mask, 
1089                 struct ctdb_public_ip_list *ip,
1090                 struct ctdb_public_ip_list *all_ips)
1091 {
1092         int pnn, min=0, num;
1093         int i;
1094
1095         pnn    = -1;
1096         for (i=0;i<nodemap->num;i++) {
1097                 if (nodemap->nodes[i].flags & mask) {
1098                         /* This node is not healty and can not be used to serve
1099                            a public address 
1100                         */
1101                         continue;
1102                 }
1103
1104                 /* verify that this node can serve this ip */
1105                 if (can_node_serve_ip(ctdb, i, ip)) {
1106                         /* no it couldnt   so skip to the next node */
1107                         continue;
1108                 }
1109
1110                 num = node_ip_coverage(ctdb, i, all_ips);
1111                 /* was this the first node we checked ? */
1112                 if (pnn == -1) {
1113                         pnn = i;
1114                         min  = num;
1115                 } else {
1116                         if (num < min) {
1117                                 pnn = i;
1118                                 min  = num;
1119                         }
1120                 }
1121         }       
1122         if (pnn == -1) {
1123                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1124                         ctdb_addr_to_str(&ip->addr)));
1125
1126                 return -1;
1127         }
1128
1129         ip->pnn = pnn;
1130         return 0;
1131 }
1132
1133 #define IP_KEYLEN       4
1134 static uint32_t *ip_key(ctdb_sock_addr *ip)
1135 {
1136         static uint32_t key[IP_KEYLEN];
1137
1138         bzero(key, sizeof(key));
1139
1140         switch (ip->sa.sa_family) {
1141         case AF_INET:
1142                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1143                 break;
1144         case AF_INET6:
1145                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1146                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1147                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1148                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1149                 break;
1150         default:
1151                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1152                 return key;
1153         }
1154
1155         return key;
1156 }
1157
1158 static void *add_ip_callback(void *parm, void *data)
1159 {
1160         struct ctdb_public_ip_list *this_ip = parm; 
1161         struct ctdb_public_ip_list *prev_ip = data; 
1162
1163         if (prev_ip == NULL) {
1164                 return parm;
1165         }
1166         if (this_ip->pnn == -1) {
1167                 this_ip->pnn = prev_ip->pnn;
1168         }
1169
1170         return parm;
1171 }
1172
1173 void getips_count_callback(void *param, void *data)
1174 {
1175         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1176         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1177
1178         new_ip->next = *ip_list;
1179         *ip_list     = new_ip;
1180 }
1181
1182 static struct ctdb_public_ip_list *
1183 create_merged_ip_list(struct ctdb_context *ctdb)
1184 {
1185         int i, j;
1186         struct ctdb_public_ip_list *ip_list;
1187         struct ctdb_all_public_ips *public_ips;
1188
1189         if (ctdb->ip_tree != NULL) {
1190                 talloc_free(ctdb->ip_tree);
1191                 ctdb->ip_tree = NULL;
1192         }
1193         ctdb->ip_tree = trbt_create(ctdb, 0);
1194
1195         for (i=0;i<ctdb->num_nodes;i++) {
1196                 public_ips = ctdb->nodes[i]->known_public_ips;
1197
1198                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1199                         continue;
1200                 }
1201
1202                 /* there were no public ips for this node */
1203                 if (public_ips == NULL) {
1204                         continue;
1205                 }               
1206
1207                 for (j=0;j<public_ips->num;j++) {
1208                         struct ctdb_public_ip_list *tmp_ip; 
1209
1210                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1211                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1212                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1213                         tmp_ip->addr = public_ips->ips[j].addr;
1214                         tmp_ip->next = NULL;
1215
1216                         trbt_insertarray32_callback(ctdb->ip_tree,
1217                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1218                                 add_ip_callback,
1219                                 tmp_ip);
1220                 }
1221         }
1222
1223         ip_list = NULL;
1224         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1225
1226         return ip_list;
1227 }
1228
1229 /*
1230   make any IP alias changes for public addresses that are necessary 
1231  */
1232 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1233 {
1234         int i, num_healthy, retries;
1235         struct ctdb_public_ip ip;
1236         struct ctdb_public_ipv4 ipv4;
1237         uint32_t mask, *nodes;
1238         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1239         int maxnode, maxnum=0, minnode, minnum=0, num;
1240         TDB_DATA data;
1241         struct timeval timeout;
1242         struct client_async_data *async_data;
1243         struct ctdb_client_control_state *state;
1244         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1245
1246         /*
1247          * ip failover is completely disabled, just send out the 
1248          * ipreallocated event.
1249          */
1250         if (ctdb->tunable.disable_ip_failover != 0) {
1251                 goto ipreallocated;
1252         }
1253
1254         ZERO_STRUCT(ip);
1255
1256         /* Count how many completely healthy nodes we have */
1257         num_healthy = 0;
1258         for (i=0;i<nodemap->num;i++) {
1259                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1260                         num_healthy++;
1261                 }
1262         }
1263
1264         if (num_healthy > 0) {
1265                 /* We have healthy nodes, so only consider them for 
1266                    serving public addresses
1267                 */
1268                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1269         } else {
1270                 /* We didnt have any completely healthy nodes so
1271                    use "disabled" nodes as a fallback
1272                 */
1273                 mask = NODE_FLAGS_INACTIVE;
1274         }
1275
1276         /* since nodes only know about those public addresses that
1277            can be served by that particular node, no single node has
1278            a full list of all public addresses that exist in the cluster.
1279            Walk over all node structures and create a merged list of
1280            all public addresses that exist in the cluster.
1281
1282            keep the tree of ips around as ctdb->ip_tree
1283         */
1284         all_ips = create_merged_ip_list(ctdb);
1285
1286         /* If we want deterministic ip allocations, i.e. that the ip addresses
1287            will always be allocated the same way for a specific set of
1288            available/unavailable nodes.
1289         */
1290         if (1 == ctdb->tunable.deterministic_public_ips) {              
1291                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1292                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1293                         tmp_ip->pnn = i%nodemap->num;
1294                 }
1295         }
1296
1297
1298         /* mark all public addresses with a masked node as being served by
1299            node -1
1300         */
1301         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1302                 if (tmp_ip->pnn == -1) {
1303                         continue;
1304                 }
1305                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1306                         tmp_ip->pnn = -1;
1307                 }
1308         }
1309
1310         /* verify that the assigned nodes can serve that public ip
1311            and set it to -1 if not
1312         */
1313         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1314                 if (tmp_ip->pnn == -1) {
1315                         continue;
1316                 }
1317                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1318                         /* this node can not serve this ip. */
1319                         tmp_ip->pnn = -1;
1320                 }
1321         }
1322
1323
1324         /* now we must redistribute all public addresses with takeover node
1325            -1 among the nodes available
1326         */
1327         retries = 0;
1328 try_again:
1329         /* loop over all ip's and find a physical node to cover for 
1330            each unassigned ip.
1331         */
1332         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1333                 if (tmp_ip->pnn == -1) {
1334                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1335                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1336                                         ctdb_addr_to_str(&tmp_ip->addr)));
1337                         }
1338                 }
1339         }
1340
1341         /* If we dont want ips to fail back after a node becomes healthy
1342            again, we wont even try to reallocat the ip addresses so that
1343            they are evenly spread out.
1344            This can NOT be used at the same time as DeterministicIPs !
1345         */
1346         if (1 == ctdb->tunable.no_ip_failback) {
1347                 if (1 == ctdb->tunable.deterministic_public_ips) {
1348                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1349                 }
1350                 goto finished;
1351         }
1352
1353
1354         /* now, try to make sure the ip adresses are evenly distributed
1355            across the node.
1356            for each ip address, loop over all nodes that can serve this
1357            ip and make sure that the difference between the node
1358            serving the most and the node serving the least ip's are not greater
1359            than 1.
1360         */
1361         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1362                 if (tmp_ip->pnn == -1) {
1363                         continue;
1364                 }
1365
1366                 /* Get the highest and lowest number of ips's served by any 
1367                    valid node which can serve this ip.
1368                 */
1369                 maxnode = -1;
1370                 minnode = -1;
1371                 for (i=0;i<nodemap->num;i++) {
1372                         if (nodemap->nodes[i].flags & mask) {
1373                                 continue;
1374                         }
1375
1376                         /* only check nodes that can actually serve this ip */
1377                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1378                                 /* no it couldnt   so skip to the next node */
1379                                 continue;
1380                         }
1381
1382                         num = node_ip_coverage(ctdb, i, all_ips);
1383                         if (maxnode == -1) {
1384                                 maxnode = i;
1385                                 maxnum  = num;
1386                         } else {
1387                                 if (num > maxnum) {
1388                                         maxnode = i;
1389                                         maxnum  = num;
1390                                 }
1391                         }
1392                         if (minnode == -1) {
1393                                 minnode = i;
1394                                 minnum  = num;
1395                         } else {
1396                                 if (num < minnum) {
1397                                         minnode = i;
1398                                         minnum  = num;
1399                                 }
1400                         }
1401                 }
1402                 if (maxnode == -1) {
1403                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1404                                 ctdb_addr_to_str(&tmp_ip->addr)));
1405
1406                         continue;
1407                 }
1408
1409                 /* If we want deterministic IPs then dont try to reallocate 
1410                    them to spread out the load.
1411                 */
1412                 if (1 == ctdb->tunable.deterministic_public_ips) {
1413                         continue;
1414                 }
1415
1416                 /* if the spread between the smallest and largest coverage by
1417                    a node is >=2 we steal one of the ips from the node with
1418                    most coverage to even things out a bit.
1419                    try to do this at most 5 times  since we dont want to spend
1420                    too much time balancing the ip coverage.
1421                 */
1422                 if ( (maxnum > minnum+1)
1423                   && (retries < 5) ){
1424                         struct ctdb_public_ip_list *tmp;
1425
1426                         /* mark one of maxnode's vnn's as unassigned and try
1427                            again
1428                         */
1429                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1430                                 if (tmp->pnn == maxnode) {
1431                                         tmp->pnn = -1;
1432                                         retries++;
1433                                         goto try_again;
1434                                 }
1435                         }
1436                 }
1437         }
1438
1439
1440         /* finished distributing the public addresses, now just send the 
1441            info out to the nodes
1442         */
1443 finished:
1444
1445         /* at this point ->pnn is the node which will own each IP
1446            or -1 if there is no node that can cover this ip
1447         */
1448
1449         /* now tell all nodes to delete any alias that they should not
1450            have.  This will be a NOOP on nodes that don't currently
1451            hold the given alias */
1452         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1453         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1454
1455         for (i=0;i<nodemap->num;i++) {
1456                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1457                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1458                         continue;
1459                 }
1460
1461                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1462                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1463                                 /* This node should be serving this
1464                                    vnn so dont tell it to release the ip
1465                                 */
1466                                 continue;
1467                         }
1468                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1469                                 ipv4.pnn = tmp_ip->pnn;
1470                                 ipv4.sin = tmp_ip->addr.ip;
1471
1472                                 timeout = TAKEOVER_TIMEOUT();
1473                                 data.dsize = sizeof(ipv4);
1474                                 data.dptr  = (uint8_t *)&ipv4;
1475                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1476                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1477                                                 data, async_data,
1478                                                 &timeout, NULL);
1479                         } else {
1480                                 ip.pnn  = tmp_ip->pnn;
1481                                 ip.addr = tmp_ip->addr;
1482
1483                                 timeout = TAKEOVER_TIMEOUT();
1484                                 data.dsize = sizeof(ip);
1485                                 data.dptr  = (uint8_t *)&ip;
1486                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1487                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1488                                                 data, async_data,
1489                                                 &timeout, NULL);
1490                         }
1491
1492                         if (state == NULL) {
1493                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1494                                 talloc_free(tmp_ctx);
1495                                 return -1;
1496                         }
1497                 
1498                         ctdb_client_async_add(async_data, state);
1499                 }
1500         }
1501         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1502                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1503                 talloc_free(tmp_ctx);
1504                 return -1;
1505         }
1506         talloc_free(async_data);
1507
1508
1509         /* tell all nodes to get their own IPs */
1510         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1511         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1512         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1513                 if (tmp_ip->pnn == -1) {
1514                         /* this IP won't be taken over */
1515                         continue;
1516                 }
1517
1518                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1519                         ipv4.pnn = tmp_ip->pnn;
1520                         ipv4.sin = tmp_ip->addr.ip;
1521
1522                         timeout = TAKEOVER_TIMEOUT();
1523                         data.dsize = sizeof(ipv4);
1524                         data.dptr  = (uint8_t *)&ipv4;
1525                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1526                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1527                                         data, async_data,
1528                                         &timeout, NULL);
1529                 } else {
1530                         ip.pnn  = tmp_ip->pnn;
1531                         ip.addr = tmp_ip->addr;
1532
1533                         timeout = TAKEOVER_TIMEOUT();
1534                         data.dsize = sizeof(ip);
1535                         data.dptr  = (uint8_t *)&ip;
1536                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1537                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1538                                         data, async_data,
1539                                         &timeout, NULL);
1540                 }
1541                 if (state == NULL) {
1542                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1543                         talloc_free(tmp_ctx);
1544                         return -1;
1545                 }
1546                 
1547                 ctdb_client_async_add(async_data, state);
1548         }
1549         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1550                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1551                 talloc_free(tmp_ctx);
1552                 return -1;
1553         }
1554
1555 ipreallocated:
1556         /* tell all nodes to update natwg */
1557         /* send the flags update natgw on all connected nodes */
1558         data.dptr  = discard_const("ipreallocated");
1559         data.dsize = strlen((char *)data.dptr) + 1; 
1560         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1561         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
1562                                       nodes, 0, TAKEOVER_TIMEOUT(),
1563                                       false, data,
1564                                       NULL, NULL,
1565                                       NULL) != 0) {
1566                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
1567         }
1568
1569         talloc_free(tmp_ctx);
1570         return 0;
1571 }
1572
1573
1574 /*
1575   destroy a ctdb_client_ip structure
1576  */
1577 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1578 {
1579         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1580                 ctdb_addr_to_str(&ip->addr),
1581                 ntohs(ip->addr.ip.sin_port),
1582                 ip->client_id));
1583
1584         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1585         return 0;
1586 }
1587
1588 /*
1589   called by a client to inform us of a TCP connection that it is managing
1590   that should tickled with an ACK when IP takeover is done
1591   we handle both the old ipv4 style of packets as well as the new ipv4/6
1592   pdus.
1593  */
1594 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1595                                 TDB_DATA indata)
1596 {
1597         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1598         struct ctdb_control_tcp *old_addr = NULL;
1599         struct ctdb_control_tcp_addr new_addr;
1600         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1601         struct ctdb_tcp_list *tcp;
1602         struct ctdb_tcp_connection t;
1603         int ret;
1604         TDB_DATA data;
1605         struct ctdb_client_ip *ip;
1606         struct ctdb_vnn *vnn;
1607         ctdb_sock_addr addr;
1608
1609         switch (indata.dsize) {
1610         case sizeof(struct ctdb_control_tcp):
1611                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1612                 ZERO_STRUCT(new_addr);
1613                 tcp_sock = &new_addr;
1614                 tcp_sock->src.ip  = old_addr->src;
1615                 tcp_sock->dest.ip = old_addr->dest;
1616                 break;
1617         case sizeof(struct ctdb_control_tcp_addr):
1618                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1619                 break;
1620         default:
1621                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1622                                  "to ctdb_control_tcp_client. size was %d but "
1623                                  "only allowed sizes are %lu and %lu\n",
1624                                  (int)indata.dsize,
1625                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1626                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1627                 return -1;
1628         }
1629
1630         addr = tcp_sock->src;
1631         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1632         addr = tcp_sock->dest;
1633         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1634
1635         ZERO_STRUCT(addr);
1636         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1637         vnn = find_public_ip_vnn(ctdb, &addr);
1638         if (vnn == NULL) {
1639                 switch (addr.sa.sa_family) {
1640                 case AF_INET:
1641                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1642                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1643                                         ctdb_addr_to_str(&addr)));
1644                         }
1645                         break;
1646                 case AF_INET6:
1647                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1648                                 ctdb_addr_to_str(&addr)));
1649                         break;
1650                 default:
1651                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1652                 }
1653
1654                 return 0;
1655         }
1656
1657         if (vnn->pnn != ctdb->pnn) {
1658                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1659                         ctdb_addr_to_str(&addr),
1660                         client_id, client->pid));
1661                 /* failing this call will tell smbd to die */
1662                 return -1;
1663         }
1664
1665         ip = talloc(client, struct ctdb_client_ip);
1666         CTDB_NO_MEMORY(ctdb, ip);
1667
1668         ip->ctdb      = ctdb;
1669         ip->addr      = addr;
1670         ip->client_id = client_id;
1671         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1672         DLIST_ADD(ctdb->client_ip_list, ip);
1673
1674         tcp = talloc(client, struct ctdb_tcp_list);
1675         CTDB_NO_MEMORY(ctdb, tcp);
1676
1677         tcp->connection.src_addr = tcp_sock->src;
1678         tcp->connection.dst_addr = tcp_sock->dest;
1679
1680         DLIST_ADD(client->tcp_list, tcp);
1681
1682         t.src_addr = tcp_sock->src;
1683         t.dst_addr = tcp_sock->dest;
1684
1685         data.dptr = (uint8_t *)&t;
1686         data.dsize = sizeof(t);
1687
1688         switch (addr.sa.sa_family) {
1689         case AF_INET:
1690                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1691                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1692                         ctdb_addr_to_str(&tcp_sock->src),
1693                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1694                 break;
1695         case AF_INET6:
1696                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1697                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1698                         ctdb_addr_to_str(&tcp_sock->src),
1699                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1700                 break;
1701         default:
1702                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1703         }
1704
1705
1706         /* tell all nodes about this tcp connection */
1707         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1708                                        CTDB_CONTROL_TCP_ADD,
1709                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1710         if (ret != 0) {
1711                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1712                 return -1;
1713         }
1714
1715         return 0;
1716 }
1717
1718 /*
1719   find a tcp address on a list
1720  */
1721 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1722                                            struct ctdb_tcp_connection *tcp)
1723 {
1724         int i;
1725
1726         if (array == NULL) {
1727                 return NULL;
1728         }
1729
1730         for (i=0;i<array->num;i++) {
1731                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1732                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1733                         return &array->connections[i];
1734                 }
1735         }
1736         return NULL;
1737 }
1738
1739
1740
1741 /*
1742   called by a daemon to inform us of a TCP connection that one of its
1743   clients managing that should tickled with an ACK when IP takeover is
1744   done
1745  */
1746 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
1747 {
1748         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
1749         struct ctdb_tcp_array *tcparray;
1750         struct ctdb_tcp_connection tcp;
1751         struct ctdb_vnn *vnn;
1752
1753         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
1754         if (vnn == NULL) {
1755                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1756                         ctdb_addr_to_str(&p->dst_addr)));
1757
1758                 return -1;
1759         }
1760
1761
1762         tcparray = vnn->tcp_array;
1763
1764         /* If this is the first tickle */
1765         if (tcparray == NULL) {
1766                 tcparray = talloc_size(ctdb->nodes, 
1767                         offsetof(struct ctdb_tcp_array, connections) +
1768                         sizeof(struct ctdb_tcp_connection) * 1);
1769                 CTDB_NO_MEMORY(ctdb, tcparray);
1770                 vnn->tcp_array = tcparray;
1771
1772                 tcparray->num = 0;
1773                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1774                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1775
1776                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
1777                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
1778                 tcparray->num++;
1779
1780                 if (tcp_update_needed) {
1781                         vnn->tcp_update_needed = true;
1782                 }
1783                 return 0;
1784         }
1785
1786
1787         /* Do we already have this tickle ?*/
1788         tcp.src_addr = p->src_addr;
1789         tcp.dst_addr = p->dst_addr;
1790         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1791                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1792                         ctdb_addr_to_str(&tcp.dst_addr),
1793                         ntohs(tcp.dst_addr.ip.sin_port),
1794                         vnn->pnn));
1795                 return 0;
1796         }
1797
1798         /* A new tickle, we must add it to the array */
1799         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1800                                         struct ctdb_tcp_connection,
1801                                         tcparray->num+1);
1802         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1803
1804         vnn->tcp_array = tcparray;
1805         tcparray->connections[tcparray->num].src_addr = p->src_addr;
1806         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
1807         tcparray->num++;
1808                                 
1809         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1810                 ctdb_addr_to_str(&tcp.dst_addr),
1811                 ntohs(tcp.dst_addr.ip.sin_port),
1812                 vnn->pnn));
1813
1814         if (tcp_update_needed) {
1815                 vnn->tcp_update_needed = true;
1816         }
1817
1818         return 0;
1819 }
1820
1821
1822 /*
1823   called by a daemon to inform us of a TCP connection that one of its
1824   clients managing that should tickled with an ACK when IP takeover is
1825   done
1826  */
1827 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1828 {
1829         struct ctdb_tcp_connection *tcpp;
1830         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1831
1832         if (vnn == NULL) {
1833                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1834                         ctdb_addr_to_str(&conn->dst_addr)));
1835                 return;
1836         }
1837
1838         /* if the array is empty we cant remove it
1839            and we dont need to do anything
1840          */
1841         if (vnn->tcp_array == NULL) {
1842                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1843                         ctdb_addr_to_str(&conn->dst_addr),
1844                         ntohs(conn->dst_addr.ip.sin_port)));
1845                 return;
1846         }
1847
1848
1849         /* See if we know this connection
1850            if we dont know this connection  then we dont need to do anything
1851          */
1852         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1853         if (tcpp == NULL) {
1854                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1855                         ctdb_addr_to_str(&conn->dst_addr),
1856                         ntohs(conn->dst_addr.ip.sin_port)));
1857                 return;
1858         }
1859
1860
1861         /* We need to remove this entry from the array.
1862            Instead of allocating a new array and copying data to it
1863            we cheat and just copy the last entry in the existing array
1864            to the entry that is to be removed and just shring the 
1865            ->num field
1866          */
1867         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1868         vnn->tcp_array->num--;
1869
1870         /* If we deleted the last entry we also need to remove the entire array
1871          */
1872         if (vnn->tcp_array->num == 0) {
1873                 talloc_free(vnn->tcp_array);
1874                 vnn->tcp_array = NULL;
1875         }               
1876
1877         vnn->tcp_update_needed = true;
1878
1879         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1880                 ctdb_addr_to_str(&conn->src_addr),
1881                 ntohs(conn->src_addr.ip.sin_port)));
1882 }
1883
1884
1885 /*
1886   called by a daemon to inform us of a TCP connection that one of its
1887   clients used are no longer needed in the tickle database
1888  */
1889 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
1890 {
1891         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
1892
1893         ctdb_remove_tcp_connection(ctdb, conn);
1894
1895         return 0;
1896 }
1897
1898
/*
  called when a daemon restarts - the intent is to send all tickles for all
  public addresses we are serving to the new node immediately.
  not implemented yet: this is a no-op that always returns 0
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
1908
1909
1910 /*
1911   called when a client structure goes away - hook to remove
1912   elements from the tcp_list in all daemons
1913  */
1914 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1915 {
1916         while (client->tcp_list) {
1917                 struct ctdb_tcp_list *tcp = client->tcp_list;
1918                 DLIST_REMOVE(client->tcp_list, tcp);
1919                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1920         }
1921 }
1922
1923
1924 /*
1925   release all IPs on shutdown
1926  */
1927 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1928 {
1929         struct ctdb_vnn *vnn;
1930
1931         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1932                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1933                         ctdb_vnn_unassign_iface(ctdb, vnn);
1934                         continue;
1935                 }
1936                 if (!vnn->iface) {
1937                         continue;
1938                 }
1939                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1940                                   ctdb_vnn_iface_string(vnn),
1941                                   ctdb_addr_to_str(&vnn->public_address),
1942                                   vnn->public_netmask_bits);
1943                 release_kill_clients(ctdb, &vnn->public_address);
1944                 ctdb_vnn_unassign_iface(ctdb, vnn);
1945         }
1946 }
1947
1948
1949 /*
1950   get list of public IPs
1951  */
1952 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1953                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1954 {
1955         int i, num, len;
1956         struct ctdb_all_public_ips *ips;
1957         struct ctdb_vnn *vnn;
1958         bool only_available = false;
1959
1960         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
1961                 only_available = true;
1962         }
1963
1964         /* count how many public ip structures we have */
1965         num = 0;
1966         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1967                 num++;
1968         }
1969
1970         len = offsetof(struct ctdb_all_public_ips, ips) + 
1971                 num*sizeof(struct ctdb_public_ip);
1972         ips = talloc_zero_size(outdata, len);
1973         CTDB_NO_MEMORY(ctdb, ips);
1974
1975         i = 0;
1976         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1977                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
1978                         continue;
1979                 }
1980                 ips->ips[i].pnn  = vnn->pnn;
1981                 ips->ips[i].addr = vnn->public_address;
1982                 i++;
1983         }
1984         ips->num = i;
1985         len = offsetof(struct ctdb_all_public_ips, ips) +
1986                 i*sizeof(struct ctdb_public_ip);
1987
1988         outdata->dsize = len;
1989         outdata->dptr  = (uint8_t *)ips;
1990
1991         return 0;
1992 }
1993
1994
1995 /*
1996   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
1997  */
1998 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
1999                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2000 {
2001         int i, num, len;
2002         struct ctdb_all_public_ipsv4 *ips;
2003         struct ctdb_vnn *vnn;
2004
2005         /* count how many public ip structures we have */
2006         num = 0;
2007         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2008                 if (vnn->public_address.sa.sa_family != AF_INET) {
2009                         continue;
2010                 }
2011                 num++;
2012         }
2013
2014         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2015                 num*sizeof(struct ctdb_public_ipv4);
2016         ips = talloc_zero_size(outdata, len);
2017         CTDB_NO_MEMORY(ctdb, ips);
2018
2019         outdata->dsize = len;
2020         outdata->dptr  = (uint8_t *)ips;
2021
2022         ips->num = num;
2023         i = 0;
2024         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2025                 if (vnn->public_address.sa.sa_family != AF_INET) {
2026                         continue;
2027                 }
2028                 ips->ips[i].pnn = vnn->pnn;
2029                 ips->ips[i].sin = vnn->public_address.ip;
2030                 i++;
2031         }
2032
2033         return 0;
2034 }
2035
2036 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2037                                         struct ctdb_req_control *c,
2038                                         TDB_DATA indata,
2039                                         TDB_DATA *outdata)
2040 {
2041         int i, num, len;
2042         ctdb_sock_addr *addr;
2043         struct ctdb_control_public_ip_info *info;
2044         struct ctdb_vnn *vnn;
2045
2046         addr = (ctdb_sock_addr *)indata.dptr;
2047
2048         vnn = find_public_ip_vnn(ctdb, addr);
2049         if (vnn == NULL) {
2050                 /* if it is not a public ip   it could be our 'single ip' */
2051                 if (ctdb->single_ip_vnn) {
2052                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2053                                 vnn = ctdb->single_ip_vnn;
2054                         }
2055                 }
2056         }
2057         if (vnn == NULL) {
2058                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2059                                  "'%s'not a public address\n",
2060                                  ctdb_addr_to_str(addr)));
2061                 return -1;
2062         }
2063
2064         /* count how many public ip structures we have */
2065         num = 0;
2066         for (;vnn->ifaces[num];) {
2067                 num++;
2068         }
2069
2070         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2071                 num*sizeof(struct ctdb_control_iface_info);
2072         info = talloc_zero_size(outdata, len);
2073         CTDB_NO_MEMORY(ctdb, info);
2074
2075         info->ip.addr = vnn->public_address;
2076         info->ip.pnn = vnn->pnn;
2077         info->active_idx = 0xFFFFFFFF;
2078
2079         for (i=0; vnn->ifaces[i]; i++) {
2080                 struct ctdb_iface *cur;
2081
2082                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2083                 if (cur == NULL) {
2084                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2085                                            vnn->ifaces[i]));
2086                         return -1;
2087                 }
2088                 if (vnn->iface == cur) {
2089                         info->active_idx = i;
2090                 }
2091                 strcpy(info->ifaces[i].name, cur->name);
2092                 info->ifaces[i].link_state = cur->link_up;
2093                 info->ifaces[i].references = cur->references;
2094         }
2095         info->num = i;
2096         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2097                 i*sizeof(struct ctdb_control_iface_info);
2098
2099         outdata->dsize = len;
2100         outdata->dptr  = (uint8_t *)info;
2101
2102         return 0;
2103 }
2104
2105 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2106                                 struct ctdb_req_control *c,
2107                                 TDB_DATA *outdata)
2108 {
2109         int i, num, len;
2110         struct ctdb_control_get_ifaces *ifaces;
2111         struct ctdb_iface *cur;
2112
2113         /* count how many public ip structures we have */
2114         num = 0;
2115         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2116                 num++;
2117         }
2118
2119         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2120                 num*sizeof(struct ctdb_control_iface_info);
2121         ifaces = talloc_zero_size(outdata, len);
2122         CTDB_NO_MEMORY(ctdb, ifaces);
2123
2124         i = 0;
2125         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2126                 strcpy(ifaces->ifaces[i].name, cur->name);
2127                 ifaces->ifaces[i].link_state = cur->link_up;
2128                 ifaces->ifaces[i].references = cur->references;
2129                 i++;
2130         }
2131         ifaces->num = i;
2132         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2133                 i*sizeof(struct ctdb_control_iface_info);
2134
2135         outdata->dsize = len;
2136         outdata->dptr  = (uint8_t *)ifaces;
2137
2138         return 0;
2139 }
2140
2141 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2142                                     struct ctdb_req_control *c,
2143                                     TDB_DATA indata)
2144 {
2145         struct ctdb_control_iface_info *info;
2146         struct ctdb_iface *iface;
2147         bool link_up = false;
2148
2149         info = (struct ctdb_control_iface_info *)indata.dptr;
2150
2151         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2152                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2153                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2154                                   len, len, info->name));
2155                 return -1;
2156         }
2157
2158         switch (info->link_state) {
2159         case 0:
2160                 link_up = false;
2161                 break;
2162         case 1:
2163                 link_up = true;
2164                 break;
2165         default:
2166                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2167                                   (unsigned int)info->link_state));
2168                 return -1;
2169         }
2170
2171         if (info->references != 0) {
2172                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2173                                   (unsigned int)info->references));
2174                 return -1;
2175         }
2176
2177         iface = ctdb_find_iface(ctdb, info->name);
2178         if (iface == NULL) {
2179                 DEBUG(DEBUG_ERR, (__location__ "iface[%s] is unknown\n",
2180                                   info->name));
2181                 return -1;
2182         }
2183
2184         if (link_up == iface->link_up) {
2185                 return 0;
2186         }
2187
2188         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2189               ("iface[%s] has changed it's link status %s => %s\n",
2190                iface->name,
2191                iface->link_up?"up":"down",
2192                link_up?"up":"down"));
2193
2194         iface->link_up = link_up;
2195         return 0;
2196 }
2197
2198
2199 /* 
2200    structure containing the listening socket and the list of tcp connections
2201    that the ctdb daemon is to kill
2202 */
2203 struct ctdb_kill_tcp {
2204         struct ctdb_vnn *vnn;
2205         struct ctdb_context *ctdb;
2206         int capture_fd;
2207         struct fd_event *fde;
2208         trbt_tree_t *connections;
2209         void *private_data;
2210 };
2211
2212 /*
2213   a tcp connection that is to be killed
2214  */
2215 struct ctdb_killtcp_con {
2216         ctdb_sock_addr src_addr;
2217         ctdb_sock_addr dst_addr;
2218         int count;
2219         struct ctdb_kill_tcp *killtcp;
2220 };
2221
2222 /* this function is used to create a key to represent this socketpair
2223    in the killtcp tree.
2224    this key is used to insert and lookup matching socketpairs that are
2225    to be tickled and RST
2226 */
2227 #define KILLTCP_KEYLEN  10
2228 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2229 {
2230         static uint32_t key[KILLTCP_KEYLEN];
2231
2232         bzero(key, sizeof(key));
2233
2234         if (src->sa.sa_family != dst->sa.sa_family) {
2235                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2236                 return key;
2237         }
2238         
2239         switch (src->sa.sa_family) {
2240         case AF_INET:
2241                 key[0]  = dst->ip.sin_addr.s_addr;
2242                 key[1]  = src->ip.sin_addr.s_addr;
2243                 key[2]  = dst->ip.sin_port;
2244                 key[3]  = src->ip.sin_port;
2245                 break;
2246         case AF_INET6:
2247                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2248                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2249                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2250                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2251                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2252                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2253                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2254                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2255                 key[8]  = dst->ip6.sin6_port;
2256                 key[9]  = src->ip6.sin6_port;
2257                 break;
2258         default:
2259                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2260                 return key;
2261         }
2262
2263         return key;
2264 }
2265
2266 /*
2267   called when we get a read event on the raw socket
2268  */
2269 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2270                                 uint16_t flags, void *private_data)
2271 {
2272         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2273         struct ctdb_killtcp_con *con;
2274         ctdb_sock_addr src, dst;
2275         uint32_t ack_seq, seq;
2276
2277         if (!(flags & EVENT_FD_READ)) {
2278                 return;
2279         }
2280
2281         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2282                                 killtcp->private_data,
2283                                 &src, &dst,
2284                                 &ack_seq, &seq) != 0) {
2285                 /* probably a non-tcp ACK packet */
2286                 return;
2287         }
2288
2289         /* check if we have this guy in our list of connections
2290            to kill
2291         */
2292         con = trbt_lookuparray32(killtcp->connections, 
2293                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2294         if (con == NULL) {
2295                 /* no this was some other packet we can just ignore */
2296                 return;
2297         }
2298
2299         /* This one has been tickled !
2300            now reset him and remove him from the list.
2301          */
2302         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2303                 ntohs(con->dst_addr.ip.sin_port),
2304                 ctdb_addr_to_str(&con->src_addr),
2305                 ntohs(con->src_addr.ip.sin_port)));
2306
2307         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2308         talloc_free(con);
2309 }
2310
2311
2312 /* when traversing the list of all tcp connections to send tickle acks to
2313    (so that we can capture the ack coming back and kill the connection
2314     by a RST)
2315    this callback is called for each connection we are currently trying to kill
2316 */
2317 static void tickle_connection_traverse(void *param, void *data)
2318 {
2319         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2320
2321         /* have tried too many times, just give up */
2322         if (con->count >= 5) {
2323                 /* can't delete in traverse: reparent to delete_cons */
2324                 talloc_steal(param, con);
2325                 return;
2326         }
2327
2328         /* othervise, try tickling it again */
2329         con->count++;
2330         ctdb_sys_send_tcp(
2331                 (ctdb_sock_addr *)&con->dst_addr,
2332                 (ctdb_sock_addr *)&con->src_addr,
2333                 0, 0, 0);
2334 }
2335
2336
2337 /* 
2338    called every second until all sentenced connections have been reset
2339  */
2340 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2341                                               struct timeval t, void *private_data)
2342 {
2343         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2344         void *delete_cons = talloc_new(NULL);
2345
2346         /* loop over all connections sending tickle ACKs */
2347         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2348
2349         /* now we've finished traverse, it's safe to do deletion. */
2350         talloc_free(delete_cons);
2351
2352         /* If there are no more connections to kill we can remove the
2353            entire killtcp structure
2354          */
2355         if ( (killtcp->connections == NULL) || 
2356              (killtcp->connections->root == NULL) ) {
2357                 talloc_free(killtcp);
2358                 return;
2359         }
2360
2361         /* try tickling them again in a seconds time
2362          */
2363         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2364                         ctdb_tickle_sentenced_connections, killtcp);
2365 }
2366
2367 /*
2368   destroy the killtcp structure
2369  */
2370 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2371 {
2372         if (killtcp->vnn) {
2373                 killtcp->vnn->killtcp = NULL;
2374         }
2375         return 0;
2376 }
2377
2378
/* insert callback for the killtcp tree: nothing fancy, the new
   connection structure (parm) unconditionally replaces whatever was
   stored before (data).

   the old one is deliberately not freed here; according to the original
   author it is talloc_stolen by the same node in the tree anyway and
   will be deleted when the new data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        (void)data;

        return replacement;
}
2390
2391 /*
2392   add a tcp socket to the list of connections we want to RST
2393  */
2394 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2395                                        ctdb_sock_addr *s,
2396                                        ctdb_sock_addr *d)
2397 {
2398         ctdb_sock_addr src, dst;
2399         struct ctdb_kill_tcp *killtcp;
2400         struct ctdb_killtcp_con *con;
2401         struct ctdb_vnn *vnn;
2402
2403         ctdb_canonicalize_ip(s, &src);
2404         ctdb_canonicalize_ip(d, &dst);
2405
2406         vnn = find_public_ip_vnn(ctdb, &dst);
2407         if (vnn == NULL) {
2408                 vnn = find_public_ip_vnn(ctdb, &src);
2409         }
2410         if (vnn == NULL) {
2411                 /* if it is not a public ip   it could be our 'single ip' */
2412                 if (ctdb->single_ip_vnn) {
2413                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2414                                 vnn = ctdb->single_ip_vnn;
2415                         }
2416                 }
2417         }
2418         if (vnn == NULL) {
2419                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2420                 return -1;
2421         }
2422
2423         killtcp = vnn->killtcp;
2424         
2425         /* If this is the first connection to kill we must allocate
2426            a new structure
2427          */
2428         if (killtcp == NULL) {
2429                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2430                 CTDB_NO_MEMORY(ctdb, killtcp);
2431
2432                 killtcp->vnn         = vnn;
2433                 killtcp->ctdb        = ctdb;
2434                 killtcp->capture_fd  = -1;
2435                 killtcp->connections = trbt_create(killtcp, 0);
2436
2437                 vnn->killtcp         = killtcp;
2438                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2439         }
2440
2441
2442
2443         /* create a structure that describes this connection we want to
2444            RST and store it in killtcp->connections
2445         */
2446         con = talloc(killtcp, struct ctdb_killtcp_con);
2447         CTDB_NO_MEMORY(ctdb, con);
2448         con->src_addr = src;
2449         con->dst_addr = dst;
2450         con->count    = 0;
2451         con->killtcp  = killtcp;
2452
2453
2454         trbt_insertarray32_callback(killtcp->connections,
2455                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2456                         add_killtcp_callback, con);
2457
2458         /* 
2459            If we dont have a socket to listen on yet we must create it
2460          */
2461         if (killtcp->capture_fd == -1) {
2462                 const char *iface = ctdb_vnn_iface_string(vnn);
2463                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2464                 if (killtcp->capture_fd == -1) {
2465                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2466                                           "socket on iface '%s' for killtcp (%s)\n",
2467                                           iface, strerror(errno)));
2468                         goto failed;
2469                 }
2470         }
2471
2472
2473         if (killtcp->fde == NULL) {
2474                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2475                                             EVENT_FD_READ,
2476                                             capture_tcp_handler, killtcp);
2477                 tevent_fd_set_auto_close(killtcp->fde);
2478
2479                 /* We also need to set up some events to tickle all these connections
2480                    until they are all reset
2481                 */
2482                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2483                                 ctdb_tickle_sentenced_connections, killtcp);
2484         }
2485
2486         /* tickle him once now */
2487         ctdb_sys_send_tcp(
2488                 &con->dst_addr,
2489                 &con->src_addr,
2490                 0, 0, 0);
2491
2492         return 0;
2493
2494 failed:
2495         talloc_free(vnn->killtcp);
2496         vnn->killtcp = NULL;
2497         return -1;
2498 }
2499
2500 /*
2501   kill a TCP connection.
2502  */
2503 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2504 {
2505         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2506
2507         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2508 }
2509
2510 /*
2511   called by a daemon to inform us of the entire list of TCP tickles for
2512   a particular public address.
2513   this control should only be sent by the node that is currently serving
2514   that public address.
2515  */
2516 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2517 {
2518         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2519         struct ctdb_tcp_array *tcparray;
2520         struct ctdb_vnn *vnn;
2521
2522         /* We must at least have tickles.num or else we cant verify the size
2523            of the received data blob
2524          */
2525         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2526                                         tickles.connections)) {
2527                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2528                 return -1;
2529         }
2530
2531         /* verify that the size of data matches what we expect */
2532         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2533                                 tickles.connections)
2534                          + sizeof(struct ctdb_tcp_connection)
2535                                  * list->tickles.num) {
2536                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2537                 return -1;
2538         }       
2539
2540         vnn = find_public_ip_vnn(ctdb, &list->addr);
2541         if (vnn == NULL) {
2542                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
2543                         ctdb_addr_to_str(&list->addr)));
2544
2545                 return 1;
2546         }
2547
2548         /* remove any old ticklelist we might have */
2549         talloc_free(vnn->tcp_array);
2550         vnn->tcp_array = NULL;
2551
2552         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2553         CTDB_NO_MEMORY(ctdb, tcparray);
2554
2555         tcparray->num = list->tickles.num;
2556
2557         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2558         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2559
2560         memcpy(tcparray->connections, &list->tickles.connections[0], 
2561                sizeof(struct ctdb_tcp_connection)*tcparray->num);
2562
2563         /* We now have a new fresh tickle list array for this vnn */
2564         vnn->tcp_array = talloc_steal(vnn, tcparray);
2565         
2566         return 0;
2567 }
2568
2569 /*
2570   called to return the full list of tickles for the puclic address associated 
2571   with the provided vnn
2572  */
2573 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2574 {
2575         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2576         struct ctdb_control_tcp_tickle_list *list;
2577         struct ctdb_tcp_array *tcparray;
2578         int num;
2579         struct ctdb_vnn *vnn;
2580
2581         vnn = find_public_ip_vnn(ctdb, addr);
2582         if (vnn == NULL) {
2583                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
2584                         ctdb_addr_to_str(addr)));
2585
2586                 return 1;
2587         }
2588
2589         tcparray = vnn->tcp_array;
2590         if (tcparray) {
2591                 num = tcparray->num;
2592         } else {
2593                 num = 0;
2594         }
2595
2596         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2597                                 tickles.connections)
2598                         + sizeof(struct ctdb_tcp_connection) * num;
2599
2600         outdata->dptr  = talloc_size(outdata, outdata->dsize);
2601         CTDB_NO_MEMORY(ctdb, outdata->dptr);
2602         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2603
2604         list->addr = *addr;
2605         list->tickles.num = num;
2606         if (num) {
2607                 memcpy(&list->tickles.connections[0], tcparray->connections, 
2608                         sizeof(struct ctdb_tcp_connection) * num);
2609         }
2610
2611         return 0;
2612 }
2613
2614
2615 /*
2616   set the list of all tcp tickles for a public address
2617  */
2618 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
2619                               struct timeval timeout, uint32_t destnode, 
2620                               ctdb_sock_addr *addr,
2621                               struct ctdb_tcp_array *tcparray)
2622 {
2623         int ret, num;
2624         TDB_DATA data;
2625         struct ctdb_control_tcp_tickle_list *list;
2626
2627         if (tcparray) {
2628                 num = tcparray->num;
2629         } else {
2630                 num = 0;
2631         }
2632
2633         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2634                                 tickles.connections) +
2635                         sizeof(struct ctdb_tcp_connection) * num;
2636         data.dptr = talloc_size(ctdb, data.dsize);
2637         CTDB_NO_MEMORY(ctdb, data.dptr);
2638
2639         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2640         list->addr = *addr;
2641         list->tickles.num = num;
2642         if (tcparray) {
2643                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2644         }
2645
2646         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2647                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2648                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2649         if (ret != 0) {
2650                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2651                 return -1;
2652         }
2653
2654         talloc_free(data.dptr);
2655
2656         return ret;
2657 }
2658
2659
2660 /*
2661   perform tickle updates if required
2662  */
2663 static void ctdb_update_tcp_tickles(struct event_context *ev, 
2664                                 struct timed_event *te, 
2665                                 struct timeval t, void *private_data)
2666 {
2667         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2668         int ret;
2669         struct ctdb_vnn *vnn;
2670
2671         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2672                 /* we only send out updates for public addresses that 
2673                    we have taken over
2674                  */
2675                 if (ctdb->pnn != vnn->pnn) {
2676                         continue;
2677                 }
2678                 /* We only send out the updates if we need to */
2679                 if (!vnn->tcp_update_needed) {
2680                         continue;
2681                 }
2682                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
2683                                 TAKEOVER_TIMEOUT(),
2684                                 CTDB_BROADCAST_CONNECTED,
2685                                 &vnn->public_address,
2686                                 vnn->tcp_array);
2687                 if (ret != 0) {
2688                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2689                                 ctdb_addr_to_str(&vnn->public_address)));
2690                 }
2691         }
2692
2693         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2694                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2695                              ctdb_update_tcp_tickles, ctdb);
2696 }               
2697         
2698
2699 /*
2700   start periodic update of tcp tickles
2701  */
2702 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2703 {
2704         ctdb->tickle_update_context = talloc_new(ctdb);
2705
2706         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2707                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2708                              ctdb_update_tcp_tickles, ctdb);
2709 }
2710
2711
2712
2713
2714 struct control_gratious_arp {
2715         struct ctdb_context *ctdb;
2716         ctdb_sock_addr addr;
2717         const char *iface;
2718         int count;
2719 };
2720
2721 /*
2722   send a control_gratuitous arp
2723  */
2724 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2725                                   struct timeval t, void *private_data)
2726 {
2727         int ret;
2728         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2729                                                         struct control_gratious_arp);
2730
2731         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2732         if (ret != 0) {
2733                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2734                                  arp->iface, strerror(errno)));
2735         }
2736
2737
2738         arp->count++;
2739         if (arp->count == CTDB_ARP_REPEAT) {
2740                 talloc_free(arp);
2741                 return;
2742         }
2743
2744         event_add_timed(arp->ctdb->ev, arp, 
2745                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2746                         send_gratious_arp, arp);
2747 }
2748
2749
2750 /*
2751   send a gratious arp 
2752  */
2753 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2754 {
2755         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2756         struct control_gratious_arp *arp;
2757
2758         /* verify the size of indata */
2759         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2760                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2761                                  (unsigned)indata.dsize, 
2762                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2763                 return -1;
2764         }
2765         if (indata.dsize != 
2766                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2767                 + gratious_arp->len ) ){
2768
2769                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2770                         "but should be %u bytes\n", 
2771                          (unsigned)indata.dsize, 
2772                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2773                 return -1;
2774         }
2775
2776
2777         arp = talloc(ctdb, struct control_gratious_arp);
2778         CTDB_NO_MEMORY(ctdb, arp);
2779
2780         arp->ctdb  = ctdb;
2781         arp->addr   = gratious_arp->addr;
2782         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2783         CTDB_NO_MEMORY(ctdb, arp->iface);
2784         arp->count = 0;
2785         
2786         event_add_timed(arp->ctdb->ev, arp, 
2787                         timeval_zero(), send_gratious_arp, arp);
2788
2789         return 0;
2790 }
2791
2792 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2793 {
2794         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2795         int ret;
2796
2797         /* verify the size of indata */
2798         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2799                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2800                 return -1;
2801         }
2802         if (indata.dsize != 
2803                 ( offsetof(struct ctdb_control_ip_iface, iface)
2804                 + pub->len ) ){
2805
2806                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2807                         "but should be %u bytes\n", 
2808                          (unsigned)indata.dsize, 
2809                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2810                 return -1;
2811         }
2812
2813         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2814
2815         if (ret != 0) {
2816                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2817                 return -1;
2818         }
2819
2820         return 0;
2821 }
2822
2823 /*
2824   called when releaseip event finishes for del_public_address
2825  */
2826 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
2827                                 void *private_data)
2828 {
2829         talloc_free(private_data);
2830 }
2831
2832 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2833 {
2834         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2835         struct ctdb_vnn *vnn;
2836         int ret;
2837
2838         /* verify the size of indata */
2839         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2840                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2841                 return -1;
2842         }
2843         if (indata.dsize != 
2844                 ( offsetof(struct ctdb_control_ip_iface, iface)
2845                 + pub->len ) ){
2846
2847                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2848                         "but should be %u bytes\n", 
2849                          (unsigned)indata.dsize, 
2850                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2851                 return -1;
2852         }
2853
2854         /* walk over all public addresses until we find a match */
2855         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2856                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2857                         TALLOC_CTX *mem_ctx;
2858
2859                         DLIST_REMOVE(ctdb->vnn, vnn);
2860                         if (vnn->iface == NULL) {
2861                                 talloc_free(vnn);
2862                                 return 0;
2863                         }
2864
2865                         mem_ctx = talloc_new(ctdb);
2866                         ret = ctdb_event_script_callback(ctdb, 
2867                                          mem_ctx, delete_ip_callback, mem_ctx,
2868                                          false,
2869                                          CTDB_EVENT_RELEASE_IP,
2870                                          "%s %s %u",
2871                                          ctdb_vnn_iface_string(vnn),
2872                                          ctdb_addr_to_str(&vnn->public_address),
2873                                          vnn->public_netmask_bits);
2874                         ctdb_vnn_unassign_iface(ctdb, vnn);
2875                         talloc_free(vnn);
2876                         if (ret != 0) {
2877                                 return -1;
2878                         }
2879                         return 0;
2880                 }
2881         }
2882
2883         return -1;
2884 }
2885
2886 /* This function is called from the recovery daemon to verify that a remote
2887    node has the expected ip allocation.
2888    This is verified against ctdb->ip_tree
2889 */
2890 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
2891 {
2892         struct ctdb_public_ip_list *tmp_ip; 
2893         int i;
2894
2895         if (ctdb->ip_tree == NULL) {
2896                 /* dont know the expected allocation yet, assume remote node
2897                    is correct. */
2898                 return 0;
2899         }
2900
2901         if (ips == NULL) {
2902                 return 0;
2903         }
2904
2905         for (i=0; i<ips->num; i++) {
2906                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
2907                 if (tmp_ip == NULL) {
2908                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
2909                         return -1;
2910                 }
2911
2912                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
2913                         continue;
2914                 }
2915
2916                 if (tmp_ip->pnn != ips->ips[i].pnn) {
2917                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
2918                         return -1;
2919                 }
2920         }
2921
2922         return 0;
2923 }
2924
2925 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
2926 {
2927         struct ctdb_public_ip_list *tmp_ip; 
2928
2929         if (ctdb->ip_tree == NULL) {
2930                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
2931                 return -1;
2932         }
2933
2934         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
2935         if (tmp_ip == NULL) {
2936                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
2937                 return -1;
2938         }
2939
2940         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
2941         tmp_ip->pnn = ip->pnn;
2942
2943         return 0;
2944 }