For deterministic IPs, check if the designated node can host the ip before
[ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout value derived from the takeover_timeout tunable, expressed as an
   offset from the current time.  The expansion reads a variable named
   `ctdb`, so one must be in scope wherever the macro is used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument passed to timeval_current_ofs() when the gratuitous arp
   timer is re-armed in ctdb_control_send_arp() */
#define CTDB_ARP_INTERVAL 1
/* Number of announcement rounds sent for a taken over address before the
   arp state is freed */
#define CTDB_ARP_REPEAT   3
35
36 struct ctdb_takeover_arp {
37         struct ctdb_context *ctdb;
38         uint32_t count;
39         ctdb_sock_addr addr;
40         struct ctdb_tcp_array *tcparray;
41         struct ctdb_vnn *vnn;
42 };
43
44
45 /*
46   lists of tcp endpoints
47  */
48 struct ctdb_tcp_list {
49         struct ctdb_tcp_list *prev, *next;
50         struct ctdb_tcp_connection connection;
51 };
52
53 /*
54   list of clients to kill on IP release
55  */
56 struct ctdb_client_ip {
57         struct ctdb_client_ip *prev, *next;
58         struct ctdb_context *ctdb;
59         ctdb_sock_addr addr;
60         uint32_t client_id;
61 };
62
63
64 /*
65   send a gratuitous arp
66  */
67 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
68                                   struct timeval t, void *private_data)
69 {
70         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
71                                                         struct ctdb_takeover_arp);
72         int i, ret;
73         struct ctdb_tcp_array *tcparray;
74
75         ret = ctdb_sys_send_arp(&arp->addr, arp->vnn->iface);
76         if (ret != 0) {
77                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed (%s)\n", strerror(errno)));
78         }
79
80         tcparray = arp->tcparray;
81         if (tcparray) {
82                 for (i=0;i<tcparray->num;i++) {
83                         struct ctdb_tcp_connection *tcon;
84
85                         tcon = &tcparray->connections[i];
86                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
87                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
88                                 ctdb_addr_to_str(&tcon->src_addr),
89                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
90                         ret = ctdb_sys_send_tcp(
91                                 &tcon->src_addr, 
92                                 &tcon->dst_addr,
93                                 0, 0, 0);
94                         if (ret != 0) {
95                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
96                                         ctdb_addr_to_str(&tcon->src_addr)));
97                         }
98                 }
99         }
100
101         arp->count++;
102
103         if (arp->count == CTDB_ARP_REPEAT) {
104                 talloc_free(arp);
105                 return;
106         }
107
108         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
109                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
110                         ctdb_control_send_arp, arp);
111 }
112
113 struct takeover_callback_state {
114         struct ctdb_req_control *c;
115         ctdb_sock_addr *addr;
116         struct ctdb_vnn *vnn;
117 };
118
119 /*
120   called when takeip event finishes
121  */
122 static void takeover_ip_callback(struct ctdb_context *ctdb, int status, 
123                                  void *private_data)
124 {
125         struct takeover_callback_state *state = 
126                 talloc_get_type(private_data, struct takeover_callback_state);
127         struct ctdb_takeover_arp *arp;
128         struct ctdb_tcp_array *tcparray;
129
130         if (status != 0) {
131                 if (status == -ETIME) {
132                         ctdb_ban_self(ctdb);
133                 }
134                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
135                         ctdb_addr_to_str(state->addr),
136                         state->vnn->iface));
137                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
138                 talloc_free(state);
139                 return;
140         }
141
142         if (!state->vnn->takeover_ctx) {
143                 state->vnn->takeover_ctx = talloc_new(state->vnn);
144                 if (!state->vnn->takeover_ctx) {
145                         goto failed;
146                 }
147         }
148
149         arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
150         if (!arp) goto failed;
151         
152         arp->ctdb = ctdb;
153         arp->addr = *state->addr;
154         arp->vnn  = state->vnn;
155
156         tcparray = state->vnn->tcp_array;
157         if (tcparray) {
158                 /* add all of the known tcp connections for this IP to the
159                    list of tcp connections to send tickle acks for */
160                 arp->tcparray = talloc_steal(arp, tcparray);
161
162                 state->vnn->tcp_array = NULL;
163                 state->vnn->tcp_update_needed = true;
164         }
165
166         event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx, 
167                         timeval_zero(), ctdb_control_send_arp, arp);
168
169         /* the control succeeded */
170         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
171         talloc_free(state);
172         return;
173
174 failed:
175         ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
176         talloc_free(state);
177         return;
178 }
179
180 /*
181   Find the vnn of the node that has a public ip address
182   returns -1 if the address is not known as a public address
183  */
184 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
185 {
186         struct ctdb_vnn *vnn;
187
188         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
189                 if (ctdb_same_ip(&vnn->public_address, addr)) {
190                         return vnn;
191                 }
192         }
193
194         return NULL;
195 }
196
197
198 /*
199   take over an ip address
200  */
201 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
202                                  struct ctdb_req_control *c,
203                                  TDB_DATA indata, 
204                                  bool *async_reply)
205 {
206         int ret;
207         struct takeover_callback_state *state;
208         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
209         struct ctdb_vnn *vnn;
210
211         /* update out vnn list */
212         vnn = find_public_ip_vnn(ctdb, &pip->addr);
213         if (vnn == NULL) {
214                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n", 
215                         ctdb_addr_to_str(&pip->addr)));
216                 return 0;
217         }
218         vnn->pnn = pip->pnn;
219
220         /* if our kernel already has this IP, do nothing */
221         if (ctdb_sys_have_ip(&pip->addr)) {
222                 return 0;
223         }
224
225         state = talloc(vnn, struct takeover_callback_state);
226         CTDB_NO_MEMORY(ctdb, state);
227
228         state->c = talloc_steal(ctdb, c);
229         state->addr = talloc(ctdb, ctdb_sock_addr);
230         CTDB_NO_MEMORY(ctdb, state->addr);
231
232         *state->addr = pip->addr;
233         state->vnn   = vnn;
234
235         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n", 
236                 ctdb_addr_to_str(&pip->addr),
237                 vnn->public_netmask_bits, 
238                 vnn->iface));
239
240         ret = ctdb_event_script_callback(ctdb, 
241                                          state, takeover_ip_callback, state,
242                                          false,
243                                          CTDB_EVENT_TAKE_IP,
244                                          "%s %s %u",
245                                          vnn->iface, 
246                                          talloc_strdup(state, ctdb_addr_to_str(&pip->addr)),
247                                          vnn->public_netmask_bits);
248
249         if (ret != 0) {
250                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
251                         ctdb_addr_to_str(&pip->addr),
252                         vnn->iface));
253                 talloc_free(state);
254                 return -1;
255         }
256
257         /* tell ctdb_control.c that we will be replying asynchronously */
258         *async_reply = true;
259
260         return 0;
261 }
262
263 /*
264   takeover an ip address old v4 style
265  */
266 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
267                                 struct ctdb_req_control *c,
268                                 TDB_DATA indata, 
269                                 bool *async_reply)
270 {
271         TDB_DATA data;
272         
273         data.dsize = sizeof(struct ctdb_public_ip);
274         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
275         CTDB_NO_MEMORY(ctdb, data.dptr);
276         
277         memcpy(data.dptr, indata.dptr, indata.dsize);
278         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
279 }
280
281 /*
282   kill any clients that are registered with a IP that is being released
283  */
284 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
285 {
286         struct ctdb_client_ip *ip;
287
288         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
289                 ctdb_addr_to_str(addr)));
290
291         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
292                 ctdb_sock_addr tmp_addr;
293
294                 tmp_addr = ip->addr;
295                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
296                         ip->client_id,
297                         ctdb_addr_to_str(&ip->addr)));
298
299                 if (ctdb_same_ip(&tmp_addr, addr)) {
300                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
301                                                                      ip->client_id, 
302                                                                      struct ctdb_client);
303                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
304                                 ip->client_id,
305                                 ctdb_addr_to_str(&ip->addr),
306                                 client->pid));
307
308                         if (client->pid != 0) {
309                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
310                                         (unsigned)client->pid,
311                                         ctdb_addr_to_str(addr),
312                                         ip->client_id));
313                                 kill(client->pid, SIGKILL);
314                         }
315                 }
316         }
317 }
318
319 /*
320   called when releaseip event finishes
321  */
322 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
323                                 void *private_data)
324 {
325         struct takeover_callback_state *state = 
326                 talloc_get_type(private_data, struct takeover_callback_state);
327         TDB_DATA data;
328
329         if (status == -ETIME) {
330                 ctdb_ban_self(ctdb);
331         }
332
333         /* send a message to all clients of this node telling them
334            that the cluster has been reconfigured and they should
335            release any sockets on this IP */
336         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
337         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
338         data.dsize = strlen((char *)data.dptr)+1;
339
340         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
341
342         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
343
344         /* kill clients that have registered with this IP */
345         release_kill_clients(ctdb, state->addr);
346         
347         /* the control succeeded */
348         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
349         talloc_free(state);
350 }
351
352 /*
353   release an ip address
354  */
355 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
356                                 struct ctdb_req_control *c,
357                                 TDB_DATA indata, 
358                                 bool *async_reply)
359 {
360         int ret;
361         struct takeover_callback_state *state;
362         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
363         struct ctdb_vnn *vnn;
364
365         /* update our vnn list */
366         vnn = find_public_ip_vnn(ctdb, &pip->addr);
367         if (vnn == NULL) {
368                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
369                         ctdb_addr_to_str(&pip->addr)));
370                 return 0;
371         }
372         vnn->pnn = pip->pnn;
373
374         /* stop any previous arps */
375         talloc_free(vnn->takeover_ctx);
376         vnn->takeover_ctx = NULL;
377
378         if (!ctdb_sys_have_ip(&pip->addr)) {
379                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
380                         ctdb_addr_to_str(&pip->addr),
381                         vnn->public_netmask_bits, 
382                         vnn->iface));
383                 return 0;
384         }
385
386         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%u\n", 
387                 ctdb_addr_to_str(&pip->addr),
388                 vnn->public_netmask_bits, 
389                 vnn->iface,
390                 pip->pnn));
391
392         state = talloc(ctdb, struct takeover_callback_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(state, c);
396         state->addr = talloc(state, ctdb_sock_addr);       
397         CTDB_NO_MEMORY(ctdb, state->addr);
398         *state->addr = pip->addr;
399         state->vnn   = vnn;
400
401         ret = ctdb_event_script_callback(ctdb, 
402                                          state, release_ip_callback, state,
403                                          false,
404                                          CTDB_EVENT_RELEASE_IP,
405                                          "%s %s %u",
406                                          vnn->iface, 
407                                          talloc_strdup(state, ctdb_addr_to_str(&pip->addr)),
408                                          vnn->public_netmask_bits);
409         if (ret != 0) {
410                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
411                         ctdb_addr_to_str(&pip->addr),
412                         vnn->iface));
413                 talloc_free(state);
414                 return -1;
415         }
416
417         /* tell the control that we will be reply asynchronously */
418         *async_reply = true;
419         return 0;
420 }
421
422 /*
423   release an ip address old v4 style
424  */
425 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
426                                 struct ctdb_req_control *c,
427                                 TDB_DATA indata, 
428                                 bool *async_reply)
429 {
430         TDB_DATA data;
431         
432         data.dsize = sizeof(struct ctdb_public_ip);
433         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
434         CTDB_NO_MEMORY(ctdb, data.dptr);
435         
436         memcpy(data.dptr, indata.dptr, indata.dsize);
437         return ctdb_control_release_ip(ctdb, c, data, async_reply);
438 }
439
440
441 static int ctdb_add_public_address(struct ctdb_context *ctdb, ctdb_sock_addr *addr, unsigned mask, const char *iface)
442 {
443         struct ctdb_vnn      *vnn;
444
445         /* Verify that we dont have an entry for this ip yet */
446         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
447                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
448                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
449                                 ctdb_addr_to_str(addr)));
450                         return -1;
451                 }               
452         }
453
454         /* create a new vnn structure for this ip address */
455         vnn = talloc_zero(ctdb, struct ctdb_vnn);
456         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
457         vnn->iface = talloc_strdup(vnn, iface);
458         CTDB_NO_MEMORY(ctdb, vnn->iface);
459         vnn->public_address      = *addr;
460         vnn->public_netmask_bits = mask;
461         vnn->pnn                 = -1;
462         
463         DLIST_ADD(ctdb->vnn, vnn);
464
465         return 0;
466 }
467
468
469 /*
470   setup the event script directory
471 */
472 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
473 {
474         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
475         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
476         return 0;
477 }
478
479 /*
480   setup the public address lists from a file
481 */
482 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
483 {
484         char **lines;
485         int nlines;
486         int i;
487
488         lines = file_lines_load(alist, &nlines, ctdb);
489         if (lines == NULL) {
490                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
491                 return -1;
492         }
493         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
494                 nlines--;
495         }
496
497         for (i=0;i<nlines;i++) {
498                 unsigned mask;
499                 ctdb_sock_addr addr;
500                 const char *addrstr;
501                 const char *iface;
502                 char *tok, *line;
503
504                 line = lines[i];
505                 while ((*line == ' ') || (*line == '\t')) {
506                         line++;
507                 }
508                 if (*line == '#') {
509                         continue;
510                 }
511                 if (strcmp(line, "") == 0) {
512                         continue;
513                 }
514                 tok = strtok(line, " \t");
515                 addrstr = tok;
516                 tok = strtok(NULL, " \t");
517                 if (tok == NULL) {
518                         if (NULL == ctdb->default_public_interface) {
519                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
520                                          i+1));
521                                 talloc_free(lines);
522                                 return -1;
523                         }
524                         iface = ctdb->default_public_interface;
525                 } else {
526                         iface = tok;
527                 }
528
529                 if (!addrstr || !parse_ip_mask(addrstr, iface, &addr, &mask)) {
530                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
531                         talloc_free(lines);
532                         return -1;
533                 }
534                 if (ctdb_add_public_address(ctdb, &addr, mask, iface)) {
535                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
536                         talloc_free(lines);
537                         return -1;
538                 }
539         }
540
541         talloc_free(lines);
542         return 0;
543 }
544
545
546
547
548 struct ctdb_public_ip_list {
549         struct ctdb_public_ip_list *next;
550         uint32_t pnn;
551         ctdb_sock_addr addr;
552 };
553
554
555 /* Given a physical node, return the number of
556    public addresses that is currently assigned to this node.
557 */
558 static int node_ip_coverage(struct ctdb_context *ctdb, 
559         int32_t pnn,
560         struct ctdb_public_ip_list *ips)
561 {
562         int num=0;
563
564         for (;ips;ips=ips->next) {
565                 if (ips->pnn == pnn) {
566                         num++;
567                 }
568         }
569         return num;
570 }
571
572
573 /* Check if this is a public ip known to the node, i.e. can that
574    node takeover this ip ?
575 */
576 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
577                 struct ctdb_public_ip_list *ip)
578 {
579         struct ctdb_all_public_ips *public_ips;
580         int i;
581
582         public_ips = ctdb->nodes[pnn]->public_ips;
583
584         if (public_ips == NULL) {
585                 return -1;
586         }
587
588         for (i=0;i<public_ips->num;i++) {
589                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
590                         /* yes, this node can serve this public ip */
591                         return 0;
592                 }
593         }
594
595         return -1;
596 }
597
598
599 /* search the node lists list for a node to takeover this ip.
600    pick the node that currently are serving the least number of ips
601    so that the ips get spread out evenly.
602 */
603 static int find_takeover_node(struct ctdb_context *ctdb, 
604                 struct ctdb_node_map *nodemap, uint32_t mask, 
605                 struct ctdb_public_ip_list *ip,
606                 struct ctdb_public_ip_list *all_ips)
607 {
608         int pnn, min=0, num;
609         int i;
610
611         pnn    = -1;
612         for (i=0;i<nodemap->num;i++) {
613                 if (nodemap->nodes[i].flags & mask) {
614                         /* This node is not healty and can not be used to serve
615                            a public address 
616                         */
617                         continue;
618                 }
619
620                 /* verify that this node can serve this ip */
621                 if (can_node_serve_ip(ctdb, i, ip)) {
622                         /* no it couldnt   so skip to the next node */
623                         continue;
624                 }
625
626                 num = node_ip_coverage(ctdb, i, all_ips);
627                 /* was this the first node we checked ? */
628                 if (pnn == -1) {
629                         pnn = i;
630                         min  = num;
631                 } else {
632                         if (num < min) {
633                                 pnn = i;
634                                 min  = num;
635                         }
636                 }
637         }       
638         if (pnn == -1) {
639                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
640                         ctdb_addr_to_str(&ip->addr)));
641
642                 return -1;
643         }
644
645         ip->pnn = pnn;
646         return 0;
647 }
648
649 #define IP_KEYLEN       4
650 static uint32_t *ip_key(ctdb_sock_addr *ip)
651 {
652         static uint32_t key[IP_KEYLEN];
653
654         bzero(key, sizeof(key));
655
656         switch (ip->sa.sa_family) {
657         case AF_INET:
658                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
659                 break;
660         case AF_INET6:
661                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
662                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
663                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
664                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
665                 break;
666         default:
667                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
668                 return key;
669         }
670
671         return key;
672 }
673
/*
  callback handed to trbt_insertarray32_callback() by
  create_merged_ip_list(): returns 'parm' unchanged and ignores 'data'
 */
static void *add_ip_callback(void *parm, void *data)
{
	void *entry = parm;

	return entry;
}
678
679 void getips_count_callback(void *param, void *data)
680 {
681         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
682         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
683
684         new_ip->next = *ip_list;
685         *ip_list     = new_ip;
686 }
687
688 static struct ctdb_public_ip_list *
689 create_merged_ip_list(struct ctdb_context *ctdb)
690 {
691         int i, j;
692         struct ctdb_public_ip_list *ip_list;
693         struct ctdb_all_public_ips *public_ips;
694
695         if (ctdb->ip_tree != NULL) {
696                 talloc_free(ctdb->ip_tree);
697                 ctdb->ip_tree = NULL;
698         }
699         ctdb->ip_tree = trbt_create(ctdb, 0);
700
701         for (i=0;i<ctdb->num_nodes;i++) {
702                 public_ips = ctdb->nodes[i]->public_ips;
703
704                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
705                         continue;
706                 }
707
708                 /* there were no public ips for this node */
709                 if (public_ips == NULL) {
710                         continue;
711                 }               
712
713                 for (j=0;j<public_ips->num;j++) {
714                         struct ctdb_public_ip_list *tmp_ip; 
715
716                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
717                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
718                         tmp_ip->pnn  = public_ips->ips[j].pnn;
719                         tmp_ip->addr = public_ips->ips[j].addr;
720                         tmp_ip->next = NULL;
721
722                         trbt_insertarray32_callback(ctdb->ip_tree,
723                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
724                                 add_ip_callback,
725                                 tmp_ip);
726                 }
727         }
728
729         ip_list = NULL;
730         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
731
732         return ip_list;
733 }
734
735 /*
736   make any IP alias changes for public addresses that are necessary 
737  */
738 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
739 {
740         int i, num_healthy, retries, num_ips;
741         struct ctdb_public_ip ip;
742         struct ctdb_public_ipv4 ipv4;
743         uint32_t mask;
744         struct ctdb_public_ip_list *all_ips, *tmp_ip;
745         int maxnode, maxnum=0, minnode, minnum=0, num;
746         TDB_DATA data;
747         struct timeval timeout;
748         struct client_async_data *async_data;
749         struct ctdb_client_control_state *state;
750         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
751
752
753         ZERO_STRUCT(ip);
754
755         /* Count how many completely healthy nodes we have */
756         num_healthy = 0;
757         for (i=0;i<nodemap->num;i++) {
758                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
759                         num_healthy++;
760                 }
761         }
762
763         if (num_healthy > 0) {
764                 /* We have healthy nodes, so only consider them for 
765                    serving public addresses
766                 */
767                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
768         } else {
769                 /* We didnt have any completely healthy nodes so
770                    use "disabled" nodes as a fallback
771                 */
772                 mask = NODE_FLAGS_INACTIVE;
773         }
774
775         /* since nodes only know about those public addresses that
776            can be served by that particular node, no single node has
777            a full list of all public addresses that exist in the cluster.
778            Walk over all node structures and create a merged list of
779            all public addresses that exist in the cluster.
780
781            keep the tree of ips around as ctdb->ip_tree
782         */
783         all_ips = create_merged_ip_list(ctdb);
784
785         /* Count how many ips we have */
786         num_ips = 0;
787         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
788                 num_ips++;
789         }
790
791         /* If we want deterministic ip allocations, i.e. that the ip addresses
792            will always be allocated the same way for a specific set of
793            available/unavailable nodes.
794         */
795         if (1 == ctdb->tunable.deterministic_public_ips) {              
796                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
797                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
798                         int tmp_pnn;
799
800                         tmp_pnn = i%nodemap->num;
801                         if (can_node_serve_ip(ctdb, tmp_pnn, tmp_ip) == 0) {
802                                 tmp_ip->pnn = tmp_pnn;
803                         }
804                 }
805         }
806
807
808         /* mark all public addresses with a masked node as being served by
809            node -1
810         */
811         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
812                 if (tmp_ip->pnn == -1) {
813                         continue;
814                 }
815                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
816                         tmp_ip->pnn = -1;
817                 }
818         }
819
820         /* verify that the assigned nodes can serve that public ip
821            and set it to -1 if not
822         */
823         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
824                 if (tmp_ip->pnn == -1) {
825                         continue;
826                 }
827                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
828                         /* this node can not serve this ip. */
829                         tmp_ip->pnn = -1;
830                 }
831         }
832
833
834         /* now we must redistribute all public addresses with takeover node
835            -1 among the nodes available
836         */
837         retries = 0;
838 try_again:
839         /* loop over all ip's and find a physical node to cover for 
840            each unassigned ip.
841         */
842         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
843                 if (tmp_ip->pnn == -1) {
844                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
845                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
846                                         ctdb_addr_to_str(&tmp_ip->addr)));
847                         }
848                 }
849         }
850
851         /* If we dont want ips to fail back after a node becomes healthy
852            again, we wont even try to reallocat the ip addresses so that
853            they are evenly spread out.
854            This can NOT be used at the same time as DeterministicIPs !
855         */
856         if (1 == ctdb->tunable.no_ip_failback) {
857                 if (1 == ctdb->tunable.deterministic_public_ips) {
858                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
859                 }
860                 goto finished;
861         }
862
863
864         /* now, try to make sure the ip adresses are evenly distributed
865            across the node.
866            for each ip address, loop over all nodes that can serve this
867            ip and make sure that the difference between the node
868            serving the most and the node serving the least ip's are not greater
869            than 1.
870         */
871         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
872                 if (tmp_ip->pnn == -1) {
873                         continue;
874                 }
875
876                 /* Get the highest and lowest number of ips's served by any 
877                    valid node which can serve this ip.
878                 */
879                 maxnode = -1;
880                 minnode = -1;
881                 for (i=0;i<nodemap->num;i++) {
882                         if (nodemap->nodes[i].flags & mask) {
883                                 continue;
884                         }
885
886                         /* only check nodes that can actually serve this ip */
887                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
888                                 /* no it couldnt   so skip to the next node */
889                                 continue;
890                         }
891
892                         num = node_ip_coverage(ctdb, i, all_ips);
893                         if (maxnode == -1) {
894                                 maxnode = i;
895                                 maxnum  = num;
896                         } else {
897                                 if (num > maxnum) {
898                                         maxnode = i;
899                                         maxnum  = num;
900                                 }
901                         }
902                         if (minnode == -1) {
903                                 minnode = i;
904                                 minnum  = num;
905                         } else {
906                                 if (num < minnum) {
907                                         minnode = i;
908                                         minnum  = num;
909                                 }
910                         }
911                 }
912                 if (maxnode == -1) {
913                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
914                                 ctdb_addr_to_str(&tmp_ip->addr)));
915
916                         continue;
917                 }
918
919                 /* If we want deterministic IPs then dont try to reallocate 
920                    them to spread out the load.
921                 */
922                 if (1 == ctdb->tunable.deterministic_public_ips) {
923                         continue;
924                 }
925
926                 /* if the spread between the smallest and largest coverage by
927                    a node is >=2 we steal one of the ips from the node with
928                    most coverage to even things out a bit.
929                    try to do this a limited number of times since we dont
930                    want to spend too much time balancing the ip coverage.
931                 */
932                 if ( (maxnum > minnum+1)
933                   && (retries < (num_ips + 5)) ){
934                         struct ctdb_public_ip_list *tmp;
935
936                         /* mark one of maxnode's vnn's as unassigned and try
937                            again
938                         */
939                         for (tmp=all_ips;tmp;tmp=tmp->next) {
940                                 if (tmp->pnn == maxnode) {
941                                         tmp->pnn = -1;
942                                         retries++;
943                                         goto try_again;
944                                 }
945                         }
946                 }
947         }
948
949
950         /* finished distributing the public addresses, now just send the 
951            info out to the nodes
952         */
953 finished:
954
955         /* at this point ->pnn is the node which will own each IP
956            or -1 if there is no node that can cover this ip
957         */
958
959         /* now tell all nodes to delete any alias that they should not
960            have.  This will be a NOOP on nodes that don't currently
961            hold the given alias */
962         async_data = talloc_zero(tmp_ctx, struct client_async_data);
963         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
964
965         for (i=0;i<nodemap->num;i++) {
966                 /* don't talk to unconnected nodes, but do talk to banned nodes */
967                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
968                         continue;
969                 }
970
971                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
972                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
973                                 /* This node should be serving this
974                                    vnn so dont tell it to release the ip
975                                 */
976                                 continue;
977                         }
978                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
979                                 ipv4.pnn = tmp_ip->pnn;
980                                 ipv4.sin = tmp_ip->addr.ip;
981
982                                 timeout = TAKEOVER_TIMEOUT();
983                                 data.dsize = sizeof(ipv4);
984                                 data.dptr  = (uint8_t *)&ipv4;
985                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
986                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
987                                                 data, async_data,
988                                                 &timeout, NULL);
989                         } else {
990                                 ip.pnn  = tmp_ip->pnn;
991                                 ip.addr = tmp_ip->addr;
992
993                                 timeout = TAKEOVER_TIMEOUT();
994                                 data.dsize = sizeof(ip);
995                                 data.dptr  = (uint8_t *)&ip;
996                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
997                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
998                                                 data, async_data,
999                                                 &timeout, NULL);
1000                         }
1001
1002                         if (state == NULL) {
1003                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1004                                 talloc_free(tmp_ctx);
1005                                 return -1;
1006                         }
1007                 
1008                         ctdb_client_async_add(async_data, state);
1009                 }
1010         }
1011         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1012                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1013                 talloc_free(tmp_ctx);
1014                 return -1;
1015         }
1016         talloc_free(async_data);
1017
1018
1019         /* tell all nodes to get their own IPs */
1020         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1021         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1022         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1023                 if (tmp_ip->pnn == -1) {
1024                         /* this IP won't be taken over */
1025                         continue;
1026                 }
1027
1028                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1029                         ipv4.pnn = tmp_ip->pnn;
1030                         ipv4.sin = tmp_ip->addr.ip;
1031
1032                         timeout = TAKEOVER_TIMEOUT();
1033                         data.dsize = sizeof(ipv4);
1034                         data.dptr  = (uint8_t *)&ipv4;
1035                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1036                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1037                                         data, async_data,
1038                                         &timeout, NULL);
1039                 } else {
1040                         ip.pnn  = tmp_ip->pnn;
1041                         ip.addr = tmp_ip->addr;
1042
1043                         timeout = TAKEOVER_TIMEOUT();
1044                         data.dsize = sizeof(ip);
1045                         data.dptr  = (uint8_t *)&ip;
1046                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1047                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1048                                         data, async_data,
1049                                         &timeout, NULL);
1050                 }
1051                 if (state == NULL) {
1052                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1053                         talloc_free(tmp_ctx);
1054                         return -1;
1055                 }
1056                 
1057                 ctdb_client_async_add(async_data, state);
1058         }
1059         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1060                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1061                 talloc_free(tmp_ctx);
1062                 return -1;
1063         }
1064
1065         talloc_free(tmp_ctx);
1066         return 0;
1067 }
1068
1069
1070 /*
1071   destroy a ctdb_client_ip structure
1072  */
1073 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1074 {
1075         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1076                 ctdb_addr_to_str(&ip->addr),
1077                 ntohs(ip->addr.ip.sin_port),
1078                 ip->client_id));
1079
1080         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1081         return 0;
1082 }
1083
1084 /*
1085   called by a client to inform us of a TCP connection that it is managing
1086   that should tickled with an ACK when IP takeover is done
1087   we handle both the old ipv4 style of packets as well as the new ipv4/6
1088   pdus.
1089  */
1090 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1091                                 TDB_DATA indata)
1092 {
1093         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1094         struct ctdb_control_tcp *old_addr = NULL;
1095         struct ctdb_control_tcp_addr new_addr;
1096         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1097         struct ctdb_tcp_list *tcp;
1098         struct ctdb_control_tcp_vnn t;
1099         int ret;
1100         TDB_DATA data;
1101         struct ctdb_client_ip *ip;
1102         struct ctdb_vnn *vnn;
1103         ctdb_sock_addr addr;
1104
1105         switch (indata.dsize) {
1106         case sizeof(struct ctdb_control_tcp):
1107                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1108                 ZERO_STRUCT(new_addr);
1109                 tcp_sock = &new_addr;
1110                 tcp_sock->src.ip  = old_addr->src;
1111                 tcp_sock->dest.ip = old_addr->dest;
1112                 break;
1113         case sizeof(struct ctdb_control_tcp_addr):
1114                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1115                 break;
1116         default:
1117                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1118                                  "to ctdb_control_tcp_client. size was %d but "
1119                                  "only allowed sizes are %lu and %lu\n",
1120                                  (int)indata.dsize,
1121                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1122                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1123                 return -1;
1124         }
1125
1126         addr = tcp_sock->src;
1127         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1128         addr = tcp_sock->dest;
1129         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1130
1131         ZERO_STRUCT(addr);
1132         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1133         vnn = find_public_ip_vnn(ctdb, &addr);
1134         if (vnn == NULL) {
1135                 switch (addr.sa.sa_family) {
1136                 case AF_INET:
1137                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1138                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1139                                         ctdb_addr_to_str(&addr)));
1140                         }
1141                         break;
1142                 case AF_INET6:
1143                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1144                                 ctdb_addr_to_str(&addr)));
1145                         break;
1146                 default:
1147                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1148                 }
1149
1150                 return 0;
1151         }
1152
1153         if (vnn->pnn != ctdb->pnn) {
1154                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1155                         ctdb_addr_to_str(&addr),
1156                         client_id, client->pid));
1157                 /* failing this call will tell smbd to die */
1158                 return -1;
1159         }
1160
1161         ip = talloc(client, struct ctdb_client_ip);
1162         CTDB_NO_MEMORY(ctdb, ip);
1163
1164         ip->ctdb      = ctdb;
1165         ip->addr      = addr;
1166         ip->client_id = client_id;
1167         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1168         DLIST_ADD(ctdb->client_ip_list, ip);
1169
1170         tcp = talloc(client, struct ctdb_tcp_list);
1171         CTDB_NO_MEMORY(ctdb, tcp);
1172
1173         tcp->connection.src_addr = tcp_sock->src;
1174         tcp->connection.dst_addr = tcp_sock->dest;
1175
1176         DLIST_ADD(client->tcp_list, tcp);
1177
1178         t.src  = tcp_sock->src;
1179         t.dest = tcp_sock->dest;
1180
1181         data.dptr = (uint8_t *)&t;
1182         data.dsize = sizeof(t);
1183
1184         switch (addr.sa.sa_family) {
1185         case AF_INET:
1186                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1187                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1188                         ctdb_addr_to_str(&tcp_sock->src),
1189                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1190                 break;
1191         case AF_INET6:
1192                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1193                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1194                         ctdb_addr_to_str(&tcp_sock->src),
1195                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1196                 break;
1197         default:
1198                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1199         }
1200
1201
1202         /* tell all nodes about this tcp connection */
1203         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1204                                        CTDB_CONTROL_TCP_ADD,
1205                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1206         if (ret != 0) {
1207                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1208                 return -1;
1209         }
1210
1211         return 0;
1212 }
1213
1214 /*
1215   find a tcp address on a list
1216  */
1217 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1218                                            struct ctdb_tcp_connection *tcp)
1219 {
1220         int i;
1221
1222         if (array == NULL) {
1223                 return NULL;
1224         }
1225
1226         for (i=0;i<array->num;i++) {
1227                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1228                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1229                         return &array->connections[i];
1230                 }
1231         }
1232         return NULL;
1233 }
1234
1235 /*
1236   called by a daemon to inform us of a TCP connection that one of its
1237   clients managing that should tickled with an ACK when IP takeover is
1238   done
1239  */
1240 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
1241 {
1242         struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
1243         struct ctdb_tcp_array *tcparray;
1244         struct ctdb_tcp_connection tcp;
1245         struct ctdb_vnn *vnn;
1246
1247         vnn = find_public_ip_vnn(ctdb, &p->dest);
1248         if (vnn == NULL) {
1249                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1250                         ctdb_addr_to_str(&p->dest)));
1251
1252                 return -1;
1253         }
1254
1255
1256         tcparray = vnn->tcp_array;
1257
1258         /* If this is the first tickle */
1259         if (tcparray == NULL) {
1260                 tcparray = talloc_size(ctdb->nodes, 
1261                         offsetof(struct ctdb_tcp_array, connections) +
1262                         sizeof(struct ctdb_tcp_connection) * 1);
1263                 CTDB_NO_MEMORY(ctdb, tcparray);
1264                 vnn->tcp_array = tcparray;
1265
1266                 tcparray->num = 0;
1267                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1268                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1269
1270                 tcparray->connections[tcparray->num].src_addr = p->src;
1271                 tcparray->connections[tcparray->num].dst_addr = p->dest;
1272                 tcparray->num++;
1273                 return 0;
1274         }
1275
1276
1277         /* Do we already have this tickle ?*/
1278         tcp.src_addr = p->src;
1279         tcp.dst_addr = p->dest;
1280         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1281                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1282                         ctdb_addr_to_str(&tcp.dst_addr),
1283                         ntohs(tcp.dst_addr.ip.sin_port),
1284                         vnn->pnn));
1285                 return 0;
1286         }
1287
1288         /* A new tickle, we must add it to the array */
1289         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1290                                         struct ctdb_tcp_connection,
1291                                         tcparray->num+1);
1292         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1293
1294         vnn->tcp_array = tcparray;
1295         tcparray->connections[tcparray->num].src_addr = p->src;
1296         tcparray->connections[tcparray->num].dst_addr = p->dest;
1297         tcparray->num++;
1298                                 
1299         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1300                 ctdb_addr_to_str(&tcp.dst_addr),
1301                 ntohs(tcp.dst_addr.ip.sin_port),
1302                 vnn->pnn));
1303
1304         return 0;
1305 }
1306
1307
1308 /*
1309   called by a daemon to inform us of a TCP connection that one of its
1310   clients managing that should tickled with an ACK when IP takeover is
1311   done
1312  */
1313 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1314 {
1315         struct ctdb_tcp_connection *tcpp;
1316         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1317
1318         if (vnn == NULL) {
1319                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1320                         ctdb_addr_to_str(&conn->dst_addr)));
1321                 return;
1322         }
1323
1324         /* if the array is empty we cant remove it
1325            and we dont need to do anything
1326          */
1327         if (vnn->tcp_array == NULL) {
1328                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1329                         ctdb_addr_to_str(&conn->dst_addr),
1330                         ntohs(conn->dst_addr.ip.sin_port)));
1331                 return;
1332         }
1333
1334
1335         /* See if we know this connection
1336            if we dont know this connection  then we dont need to do anything
1337          */
1338         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1339         if (tcpp == NULL) {
1340                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1341                         ctdb_addr_to_str(&conn->dst_addr),
1342                         ntohs(conn->dst_addr.ip.sin_port)));
1343                 return;
1344         }
1345
1346
1347         /* We need to remove this entry from the array.
1348            Instead of allocating a new array and copying data to it
1349            we cheat and just copy the last entry in the existing array
1350            to the entry that is to be removed and just shring the 
1351            ->num field
1352          */
1353         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1354         vnn->tcp_array->num--;
1355
1356         /* If we deleted the last entry we also need to remove the entire array
1357          */
1358         if (vnn->tcp_array->num == 0) {
1359                 talloc_free(vnn->tcp_array);
1360                 vnn->tcp_array = NULL;
1361         }               
1362
1363         vnn->tcp_update_needed = true;
1364
1365         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1366                 ctdb_addr_to_str(&conn->src_addr),
1367                 ntohs(conn->src_addr.ip.sin_port)));
1368 }
1369
1370
/*
  called when a daemon restarts.
  intended to send all tickles for all public addresses we are serving
  to the new node; that is not implemented yet, so this is currently a
  stub that ignores its arguments and reports success (0).
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
1380
1381
1382 /*
1383   called when a client structure goes away - hook to remove
1384   elements from the tcp_list in all daemons
1385  */
1386 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1387 {
1388         while (client->tcp_list) {
1389                 struct ctdb_tcp_list *tcp = client->tcp_list;
1390                 DLIST_REMOVE(client->tcp_list, tcp);
1391                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1392         }
1393 }
1394
1395
1396 /*
1397   release all IPs on shutdown
1398  */
1399 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1400 {
1401         struct ctdb_vnn *vnn;
1402
1403         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1404                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1405                         continue;
1406                 }
1407                 if (vnn->pnn == ctdb->pnn) {
1408                         vnn->pnn = -1;
1409                 }
1410                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1411                                   vnn->iface, 
1412                                   talloc_strdup(ctdb, ctdb_addr_to_str(&vnn->public_address)),
1413                                   vnn->public_netmask_bits);
1414                 release_kill_clients(ctdb, &vnn->public_address);
1415         }
1416 }
1417
1418
1419 /*
1420   get list of public IPs
1421  */
1422 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1423                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1424 {
1425         int i, num, len;
1426         struct ctdb_all_public_ips *ips;
1427         struct ctdb_vnn *vnn;
1428
1429         /* count how many public ip structures we have */
1430         num = 0;
1431         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1432                 num++;
1433         }
1434
1435         len = offsetof(struct ctdb_all_public_ips, ips) + 
1436                 num*sizeof(struct ctdb_public_ip);
1437         ips = talloc_zero_size(outdata, len);
1438         CTDB_NO_MEMORY(ctdb, ips);
1439
1440         outdata->dsize = len;
1441         outdata->dptr  = (uint8_t *)ips;
1442
1443         ips->num = num;
1444         i = 0;
1445         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1446                 ips->ips[i].pnn  = vnn->pnn;
1447                 ips->ips[i].addr = vnn->public_address;
1448                 i++;
1449         }
1450
1451         return 0;
1452 }
1453
1454
1455 /*
1456   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
1457  */
1458 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
1459                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1460 {
1461         int i, num, len;
1462         struct ctdb_all_public_ipsv4 *ips;
1463         struct ctdb_vnn *vnn;
1464
1465         /* count how many public ip structures we have */
1466         num = 0;
1467         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1468                 if (vnn->public_address.sa.sa_family != AF_INET) {
1469                         continue;
1470                 }
1471                 num++;
1472         }
1473
1474         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
1475                 num*sizeof(struct ctdb_public_ipv4);
1476         ips = talloc_zero_size(outdata, len);
1477         CTDB_NO_MEMORY(ctdb, ips);
1478
1479         outdata->dsize = len;
1480         outdata->dptr  = (uint8_t *)ips;
1481
1482         ips->num = num;
1483         i = 0;
1484         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1485                 if (vnn->public_address.sa.sa_family != AF_INET) {
1486                         continue;
1487                 }
1488                 ips->ips[i].pnn = vnn->pnn;
1489                 ips->ips[i].sin = vnn->public_address.ip;
1490                 i++;
1491         }
1492
1493         return 0;
1494 }
1495
1496
1497 /* 
1498    structure containing the listening socket and the list of tcp connections
1499    that the ctdb daemon is to kill
1500 */
1501 struct ctdb_kill_tcp {
1502         struct ctdb_vnn *vnn;
1503         struct ctdb_context *ctdb;
1504         int capture_fd;
1505         struct fd_event *fde;
1506         trbt_tree_t *connections;
1507         void *private_data;
1508 };
1509
1510 /*
1511   a tcp connection that is to be killed
1512  */
1513 struct ctdb_killtcp_con {
1514         ctdb_sock_addr src_addr;
1515         ctdb_sock_addr dst_addr;
1516         int count;
1517         struct ctdb_kill_tcp *killtcp;
1518 };
1519
1520 /* this function is used to create a key to represent this socketpair
1521    in the killtcp tree.
1522    this key is used to insert and lookup matching socketpairs that are
1523    to be tickled and RST
1524 */
1525 #define KILLTCP_KEYLEN  10
1526 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
1527 {
1528         static uint32_t key[KILLTCP_KEYLEN];
1529
1530         bzero(key, sizeof(key));
1531
1532         if (src->sa.sa_family != dst->sa.sa_family) {
1533                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
1534                 return key;
1535         }
1536         
1537         switch (src->sa.sa_family) {
1538         case AF_INET:
1539                 key[0]  = dst->ip.sin_addr.s_addr;
1540                 key[1]  = src->ip.sin_addr.s_addr;
1541                 key[2]  = dst->ip.sin_port;
1542                 key[3]  = src->ip.sin_port;
1543                 break;
1544         case AF_INET6:
1545                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
1546                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
1547                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
1548                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
1549                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
1550                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
1551                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
1552                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
1553                 key[8]  = dst->ip6.sin6_port;
1554                 key[9]  = src->ip6.sin6_port;
1555                 break;
1556         default:
1557                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
1558                 return key;
1559         }
1560
1561         return key;
1562 }
1563
1564 /*
1565   called when we get a read event on the raw socket
1566  */
1567 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
1568                                 uint16_t flags, void *private_data)
1569 {
1570         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1571         struct ctdb_killtcp_con *con;
1572         ctdb_sock_addr src, dst;
1573         uint32_t ack_seq, seq;
1574
1575         if (!(flags & EVENT_FD_READ)) {
1576                 return;
1577         }
1578
1579         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1580                                 killtcp->private_data,
1581                                 &src, &dst,
1582                                 &ack_seq, &seq) != 0) {
1583                 /* probably a non-tcp ACK packet */
1584                 return;
1585         }
1586
1587         /* check if we have this guy in our list of connections
1588            to kill
1589         */
1590         con = trbt_lookuparray32(killtcp->connections, 
1591                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1592         if (con == NULL) {
1593                 /* no this was some other packet we can just ignore */
1594                 return;
1595         }
1596
1597         /* This one has been tickled !
1598            now reset him and remove him from the list.
1599          */
1600         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
1601                 ntohs(con->dst_addr.ip.sin_port),
1602                 ctdb_addr_to_str(&con->src_addr),
1603                 ntohs(con->src_addr.ip.sin_port)));
1604
1605         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
1606         talloc_free(con);
1607 }
1608
1609
1610 /* when traversing the list of all tcp connections to send tickle acks to
1611    (so that we can capture the ack coming back and kill the connection
1612     by a RST)
1613    this callback is called for each connection we are currently trying to kill
1614 */
1615 static void tickle_connection_traverse(void *param, void *data)
1616 {
1617         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1618
1619         /* have tried too many times, just give up */
1620         if (con->count >= 5) {
1621                 /* can't delete in traverse: reparent to delete_cons */
1622                 talloc_steal(param, con);
1623                 return;
1624         }
1625
1626         /* othervise, try tickling it again */
1627         con->count++;
1628         ctdb_sys_send_tcp(
1629                 (ctdb_sock_addr *)&con->dst_addr,
1630                 (ctdb_sock_addr *)&con->src_addr,
1631                 0, 0, 0);
1632 }
1633
1634
1635 /* 
1636    called every second until all sentenced connections have been reset
1637  */
1638 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
1639                                               struct timeval t, void *private_data)
1640 {
1641         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1642         void *delete_cons = talloc_new(NULL);
1643
1644         /* loop over all connections sending tickle ACKs */
1645         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
1646
1647         /* now we've finished traverse, it's safe to do deletion. */
1648         talloc_free(delete_cons);
1649
1650         /* If there are no more connections to kill we can remove the
1651            entire killtcp structure
1652          */
1653         if ( (killtcp->connections == NULL) || 
1654              (killtcp->connections->root == NULL) ) {
1655                 talloc_free(killtcp);
1656                 return;
1657         }
1658
1659         /* try tickling them again in a seconds time
1660          */
1661         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
1662                         ctdb_tickle_sentenced_connections, killtcp);
1663 }
1664
1665 /*
1666   destroy the killtcp structure
1667  */
1668 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
1669 {
1670         if (killtcp->vnn) {
1671                 killtcp->vnn->killtcp = NULL;
1672         }
1673         return 0;
1674 }
1675
1676
/* insert callback used when storing a connection in the killtcp tree.

   nothing fancy: the new connection structure (parm) always wins over
   whatever (data) was stored under the same key before.

   the old structure is deliberately not freed here: it is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new
   data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
1688
1689 /*
1690   add a tcp socket to the list of connections we want to RST
1691  */
1692 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
1693                                        ctdb_sock_addr *s,
1694                                        ctdb_sock_addr *d)
1695 {
1696         ctdb_sock_addr src, dst;
1697         struct ctdb_kill_tcp *killtcp;
1698         struct ctdb_killtcp_con *con;
1699         struct ctdb_vnn *vnn;
1700
1701         ctdb_canonicalize_ip(s, &src);
1702         ctdb_canonicalize_ip(d, &dst);
1703
1704         vnn = find_public_ip_vnn(ctdb, &dst);
1705         if (vnn == NULL) {
1706                 vnn = find_public_ip_vnn(ctdb, &src);
1707         }
1708         if (vnn == NULL) {
1709                 /* if it is not a public ip   it could be our 'single ip' */
1710                 if (ctdb->single_ip_vnn) {
1711                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
1712                                 vnn = ctdb->single_ip_vnn;
1713                         }
1714                 }
1715         }
1716         if (vnn == NULL) {
1717                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
1718                 return -1;
1719         }
1720
1721         killtcp = vnn->killtcp;
1722         
1723         /* If this is the first connection to kill we must allocate
1724            a new structure
1725          */
1726         if (killtcp == NULL) {
1727                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
1728                 CTDB_NO_MEMORY(ctdb, killtcp);
1729
1730                 killtcp->vnn         = vnn;
1731                 killtcp->ctdb        = ctdb;
1732                 killtcp->capture_fd  = -1;
1733                 killtcp->connections = trbt_create(killtcp, 0);
1734
1735                 vnn->killtcp         = killtcp;
1736                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
1737         }
1738
1739
1740
1741         /* create a structure that describes this connection we want to
1742            RST and store it in killtcp->connections
1743         */
1744         con = talloc(killtcp, struct ctdb_killtcp_con);
1745         CTDB_NO_MEMORY(ctdb, con);
1746         con->src_addr = src;
1747         con->dst_addr = dst;
1748         con->count    = 0;
1749         con->killtcp  = killtcp;
1750
1751
1752         trbt_insertarray32_callback(killtcp->connections,
1753                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
1754                         add_killtcp_callback, con);
1755
1756         /* 
1757            If we dont have a socket to listen on yet we must create it
1758          */
1759         if (killtcp->capture_fd == -1) {
1760                 killtcp->capture_fd = ctdb_sys_open_capture_socket(vnn->iface, &killtcp->private_data);
1761                 if (killtcp->capture_fd == -1) {
1762                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing socket for killtcp\n"));
1763                         goto failed;
1764                 }
1765         }
1766
1767
1768         if (killtcp->fde == NULL) {
1769                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
1770                                             EVENT_FD_READ | EVENT_FD_AUTOCLOSE, 
1771                                             capture_tcp_handler, killtcp);
1772
1773                 /* We also need to set up some events to tickle all these connections
1774                    until they are all reset
1775                 */
1776                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
1777                                 ctdb_tickle_sentenced_connections, killtcp);
1778         }
1779
1780         /* tickle him once now */
1781         ctdb_sys_send_tcp(
1782                 &con->dst_addr,
1783                 &con->src_addr,
1784                 0, 0, 0);
1785
1786         return 0;
1787
1788 failed:
1789         talloc_free(vnn->killtcp);
1790         vnn->killtcp = NULL;
1791         return -1;
1792 }
1793
1794 /*
1795   kill a TCP connection.
1796  */
1797 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
1798 {
1799         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
1800
1801         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
1802 }
1803
1804 /*
1805   called by a daemon to inform us of the entire list of TCP tickles for
1806   a particular public address.
1807   this control should only be sent by the node that is currently serving
1808   that public address.
1809  */
1810 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
1811 {
1812         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
1813         struct ctdb_tcp_array *tcparray;
1814         struct ctdb_vnn *vnn;
1815
1816         /* We must at least have tickles.num or else we cant verify the size
1817            of the received data blob
1818          */
1819         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
1820                                         tickles.connections)) {
1821                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
1822                 return -1;
1823         }
1824
1825         /* verify that the size of data matches what we expect */
1826         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
1827                                 tickles.connections)
1828                          + sizeof(struct ctdb_tcp_connection)
1829                                  * list->tickles.num) {
1830                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
1831                 return -1;
1832         }       
1833
1834         vnn = find_public_ip_vnn(ctdb, &list->addr);
1835         if (vnn == NULL) {
1836                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
1837                         ctdb_addr_to_str(&list->addr)));
1838
1839                 return 1;
1840         }
1841
1842         /* remove any old ticklelist we might have */
1843         talloc_free(vnn->tcp_array);
1844         vnn->tcp_array = NULL;
1845
1846         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
1847         CTDB_NO_MEMORY(ctdb, tcparray);
1848
1849         tcparray->num = list->tickles.num;
1850
1851         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
1852         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1853
1854         memcpy(tcparray->connections, &list->tickles.connections[0], 
1855                sizeof(struct ctdb_tcp_connection)*tcparray->num);
1856
1857         /* We now have a new fresh tickle list array for this vnn */
1858         vnn->tcp_array = talloc_steal(vnn, tcparray);
1859         
1860         return 0;
1861 }
1862
1863 /*
1864   called to return the full list of tickles for the puclic address associated 
1865   with the provided vnn
1866  */
1867 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1868 {
1869         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
1870         struct ctdb_control_tcp_tickle_list *list;
1871         struct ctdb_tcp_array *tcparray;
1872         int num;
1873         struct ctdb_vnn *vnn;
1874
1875         vnn = find_public_ip_vnn(ctdb, addr);
1876         if (vnn == NULL) {
1877                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
1878                         ctdb_addr_to_str(addr)));
1879
1880                 return 1;
1881         }
1882
1883         tcparray = vnn->tcp_array;
1884         if (tcparray) {
1885                 num = tcparray->num;
1886         } else {
1887                 num = 0;
1888         }
1889
1890         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
1891                                 tickles.connections)
1892                         + sizeof(struct ctdb_tcp_connection) * num;
1893
1894         outdata->dptr  = talloc_size(outdata, outdata->dsize);
1895         CTDB_NO_MEMORY(ctdb, outdata->dptr);
1896         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
1897
1898         list->addr = *addr;
1899         list->tickles.num = num;
1900         if (num) {
1901                 memcpy(&list->tickles.connections[0], tcparray->connections, 
1902                         sizeof(struct ctdb_tcp_connection) * num);
1903         }
1904
1905         return 0;
1906 }
1907
1908
1909 /*
1910   set the list of all tcp tickles for a public address
1911  */
1912 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
1913                               struct timeval timeout, uint32_t destnode, 
1914                               ctdb_sock_addr *addr,
1915                               struct ctdb_tcp_array *tcparray)
1916 {
1917         int ret, num;
1918         TDB_DATA data;
1919         struct ctdb_control_tcp_tickle_list *list;
1920
1921         if (tcparray) {
1922                 num = tcparray->num;
1923         } else {
1924                 num = 0;
1925         }
1926
1927         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
1928                                 tickles.connections) +
1929                         sizeof(struct ctdb_tcp_connection) * num;
1930         data.dptr = talloc_size(ctdb, data.dsize);
1931         CTDB_NO_MEMORY(ctdb, data.dptr);
1932
1933         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
1934         list->addr = *addr;
1935         list->tickles.num = num;
1936         if (tcparray) {
1937                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
1938         }
1939
1940         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1941                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
1942                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1943         if (ret != 0) {
1944                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
1945                 return -1;
1946         }
1947
1948         talloc_free(data.dptr);
1949
1950         return ret;
1951 }
1952
1953
1954 /*
1955   perform tickle updates if required
1956  */
1957 static void ctdb_update_tcp_tickles(struct event_context *ev, 
1958                                 struct timed_event *te, 
1959                                 struct timeval t, void *private_data)
1960 {
1961         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1962         int ret;
1963         struct ctdb_vnn *vnn;
1964
1965         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1966                 /* we only send out updates for public addresses that 
1967                    we have taken over
1968                  */
1969                 if (ctdb->pnn != vnn->pnn) {
1970                         continue;
1971                 }
1972                 /* We only send out the updates if we need to */
1973                 if (!vnn->tcp_update_needed) {
1974                         continue;
1975                 }
1976                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
1977                                 TAKEOVER_TIMEOUT(),
1978                                 CTDB_BROADCAST_CONNECTED,
1979                                 &vnn->public_address,
1980                                 vnn->tcp_array);
1981                 if (ret != 0) {
1982                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
1983                                 ctdb_addr_to_str(&vnn->public_address)));
1984                 }
1985         }
1986
1987         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
1988                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
1989                              ctdb_update_tcp_tickles, ctdb);
1990 }               
1991         
1992
1993 /*
1994   start periodic update of tcp tickles
1995  */
1996 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
1997 {
1998         ctdb->tickle_update_context = talloc_new(ctdb);
1999
2000         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2001                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2002                              ctdb_update_tcp_tickles, ctdb);
2003 }
2004
2005
2006
2007
2008 struct control_gratious_arp {
2009         struct ctdb_context *ctdb;
2010         ctdb_sock_addr addr;
2011         const char *iface;
2012         int count;
2013 };
2014
2015 /*
2016   send a control_gratuitous arp
2017  */
2018 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2019                                   struct timeval t, void *private_data)
2020 {
2021         int ret;
2022         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2023                                                         struct control_gratious_arp);
2024
2025         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2026         if (ret != 0) {
2027                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp failed (%s)\n", strerror(errno)));
2028         }
2029
2030
2031         arp->count++;
2032         if (arp->count == CTDB_ARP_REPEAT) {
2033                 talloc_free(arp);
2034                 return;
2035         }
2036
2037         event_add_timed(arp->ctdb->ev, arp, 
2038                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2039                         send_gratious_arp, arp);
2040 }
2041
2042
2043 /*
2044   send a gratious arp 
2045  */
2046 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2047 {
2048         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2049         struct control_gratious_arp *arp;
2050
2051         /* verify the size of indata */
2052         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2053                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2054                                  (unsigned)indata.dsize, 
2055                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2056                 return -1;
2057         }
2058         if (indata.dsize != 
2059                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2060                 + gratious_arp->len ) ){
2061
2062                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2063                         "but should be %u bytes\n", 
2064                          (unsigned)indata.dsize, 
2065                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2066                 return -1;
2067         }
2068
2069
2070         arp = talloc(ctdb, struct control_gratious_arp);
2071         CTDB_NO_MEMORY(ctdb, arp);
2072
2073         arp->ctdb  = ctdb;
2074         arp->addr   = gratious_arp->addr;
2075         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2076         CTDB_NO_MEMORY(ctdb, arp->iface);
2077         arp->count = 0;
2078         
2079         event_add_timed(arp->ctdb->ev, arp, 
2080                         timeval_zero(), send_gratious_arp, arp);
2081
2082         return 0;
2083 }
2084
2085 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2086 {
2087         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2088         int ret;
2089
2090         /* verify the size of indata */
2091         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2092                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2093                 return -1;
2094         }
2095         if (indata.dsize != 
2096                 ( offsetof(struct ctdb_control_ip_iface, iface)
2097                 + pub->len ) ){
2098
2099                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2100                         "but should be %u bytes\n", 
2101                          (unsigned)indata.dsize, 
2102                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2103                 return -1;
2104         }
2105
2106         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2107
2108         if (ret != 0) {
2109                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2110                 return -1;
2111         }
2112
2113         return 0;
2114 }
2115
/*
  completion callback for the releaseip event started by
  del_public_address: private_data is the temporary talloc context of
  that request and is released here. ctdb and status are not looked at.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status,
                                void *private_data)
{
        void *request_ctx = private_data;

        talloc_free(request_ctx);
}
2124
2125 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2126 {
2127         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2128         struct ctdb_vnn *vnn;
2129         int ret;
2130
2131         /* verify the size of indata */
2132         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2133                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2134                 return -1;
2135         }
2136         if (indata.dsize != 
2137                 ( offsetof(struct ctdb_control_ip_iface, iface)
2138                 + pub->len ) ){
2139
2140                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2141                         "but should be %u bytes\n", 
2142                          (unsigned)indata.dsize, 
2143                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2144                 return -1;
2145         }
2146
2147         /* walk over all public addresses until we find a match */
2148         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2149                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2150                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2151
2152                         DLIST_REMOVE(ctdb->vnn, vnn);
2153
2154                         ret = ctdb_event_script_callback(ctdb, 
2155                                          mem_ctx, delete_ip_callback, mem_ctx,
2156                                          false,
2157                                          CTDB_EVENT_RELEASE_IP,
2158                                          "%s %s %u",
2159                                          vnn->iface, 
2160                                          talloc_strdup(mem_ctx, ctdb_addr_to_str(&vnn->public_address)),
2161                                          vnn->public_netmask_bits);
2162                         if (vnn->killtcp) {
2163                                 vnn->killtcp->vnn = NULL;
2164                         }
2165                         talloc_free(vnn);
2166                         if (ret != 0) {
2167                                 return -1;
2168                         }
2169                         return 0;
2170                 }
2171         }
2172
2173         return -1;
2174 }
2175
2176 /* This function is called from the recovery daemon to verify that a remote
2177    node has the expected ip allocation.
2178    This is verified against ctdb->ip_tree
2179 */
2180 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
2181 {
2182         struct ctdb_public_ip_list *tmp_ip; 
2183         int i;
2184
2185         if (ctdb->ip_tree == NULL) {
2186                 /* dont know the expected allocation yet, assume remote node
2187                    is correct. */
2188                 return 0;
2189         }
2190
2191         if (ips == NULL) {
2192                 return 0;
2193         }
2194
2195         for (i=0; i<ips->num; i++) {
2196                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
2197                 if (tmp_ip == NULL) {
2198                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
2199                         return -1;
2200                 }
2201
2202                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
2203                         continue;
2204                 }
2205
2206                 if (tmp_ip->pnn != ips->ips[i].pnn) {
2207                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
2208                         return -1;
2209                 }
2210         }
2211
2212         return 0;
2213 }
2214
2215 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
2216 {
2217         struct ctdb_public_ip_list *tmp_ip; 
2218
2219         if (ctdb->ip_tree == NULL) {
2220                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
2221                 return -1;
2222         }
2223
2224         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
2225         if (tmp_ip == NULL) {
2226                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
2227                 return -1;
2228         }
2229
2230         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
2231         tmp_ip->pnn = ip->pnn;
2232
2233         return 0;
2234 }