server: implement CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE behavior
[sahlberg/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout value built from the takeover_timeout tunable; expands to an
   expression that needs a variable named 'ctdb' in the caller's scope. */
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
32
/* CTDB_ARP_INTERVAL is the first argument of the timeval_current_ofs() call
   that re-arms ctdb_control_send_arp (presumably seconds - confirm).
   CTDB_ARP_REPEAT is the run count at which ctdb_control_send_arp stops
   re-arming and frees its state. */
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT   3
35
/*
  a network interface that public addresses can be hosted on;
  entries are kept on the ctdb->ifaces list
 */
36 struct ctdb_iface {
37         struct ctdb_iface *prev, *next; /* linkage for the ctdb->ifaces list */
38         const char *name; /* interface name, used as the lookup key */
39         bool link_up; /* set to true on creation; interfaces with this false are never picked */
40         uint32_t references; /* count of vnns currently assigned to this interface */
41 };
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         i->link_up = true;
69
70         DLIST_ADD(ctdb->ifaces, i);
71
72         return 0;
73 }
74
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
76                                           const char *iface)
77 {
78         struct ctdb_iface *i;
79
80         /* Verify that we dont have an entry for this ip yet */
81         for (i=ctdb->ifaces;i;i=i->next) {
82                 if (strcmp(i->name, iface) == 0) {
83                         return i;
84                 }
85         }
86
87         return NULL;
88 }
89
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
91                                               struct ctdb_vnn *vnn)
92 {
93         int i;
94         struct ctdb_iface *cur = NULL;
95         struct ctdb_iface *best = NULL;
96
97         for (i=0; vnn->ifaces[i]; i++) {
98
99                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
100                 if (cur == NULL) {
101                         continue;
102                 }
103
104                 if (!cur->link_up) {
105                         continue;
106                 }
107
108                 if (best == NULL) {
109                         best = cur;
110                         continue;
111                 }
112
113                 if (cur->references < best->references) {
114                         best = cur;
115                         continue;
116                 }
117         }
118
119         return best;
120 }
121
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123                                      struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *best = NULL;
126
127         if (vnn->iface) {
128                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129                                    "still assigned to iface '%s'\n",
130                                    ctdb_addr_to_str(&vnn->public_address),
131                                    ctdb_vnn_iface_string(vnn)));
132                 return 0;
133         }
134
135         best = ctdb_vnn_best_iface(ctdb, vnn);
136         if (best == NULL) {
137                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138                                   "cannot assign to iface any iface\n",
139                                   ctdb_addr_to_str(&vnn->public_address)));
140                 return -1;
141         }
142
143         vnn->iface = best;
144         best->references++;
145         vnn->pnn = ctdb->pnn;
146
147         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148                            "now assigned to iface '%s' refs[%d]\n",
149                            ctdb_addr_to_str(&vnn->public_address),
150                            ctdb_vnn_iface_string(vnn),
151                            best->references));
152         return 0;
153 }
154
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156                                     struct ctdb_vnn *vnn)
157 {
158         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159                            "now unassigned (old iface '%s' refs[%d])\n",
160                            ctdb_addr_to_str(&vnn->public_address),
161                            ctdb_vnn_iface_string(vnn),
162                            vnn->iface?vnn->iface->references:0));
163         if (vnn->iface) {
164                 vnn->iface->references--;
165         }
166         vnn->iface = NULL;
167         if (vnn->pnn == ctdb->pnn) {
168                 vnn->pnn = -1;
169         }
170 }
171
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173                                struct ctdb_vnn *vnn)
174 {
175         int i;
176
177         if (vnn->iface && vnn->iface->link_up) {
178                 return true;
179         }
180
181         for (i=0; vnn->ifaces[i]; i++) {
182                 struct ctdb_iface *cur;
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (cur->link_up) {
190                         return true;
191                 }
192         }
193
194         return false;
195 }
196
/*
  state carried by the repeating ctdb_control_send_arp timed event
 */
197 struct ctdb_takeover_arp {
198         struct ctdb_context *ctdb; /* daemon context, used to re-arm the timed event */
199         uint32_t count; /* how many times ctdb_control_send_arp has run for this state */
200         ctdb_sock_addr addr; /* address passed to ctdb_sys_send_arp */
201         struct ctdb_tcp_array *tcparray; /* connections to send tickle acks for, may be NULL */
202         struct ctdb_vnn *vnn; /* supplies the interface name and the takeover_ctx */
203 };
204
205
206 /*
207   lists of tcp endpoints
208  */
209 struct ctdb_tcp_list {
210         struct ctdb_tcp_list *prev, *next; /* doubly linked list linkage */
211         struct ctdb_tcp_connection connection; /* the tcp endpoint pair stored in this entry */
212 };
213
214 /*
215   list of clients to kill on IP release
216  */
217 struct ctdb_client_ip {
218         struct ctdb_client_ip *prev, *next; /* linkage; release_kill_clients walks ctdb->client_ip_list */
219         struct ctdb_context *ctdb; /* owning daemon context */
220         ctdb_sock_addr addr; /* address compared against the one being released */
221         uint32_t client_id; /* id handed to ctdb_reqid_find to locate the client */
222 };
223
224
225 /*
226   send a gratuitous arp
227  */
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
229                                   struct timeval t, void *private_data)
230 {
231         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
232                                                         struct ctdb_takeover_arp);
233         int i, ret;
234         struct ctdb_tcp_array *tcparray;
235         const char *iface = ctdb_vnn_iface_string(arp->vnn);
236
237         ret = ctdb_sys_send_arp(&arp->addr, iface);
238         if (ret != 0) {
239                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240                                   iface, strerror(errno)));
241         }
242
243         tcparray = arp->tcparray;
244         if (tcparray) {
245                 for (i=0;i<tcparray->num;i++) {
246                         struct ctdb_tcp_connection *tcon;
247
248                         tcon = &tcparray->connections[i];
249                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
251                                 ctdb_addr_to_str(&tcon->src_addr),
252                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253                         ret = ctdb_sys_send_tcp(
254                                 &tcon->src_addr, 
255                                 &tcon->dst_addr,
256                                 0, 0, 0);
257                         if (ret != 0) {
258                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259                                         ctdb_addr_to_str(&tcon->src_addr)));
260                         }
261                 }
262         }
263
264         arp->count++;
265
266         if (arp->count == CTDB_ARP_REPEAT) {
267                 talloc_free(arp);
268                 return;
269         }
270
271         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
272                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
273                         ctdb_control_send_arp, arp);
274 }
275
/*
  state handed to takeover_ip_callback / release_ip_callback while the
  event script runs
 */
276 struct takeover_callback_state {
277         struct ctdb_req_control *c; /* control request that is replied to from the callback */
278         ctdb_sock_addr *addr; /* copy of the public address being taken or released */
279         struct ctdb_vnn *vnn; /* vnn entry for that address */
280 };
281
282 /*
283   called when takeip event finishes
284  */
285 static void takeover_ip_callback(struct ctdb_context *ctdb, int status, 
286                                  void *private_data)
287 {
288         struct takeover_callback_state *state = 
289                 talloc_get_type(private_data, struct takeover_callback_state);
290         struct ctdb_takeover_arp *arp;
291         struct ctdb_tcp_array *tcparray;
292
293         if (status != 0) {
294                 if (status == -ETIME) {
295                         ctdb_ban_self(ctdb);
296                 }
297                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
298                         ctdb_addr_to_str(state->addr),
299                         ctdb_vnn_iface_string(state->vnn)));
300                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
301                 talloc_free(state);
302                 return;
303         }
304
305         if (!state->vnn->takeover_ctx) {
306                 state->vnn->takeover_ctx = talloc_new(state->vnn);
307                 if (!state->vnn->takeover_ctx) {
308                         goto failed;
309                 }
310         }
311
312         arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
313         if (!arp) goto failed;
314         
315         arp->ctdb = ctdb;
316         arp->addr = *state->addr;
317         arp->vnn  = state->vnn;
318
319         tcparray = state->vnn->tcp_array;
320         if (tcparray) {
321                 /* add all of the known tcp connections for this IP to the
322                    list of tcp connections to send tickle acks for */
323                 arp->tcparray = talloc_steal(arp, tcparray);
324
325                 state->vnn->tcp_array = NULL;
326                 state->vnn->tcp_update_needed = true;
327         }
328
329         event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx, 
330                         timeval_zero(), ctdb_control_send_arp, arp);
331
332         /* the control succeeded */
333         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
334         talloc_free(state);
335         return;
336
337 failed:
338         ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
339         talloc_free(state);
340         return;
341 }
342
343 /*
344   Find the vnn of the node that has a public ip address
345   returns -1 if the address is not known as a public address
346  */
347 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
348 {
349         struct ctdb_vnn *vnn;
350
351         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
352                 if (ctdb_same_ip(&vnn->public_address, addr)) {
353                         return vnn;
354                 }
355         }
356
357         return NULL;
358 }
359
360 /*
361   take over an ip address
362  */
363 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
364                                  struct ctdb_req_control *c,
365                                  TDB_DATA indata, 
366                                  bool *async_reply)
367 {
368         int ret;
369         struct takeover_callback_state *state;
370         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
371         struct ctdb_vnn *vnn;
372
373         /* update out vnn list */
374         vnn = find_public_ip_vnn(ctdb, &pip->addr);
375         if (vnn == NULL) {
376                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n", 
377                         ctdb_addr_to_str(&pip->addr)));
378                 return 0;
379         }
380         vnn->pnn = pip->pnn;
381
382         /* if our kernel already has this IP, do nothing */
383         if (ctdb_sys_have_ip(&pip->addr)) {
384                 return 0;
385         }
386
387         ret = ctdb_vnn_assign_iface(ctdb, vnn);
388         if (ret != 0) {
389                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
390                                  "assin a usable interface\n",
391                                  ctdb_addr_to_str(&pip->addr),
392                                  vnn->public_netmask_bits));
393                 return -1;
394         }
395
396         state = talloc(vnn, struct takeover_callback_state);
397         CTDB_NO_MEMORY(ctdb, state);
398
399         state->c = talloc_steal(ctdb, c);
400         state->addr = talloc(ctdb, ctdb_sock_addr);
401         CTDB_NO_MEMORY(ctdb, state->addr);
402
403         *state->addr = pip->addr;
404         state->vnn   = vnn;
405
406         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n", 
407                 ctdb_addr_to_str(&pip->addr),
408                 vnn->public_netmask_bits, 
409                 ctdb_vnn_iface_string(vnn)));
410
411         ret = ctdb_event_script_callback(ctdb, 
412                                          state, takeover_ip_callback, state,
413                                          false,
414                                          CTDB_EVENT_TAKE_IP,
415                                          "%s %s %u",
416                                          ctdb_vnn_iface_string(vnn),
417                                          ctdb_addr_to_str(&pip->addr),
418                                          vnn->public_netmask_bits);
419
420         if (ret != 0) {
421                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
422                         ctdb_addr_to_str(&pip->addr),
423                         ctdb_vnn_iface_string(vnn)));
424                 talloc_free(state);
425                 return -1;
426         }
427
428         /* tell ctdb_control.c that we will be replying asynchronously */
429         *async_reply = true;
430
431         return 0;
432 }
433
434 /*
435   takeover an ip address old v4 style
436  */
437 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
438                                 struct ctdb_req_control *c,
439                                 TDB_DATA indata, 
440                                 bool *async_reply)
441 {
442         TDB_DATA data;
443         
444         data.dsize = sizeof(struct ctdb_public_ip);
445         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
446         CTDB_NO_MEMORY(ctdb, data.dptr);
447         
448         memcpy(data.dptr, indata.dptr, indata.dsize);
449         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
450 }
451
452 /*
453   kill any clients that are registered with a IP that is being released
454  */
455 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
456 {
457         struct ctdb_client_ip *ip;
458
459         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
460                 ctdb_addr_to_str(addr)));
461
462         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
463                 ctdb_sock_addr tmp_addr;
464
465                 tmp_addr = ip->addr;
466                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
467                         ip->client_id,
468                         ctdb_addr_to_str(&ip->addr)));
469
470                 if (ctdb_same_ip(&tmp_addr, addr)) {
471                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
472                                                                      ip->client_id, 
473                                                                      struct ctdb_client);
474                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
475                                 ip->client_id,
476                                 ctdb_addr_to_str(&ip->addr),
477                                 client->pid));
478
479                         if (client->pid != 0) {
480                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
481                                         (unsigned)client->pid,
482                                         ctdb_addr_to_str(addr),
483                                         ip->client_id));
484                                 kill(client->pid, SIGKILL);
485                         }
486                 }
487         }
488 }
489
490 /*
491   called when releaseip event finishes
492  */
493 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
494                                 void *private_data)
495 {
496         struct takeover_callback_state *state = 
497                 talloc_get_type(private_data, struct takeover_callback_state);
498         TDB_DATA data;
499
500         if (status == -ETIME) {
501                 ctdb_ban_self(ctdb);
502         }
503
504         /* send a message to all clients of this node telling them
505            that the cluster has been reconfigured and they should
506            release any sockets on this IP */
507         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
508         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
509         data.dsize = strlen((char *)data.dptr)+1;
510
511         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
512
513         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
514
515         /* kill clients that have registered with this IP */
516         release_kill_clients(ctdb, state->addr);
517
518         ctdb_vnn_unassign_iface(ctdb, state->vnn);
519
520         /* the control succeeded */
521         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
522         talloc_free(state);
523 }
524
525 /*
526   release an ip address
527  */
528 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
529                                 struct ctdb_req_control *c,
530                                 TDB_DATA indata, 
531                                 bool *async_reply)
532 {
533         int ret;
534         struct takeover_callback_state *state;
535         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
536         struct ctdb_vnn *vnn;
537
538         /* update our vnn list */
539         vnn = find_public_ip_vnn(ctdb, &pip->addr);
540         if (vnn == NULL) {
541                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
542                         ctdb_addr_to_str(&pip->addr)));
543                 return 0;
544         }
545         vnn->pnn = pip->pnn;
546
547         /* stop any previous arps */
548         talloc_free(vnn->takeover_ctx);
549         vnn->takeover_ctx = NULL;
550
551         if (!ctdb_sys_have_ip(&pip->addr)) {
552                 DEBUG(DEBUG_NOTICE,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
553                         ctdb_addr_to_str(&pip->addr),
554                         vnn->public_netmask_bits, 
555                         ctdb_vnn_iface_string(vnn)));
556                 ctdb_vnn_unassign_iface(ctdb, vnn);
557                 return 0;
558         }
559
560         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%u\n", 
561                 ctdb_addr_to_str(&pip->addr),
562                 vnn->public_netmask_bits, 
563                 ctdb_vnn_iface_string(vnn),
564                 pip->pnn));
565
566         state = talloc(ctdb, struct takeover_callback_state);
567         CTDB_NO_MEMORY(ctdb, state);
568
569         state->c = talloc_steal(state, c);
570         state->addr = talloc(state, ctdb_sock_addr);       
571         CTDB_NO_MEMORY(ctdb, state->addr);
572         *state->addr = pip->addr;
573         state->vnn   = vnn;
574
575         ret = ctdb_event_script_callback(ctdb, 
576                                          state, release_ip_callback, state,
577                                          false,
578                                          CTDB_EVENT_RELEASE_IP,
579                                          "%s %s %u",
580                                          ctdb_vnn_iface_string(vnn),
581                                          ctdb_addr_to_str(&pip->addr),
582                                          vnn->public_netmask_bits);
583         if (ret != 0) {
584                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
585                         ctdb_addr_to_str(&pip->addr),
586                         ctdb_vnn_iface_string(vnn)));
587                 talloc_free(state);
588                 return -1;
589         }
590
591         /* tell the control that we will be reply asynchronously */
592         *async_reply = true;
593         return 0;
594 }
595
596 /*
597   release an ip address old v4 style
598  */
599 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
600                                 struct ctdb_req_control *c,
601                                 TDB_DATA indata, 
602                                 bool *async_reply)
603 {
604         TDB_DATA data;
605         
606         data.dsize = sizeof(struct ctdb_public_ip);
607         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
608         CTDB_NO_MEMORY(ctdb, data.dptr);
609         
610         memcpy(data.dptr, indata.dptr, indata.dsize);
611         return ctdb_control_release_ip(ctdb, c, data, async_reply);
612 }
613
614
615 static int ctdb_add_public_address(struct ctdb_context *ctdb,
616                                    ctdb_sock_addr *addr,
617                                    unsigned mask, const char *ifaces)
618 {
619         struct ctdb_vnn      *vnn;
620         uint32_t num = 0;
621         char *tmp;
622         const char *iface;
623         int i;
624         int ret;
625
626         /* Verify that we dont have an entry for this ip yet */
627         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
628                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
629                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
630                                 ctdb_addr_to_str(addr)));
631                         return -1;
632                 }               
633         }
634
635         /* create a new vnn structure for this ip address */
636         vnn = talloc_zero(ctdb, struct ctdb_vnn);
637         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
638         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
639         tmp = talloc_strdup(vnn, ifaces);
640         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
641         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
642                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
643                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
644                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
645                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
646                 num++;
647         }
648         talloc_free(tmp);
649         vnn->ifaces[num] = NULL;
650         vnn->public_address      = *addr;
651         vnn->public_netmask_bits = mask;
652         vnn->pnn                 = -1;
653
654         for (i=0; vnn->ifaces[i]; i++) {
655                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
656                 if (ret != 0) {
657                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
658                                            "for public_address[%s]\n",
659                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
660                         talloc_free(vnn);
661                         return -1;
662                 }
663         }
664
665         DLIST_ADD(ctdb->vnn, vnn);
666
667         return 0;
668 }
669
670 /*
671   setup the event script directory
672 */
673 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
674 {
675         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
676         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
677         return 0;
678 }
679
680 /*
681   setup the public address lists from a file
682 */
683 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
684 {
685         char **lines;
686         int nlines;
687         int i;
688
689         lines = file_lines_load(alist, &nlines, ctdb);
690         if (lines == NULL) {
691                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
692                 return -1;
693         }
694         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
695                 nlines--;
696         }
697
698         for (i=0;i<nlines;i++) {
699                 unsigned mask;
700                 ctdb_sock_addr addr;
701                 const char *addrstr;
702                 const char *ifaces;
703                 char *tok, *line;
704
705                 line = lines[i];
706                 while ((*line == ' ') || (*line == '\t')) {
707                         line++;
708                 }
709                 if (*line == '#') {
710                         continue;
711                 }
712                 if (strcmp(line, "") == 0) {
713                         continue;
714                 }
715                 tok = strtok(line, " \t");
716                 addrstr = tok;
717                 tok = strtok(NULL, " \t");
718                 if (tok == NULL) {
719                         if (NULL == ctdb->default_public_interface) {
720                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
721                                          i+1));
722                                 talloc_free(lines);
723                                 return -1;
724                         }
725                         ifaces = ctdb->default_public_interface;
726                 } else {
727                         ifaces = tok;
728                 }
729
730                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
731                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
732                         talloc_free(lines);
733                         return -1;
734                 }
735                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
736                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
737                         talloc_free(lines);
738                         return -1;
739                 }
740         }
741
742         talloc_free(lines);
743         return 0;
744 }
745
746 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
747                               const char *iface,
748                               const char *ip)
749 {
750         struct ctdb_vnn *svnn;
751         bool ok;
752         int ret;
753
754         svnn = talloc_zero(ctdb, struct ctdb_vnn);
755         CTDB_NO_MEMORY(ctdb, svnn);
756
757         svnn->ifaces = talloc_array(svnn, const char *, 2);
758         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
759         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
760         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
761         svnn->ifaces[1] = NULL;
762
763         ok = parse_ip(ip, iface, 0, &svnn->public_address);
764         if (!ok) {
765                 talloc_free(svnn);
766                 return -1;
767         }
768
769         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
770         if (ret != 0) {
771                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
772                                    "for single_ip[%s]\n",
773                                    svnn->ifaces[0],
774                                    ctdb_addr_to_str(&svnn->public_address)));
775                 talloc_free(svnn);
776                 return -1;
777         }
778
779         ret = ctdb_vnn_assign_iface(ctdb, svnn);
780         if (ret != 0) {
781                 talloc_free(svnn);
782                 return -1;
783         }
784
785         ctdb->single_ip_vnn = svnn;
786         return 0;
787 }
788
/*
  singly linked list of public addresses and the node each one is
  assigned to
 */
789 struct ctdb_public_ip_list {
790         struct ctdb_public_ip_list *next; /* next entry, NULL at the end of the list */
791         uint32_t pnn; /* node this address is assigned to; written by find_takeover_node */
792         ctdb_sock_addr addr; /* the public address */
793 };
794
795
796 /* Given a physical node, return the number of
797    public addresses that is currently assigned to this node.
798 */
799 static int node_ip_coverage(struct ctdb_context *ctdb, 
800         int32_t pnn,
801         struct ctdb_public_ip_list *ips)
802 {
803         int num=0;
804
805         for (;ips;ips=ips->next) {
806                 if (ips->pnn == pnn) {
807                         num++;
808                 }
809         }
810         return num;
811 }
812
813
814 /* Check if this is a public ip known to the node, i.e. can that
815    node takeover this ip ?
816 */
817 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
818                 struct ctdb_public_ip_list *ip)
819 {
820         struct ctdb_all_public_ips *public_ips;
821         int i;
822
823         public_ips = ctdb->nodes[pnn]->public_ips;
824
825         if (public_ips == NULL) {
826                 return -1;
827         }
828
829         for (i=0;i<public_ips->num;i++) {
830                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
831                         /* yes, this node can serve this public ip */
832                         return 0;
833                 }
834         }
835
836         return -1;
837 }
838
839
840 /* search the node lists list for a node to takeover this ip.
841    pick the node that currently are serving the least number of ips
842    so that the ips get spread out evenly.
843 */
844 static int find_takeover_node(struct ctdb_context *ctdb, 
845                 struct ctdb_node_map *nodemap, uint32_t mask, 
846                 struct ctdb_public_ip_list *ip,
847                 struct ctdb_public_ip_list *all_ips)
848 {
849         int pnn, min=0, num;
850         int i;
851
852         pnn    = -1;
853         for (i=0;i<nodemap->num;i++) {
854                 if (nodemap->nodes[i].flags & mask) {
855                         /* This node is not healty and can not be used to serve
856                            a public address 
857                         */
858                         continue;
859                 }
860
861                 /* verify that this node can serve this ip */
862                 if (can_node_serve_ip(ctdb, i, ip)) {
863                         /* no it couldnt   so skip to the next node */
864                         continue;
865                 }
866
867                 num = node_ip_coverage(ctdb, i, all_ips);
868                 /* was this the first node we checked ? */
869                 if (pnn == -1) {
870                         pnn = i;
871                         min  = num;
872                 } else {
873                         if (num < min) {
874                                 pnn = i;
875                                 min  = num;
876                         }
877                 }
878         }       
879         if (pnn == -1) {
880                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
881                         ctdb_addr_to_str(&ip->addr)));
882
883                 return -1;
884         }
885
886         ip->pnn = pnn;
887         return 0;
888 }
889
890 #define IP_KEYLEN       4
891 static uint32_t *ip_key(ctdb_sock_addr *ip)
892 {
893         static uint32_t key[IP_KEYLEN];
894
895         bzero(key, sizeof(key));
896
897         switch (ip->sa.sa_family) {
898         case AF_INET:
899                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
900                 break;
901         case AF_INET6:
902                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
903                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
904                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
905                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
906                 break;
907         default:
908                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
909                 return key;
910         }
911
912         return key;
913 }
914
/*
  Callback handed to trbt_insertarray32_callback() when the merged
  public address list is built.

  Returns 'parm' unchanged and ignores 'data'.
  NOTE(review): presumably the tree stores the returned pointer for the
  key, with 'data' being the entry already stored there, if any -
  confirm against the rb_tree implementation.
 */
static void *add_ip_callback(void *parm, void *data)
{
	(void)data;

	return parm;
}
919
920 void getips_count_callback(void *param, void *data)
921 {
922         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
923         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
924
925         new_ip->next = *ip_list;
926         *ip_list     = new_ip;
927 }
928
929 struct ctdb_public_ip_list *
930 create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
931 {
932         int i, j;
933         struct ctdb_public_ip_list *ip_list;
934         struct ctdb_all_public_ips *public_ips;
935         trbt_tree_t *ip_tree;
936
937         ip_tree = trbt_create(tmp_ctx, 0);
938
939         for (i=0;i<ctdb->num_nodes;i++) {
940                 public_ips = ctdb->nodes[i]->public_ips;
941
942                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
943                         continue;
944                 }
945
946                 /* there were no public ips for this node */
947                 if (public_ips == NULL) {
948                         continue;
949                 }               
950
951                 for (j=0;j<public_ips->num;j++) {
952                         struct ctdb_public_ip_list *tmp_ip; 
953
954                         tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
955                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
956                         tmp_ip->pnn  = public_ips->ips[j].pnn;
957                         tmp_ip->addr = public_ips->ips[j].addr;
958                         tmp_ip->next = NULL;
959
960                         trbt_insertarray32_callback(ip_tree,
961                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
962                                 add_ip_callback,
963                                 tmp_ip);
964                 }
965         }
966
967         ip_list = NULL;
968         trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
969
970         return ip_list;
971 }
972
973 /*
974   make any IP alias changes for public addresses that are necessary 
975  */
976 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
977 {
978         int i, num_healthy, retries;
979         struct ctdb_public_ip ip;
980         struct ctdb_public_ipv4 ipv4;
981         uint32_t mask;
982         struct ctdb_public_ip_list *all_ips, *tmp_ip;
983         int maxnode, maxnum=0, minnode, minnum=0, num;
984         TDB_DATA data;
985         struct timeval timeout;
986         struct client_async_data *async_data;
987         struct ctdb_client_control_state *state;
988         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
989
990
991         ZERO_STRUCT(ip);
992
993         /* Count how many completely healthy nodes we have */
994         num_healthy = 0;
995         for (i=0;i<nodemap->num;i++) {
996                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
997                         num_healthy++;
998                 }
999         }
1000
1001         if (num_healthy > 0) {
1002                 /* We have healthy nodes, so only consider them for 
1003                    serving public addresses
1004                 */
1005                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1006         } else {
1007                 /* We didnt have any completely healthy nodes so
1008                    use "disabled" nodes as a fallback
1009                 */
1010                 mask = NODE_FLAGS_INACTIVE;
1011         }
1012
1013         /* since nodes only know about those public addresses that
1014            can be served by that particular node, no single node has
1015            a full list of all public addresses that exist in the cluster.
1016            Walk over all node structures and create a merged list of
1017            all public addresses that exist in the cluster.
1018         */
1019         all_ips = create_merged_ip_list(ctdb, tmp_ctx);
1020
1021         /* If we want deterministic ip allocations, i.e. that the ip addresses
1022            will always be allocated the same way for a specific set of
1023            available/unavailable nodes.
1024         */
1025         if (1 == ctdb->tunable.deterministic_public_ips) {              
1026                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1027                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1028                         tmp_ip->pnn = i%nodemap->num;
1029                 }
1030         }
1031
1032
1033         /* mark all public addresses with a masked node as being served by
1034            node -1
1035         */
1036         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1037                 if (tmp_ip->pnn == -1) {
1038                         continue;
1039                 }
1040                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1041                         tmp_ip->pnn = -1;
1042                 }
1043         }
1044
1045         /* verify that the assigned nodes can serve that public ip
1046            and set it to -1 if not
1047         */
1048         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1049                 if (tmp_ip->pnn == -1) {
1050                         continue;
1051                 }
1052                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1053                         /* this node can not serve this ip. */
1054                         tmp_ip->pnn = -1;
1055                 }
1056         }
1057
1058
1059         /* now we must redistribute all public addresses with takeover node
1060            -1 among the nodes available
1061         */
1062         retries = 0;
1063 try_again:
1064         /* loop over all ip's and find a physical node to cover for 
1065            each unassigned ip.
1066         */
1067         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1068                 if (tmp_ip->pnn == -1) {
1069                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1070                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1071                                         ctdb_addr_to_str(&tmp_ip->addr)));
1072                         }
1073                 }
1074         }
1075
1076         /* If we dont want ips to fail back after a node becomes healthy
1077            again, we wont even try to reallocat the ip addresses so that
1078            they are evenly spread out.
1079            This can NOT be used at the same time as DeterministicIPs !
1080         */
1081         if (1 == ctdb->tunable.no_ip_failback) {
1082                 if (1 == ctdb->tunable.deterministic_public_ips) {
1083                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1084                 }
1085                 goto finished;
1086         }
1087
1088
1089         /* now, try to make sure the ip adresses are evenly distributed
1090            across the node.
1091            for each ip address, loop over all nodes that can serve this
1092            ip and make sure that the difference between the node
1093            serving the most and the node serving the least ip's are not greater
1094            than 1.
1095         */
1096         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1097                 if (tmp_ip->pnn == -1) {
1098                         continue;
1099                 }
1100
1101                 /* Get the highest and lowest number of ips's served by any 
1102                    valid node which can serve this ip.
1103                 */
1104                 maxnode = -1;
1105                 minnode = -1;
1106                 for (i=0;i<nodemap->num;i++) {
1107                         if (nodemap->nodes[i].flags & mask) {
1108                                 continue;
1109                         }
1110
1111                         /* only check nodes that can actually serve this ip */
1112                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1113                                 /* no it couldnt   so skip to the next node */
1114                                 continue;
1115                         }
1116
1117                         num = node_ip_coverage(ctdb, i, all_ips);
1118                         if (maxnode == -1) {
1119                                 maxnode = i;
1120                                 maxnum  = num;
1121                         } else {
1122                                 if (num > maxnum) {
1123                                         maxnode = i;
1124                                         maxnum  = num;
1125                                 }
1126                         }
1127                         if (minnode == -1) {
1128                                 minnode = i;
1129                                 minnum  = num;
1130                         } else {
1131                                 if (num < minnum) {
1132                                         minnode = i;
1133                                         minnum  = num;
1134                                 }
1135                         }
1136                 }
1137                 if (maxnode == -1) {
1138                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1139                                 ctdb_addr_to_str(&tmp_ip->addr)));
1140
1141                         continue;
1142                 }
1143
1144                 /* If we want deterministic IPs then dont try to reallocate 
1145                    them to spread out the load.
1146                 */
1147                 if (1 == ctdb->tunable.deterministic_public_ips) {
1148                         continue;
1149                 }
1150
1151                 /* if the spread between the smallest and largest coverage by
1152                    a node is >=2 we steal one of the ips from the node with
1153                    most coverage to even things out a bit.
1154                    try to do this at most 5 times  since we dont want to spend
1155                    too much time balancing the ip coverage.
1156                 */
1157                 if ( (maxnum > minnum+1)
1158                   && (retries < 5) ){
1159                         struct ctdb_public_ip_list *tmp;
1160
1161                         /* mark one of maxnode's vnn's as unassigned and try
1162                            again
1163                         */
1164                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1165                                 if (tmp->pnn == maxnode) {
1166                                         tmp->pnn = -1;
1167                                         retries++;
1168                                         goto try_again;
1169                                 }
1170                         }
1171                 }
1172         }
1173
1174
1175         /* finished distributing the public addresses, now just send the 
1176            info out to the nodes
1177         */
1178 finished:
1179
1180         /* at this point ->pnn is the node which will own each IP
1181            or -1 if there is no node that can cover this ip
1182         */
1183
1184         /* now tell all nodes to delete any alias that they should not
1185            have.  This will be a NOOP on nodes that don't currently
1186            hold the given alias */
1187         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1188         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1189
1190         for (i=0;i<nodemap->num;i++) {
1191                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1192                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1193                         continue;
1194                 }
1195
1196                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1197                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1198                                 /* This node should be serving this
1199                                    vnn so dont tell it to release the ip
1200                                 */
1201                                 continue;
1202                         }
1203                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1204                                 ipv4.pnn = tmp_ip->pnn;
1205                                 ipv4.sin = tmp_ip->addr.ip;
1206
1207                                 timeout = TAKEOVER_TIMEOUT();
1208                                 data.dsize = sizeof(ipv4);
1209                                 data.dptr  = (uint8_t *)&ipv4;
1210                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1211                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1212                                                 data, async_data,
1213                                                 &timeout, NULL);
1214                         } else {
1215                                 ip.pnn  = tmp_ip->pnn;
1216                                 ip.addr = tmp_ip->addr;
1217
1218                                 timeout = TAKEOVER_TIMEOUT();
1219                                 data.dsize = sizeof(ip);
1220                                 data.dptr  = (uint8_t *)&ip;
1221                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1222                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1223                                                 data, async_data,
1224                                                 &timeout, NULL);
1225                         }
1226
1227                         if (state == NULL) {
1228                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1229                                 talloc_free(tmp_ctx);
1230                                 return -1;
1231                         }
1232                 
1233                         ctdb_client_async_add(async_data, state);
1234                 }
1235         }
1236         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1237                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1238                 talloc_free(tmp_ctx);
1239                 return -1;
1240         }
1241         talloc_free(async_data);
1242
1243
1244         /* tell all nodes to get their own IPs */
1245         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1246         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1247         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1248                 if (tmp_ip->pnn == -1) {
1249                         /* this IP won't be taken over */
1250                         continue;
1251                 }
1252
1253                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1254                         ipv4.pnn = tmp_ip->pnn;
1255                         ipv4.sin = tmp_ip->addr.ip;
1256
1257                         timeout = TAKEOVER_TIMEOUT();
1258                         data.dsize = sizeof(ipv4);
1259                         data.dptr  = (uint8_t *)&ipv4;
1260                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1261                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1262                                         data, async_data,
1263                                         &timeout, NULL);
1264                 } else {
1265                         ip.pnn  = tmp_ip->pnn;
1266                         ip.addr = tmp_ip->addr;
1267
1268                         timeout = TAKEOVER_TIMEOUT();
1269                         data.dsize = sizeof(ip);
1270                         data.dptr  = (uint8_t *)&ip;
1271                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1272                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1273                                         data, async_data,
1274                                         &timeout, NULL);
1275                 }
1276                 if (state == NULL) {
1277                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1278                         talloc_free(tmp_ctx);
1279                         return -1;
1280                 }
1281                 
1282                 ctdb_client_async_add(async_data, state);
1283         }
1284         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1285                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1286                 talloc_free(tmp_ctx);
1287                 return -1;
1288         }
1289
1290         talloc_free(tmp_ctx);
1291         return 0;
1292 }
1293
1294
1295 /*
1296   destroy a ctdb_client_ip structure
1297  */
1298 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1299 {
1300         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1301                 ctdb_addr_to_str(&ip->addr),
1302                 ntohs(ip->addr.ip.sin_port),
1303                 ip->client_id));
1304
1305         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1306         return 0;
1307 }
1308
1309 /*
1310   called by a client to inform us of a TCP connection that it is managing
1311   that should tickled with an ACK when IP takeover is done
1312   we handle both the old ipv4 style of packets as well as the new ipv4/6
1313   pdus.
1314  */
1315 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1316                                 TDB_DATA indata)
1317 {
1318         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1319         struct ctdb_control_tcp *old_addr = NULL;
1320         struct ctdb_control_tcp_addr new_addr;
1321         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1322         struct ctdb_tcp_list *tcp;
1323         struct ctdb_control_tcp_vnn t;
1324         int ret;
1325         TDB_DATA data;
1326         struct ctdb_client_ip *ip;
1327         struct ctdb_vnn *vnn;
1328         ctdb_sock_addr addr;
1329
1330         switch (indata.dsize) {
1331         case sizeof(struct ctdb_control_tcp):
1332                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1333                 ZERO_STRUCT(new_addr);
1334                 tcp_sock = &new_addr;
1335                 tcp_sock->src.ip  = old_addr->src;
1336                 tcp_sock->dest.ip = old_addr->dest;
1337                 break;
1338         case sizeof(struct ctdb_control_tcp_addr):
1339                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1340                 break;
1341         default:
1342                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1343                                  "to ctdb_control_tcp_client. size was %d but "
1344                                  "only allowed sizes are %lu and %lu\n",
1345                                  (int)indata.dsize,
1346                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1347                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1348                 return -1;
1349         }
1350
1351         addr = tcp_sock->src;
1352         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1353         addr = tcp_sock->dest;
1354         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1355
1356         ZERO_STRUCT(addr);
1357         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1358         vnn = find_public_ip_vnn(ctdb, &addr);
1359         if (vnn == NULL) {
1360                 switch (addr.sa.sa_family) {
1361                 case AF_INET:
1362                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1363                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1364                                         ctdb_addr_to_str(&addr)));
1365                         }
1366                         break;
1367                 case AF_INET6:
1368                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1369                                 ctdb_addr_to_str(&addr)));
1370                         break;
1371                 default:
1372                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1373                 }
1374
1375                 return 0;
1376         }
1377
1378         if (vnn->pnn != ctdb->pnn) {
1379                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1380                         ctdb_addr_to_str(&addr),
1381                         client_id, client->pid));
1382                 /* failing this call will tell smbd to die */
1383                 return -1;
1384         }
1385
1386         ip = talloc(client, struct ctdb_client_ip);
1387         CTDB_NO_MEMORY(ctdb, ip);
1388
1389         ip->ctdb      = ctdb;
1390         ip->addr      = addr;
1391         ip->client_id = client_id;
1392         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1393         DLIST_ADD(ctdb->client_ip_list, ip);
1394
1395         tcp = talloc(client, struct ctdb_tcp_list);
1396         CTDB_NO_MEMORY(ctdb, tcp);
1397
1398         tcp->connection.src_addr = tcp_sock->src;
1399         tcp->connection.dst_addr = tcp_sock->dest;
1400
1401         DLIST_ADD(client->tcp_list, tcp);
1402
1403         t.src  = tcp_sock->src;
1404         t.dest = tcp_sock->dest;
1405
1406         data.dptr = (uint8_t *)&t;
1407         data.dsize = sizeof(t);
1408
1409         switch (addr.sa.sa_family) {
1410         case AF_INET:
1411                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1412                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1413                         ctdb_addr_to_str(&tcp_sock->src),
1414                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1415                 break;
1416         case AF_INET6:
1417                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1418                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1419                         ctdb_addr_to_str(&tcp_sock->src),
1420                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1421                 break;
1422         default:
1423                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1424         }
1425
1426
1427         /* tell all nodes about this tcp connection */
1428         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1429                                        CTDB_CONTROL_TCP_ADD,
1430                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1431         if (ret != 0) {
1432                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1433                 return -1;
1434         }
1435
1436         return 0;
1437 }
1438
1439 /*
1440   find a tcp address on a list
1441  */
1442 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1443                                            struct ctdb_tcp_connection *tcp)
1444 {
1445         int i;
1446
1447         if (array == NULL) {
1448                 return NULL;
1449         }
1450
1451         for (i=0;i<array->num;i++) {
1452                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1453                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1454                         return &array->connections[i];
1455                 }
1456         }
1457         return NULL;
1458 }
1459
1460 /*
1461   called by a daemon to inform us of a TCP connection that one of its
1462   clients managing that should tickled with an ACK when IP takeover is
1463   done
1464  */
1465 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
1466 {
1467         struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
1468         struct ctdb_tcp_array *tcparray;
1469         struct ctdb_tcp_connection tcp;
1470         struct ctdb_vnn *vnn;
1471
1472         vnn = find_public_ip_vnn(ctdb, &p->dest);
1473         if (vnn == NULL) {
1474                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1475                         ctdb_addr_to_str(&p->dest)));
1476
1477                 return -1;
1478         }
1479
1480
1481         tcparray = vnn->tcp_array;
1482
1483         /* If this is the first tickle */
1484         if (tcparray == NULL) {
1485                 tcparray = talloc_size(ctdb->nodes, 
1486                         offsetof(struct ctdb_tcp_array, connections) +
1487                         sizeof(struct ctdb_tcp_connection) * 1);
1488                 CTDB_NO_MEMORY(ctdb, tcparray);
1489                 vnn->tcp_array = tcparray;
1490
1491                 tcparray->num = 0;
1492                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1493                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1494
1495                 tcparray->connections[tcparray->num].src_addr = p->src;
1496                 tcparray->connections[tcparray->num].dst_addr = p->dest;
1497                 tcparray->num++;
1498                 return 0;
1499         }
1500
1501
1502         /* Do we already have this tickle ?*/
1503         tcp.src_addr = p->src;
1504         tcp.dst_addr = p->dest;
1505         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1506                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1507                         ctdb_addr_to_str(&tcp.dst_addr),
1508                         ntohs(tcp.dst_addr.ip.sin_port),
1509                         vnn->pnn));
1510                 return 0;
1511         }
1512
1513         /* A new tickle, we must add it to the array */
1514         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1515                                         struct ctdb_tcp_connection,
1516                                         tcparray->num+1);
1517         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1518
1519         vnn->tcp_array = tcparray;
1520         tcparray->connections[tcparray->num].src_addr = p->src;
1521         tcparray->connections[tcparray->num].dst_addr = p->dest;
1522         tcparray->num++;
1523                                 
1524         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1525                 ctdb_addr_to_str(&tcp.dst_addr),
1526                 ntohs(tcp.dst_addr.ip.sin_port),
1527                 vnn->pnn));
1528
1529         return 0;
1530 }
1531
1532
1533 /*
1534   called by a daemon to inform us of a TCP connection that one of its
1535   clients managing that should tickled with an ACK when IP takeover is
1536   done
1537  */
1538 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1539 {
1540         struct ctdb_tcp_connection *tcpp;
1541         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1542
1543         if (vnn == NULL) {
1544                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1545                         ctdb_addr_to_str(&conn->dst_addr)));
1546                 return;
1547         }
1548
1549         /* if the array is empty we cant remove it
1550            and we dont need to do anything
1551          */
1552         if (vnn->tcp_array == NULL) {
1553                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1554                         ctdb_addr_to_str(&conn->dst_addr),
1555                         ntohs(conn->dst_addr.ip.sin_port)));
1556                 return;
1557         }
1558
1559
1560         /* See if we know this connection
1561            if we dont know this connection  then we dont need to do anything
1562          */
1563         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1564         if (tcpp == NULL) {
1565                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1566                         ctdb_addr_to_str(&conn->dst_addr),
1567                         ntohs(conn->dst_addr.ip.sin_port)));
1568                 return;
1569         }
1570
1571
1572         /* We need to remove this entry from the array.
1573            Instead of allocating a new array and copying data to it
1574            we cheat and just copy the last entry in the existing array
1575            to the entry that is to be removed and just shring the 
1576            ->num field
1577          */
1578         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1579         vnn->tcp_array->num--;
1580
1581         /* If we deleted the last entry we also need to remove the entire array
1582          */
1583         if (vnn->tcp_array->num == 0) {
1584                 talloc_free(vnn->tcp_array);
1585                 vnn->tcp_array = NULL;
1586         }               
1587
1588         vnn->tcp_update_needed = true;
1589
1590         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1591                 ctdb_addr_to_str(&conn->src_addr),
1592                 ntohs(conn->src_addr.ip.sin_port)));
1593 }
1594
1595
/*
  called when a daemon restarts - meant to send all tickles for all
  public addresses we are serving immediately to the new node.

  Not implemented yet: both arguments are ignored and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/* XXX here we should send all tickles we are serving to the new node */
	(void)ctdb;
	(void)vnn;

	return 0;
}
1605
1606
1607 /*
1608   called when a client structure goes away - hook to remove
1609   elements from the tcp_list in all daemons
1610  */
1611 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1612 {
1613         while (client->tcp_list) {
1614                 struct ctdb_tcp_list *tcp = client->tcp_list;
1615                 DLIST_REMOVE(client->tcp_list, tcp);
1616                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1617         }
1618 }
1619
1620
1621 /*
1622   release all IPs on shutdown
1623  */
1624 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1625 {
1626         struct ctdb_vnn *vnn;
1627
1628         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1629                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1630                         ctdb_vnn_unassign_iface(ctdb, vnn);
1631                         continue;
1632                 }
1633                 if (!vnn->iface) {
1634                         continue;
1635                 }
1636                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1637                                   ctdb_vnn_iface_string(vnn),
1638                                   ctdb_addr_to_str(&vnn->public_address),
1639                                   vnn->public_netmask_bits);
1640                 release_kill_clients(ctdb, &vnn->public_address);
1641                 ctdb_vnn_unassign_iface(ctdb, vnn);
1642         }
1643 }
1644
1645
1646 /*
1647   get list of public IPs
1648  */
1649 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1650                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1651 {
1652         int i, num, len;
1653         struct ctdb_all_public_ips *ips;
1654         struct ctdb_vnn *vnn;
1655         bool only_available = false;
1656
1657         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
1658                 only_available = true;
1659         }
1660
1661         /* count how many public ip structures we have */
1662         num = 0;
1663         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1664                 num++;
1665         }
1666
1667         len = offsetof(struct ctdb_all_public_ips, ips) + 
1668                 num*sizeof(struct ctdb_public_ip);
1669         ips = talloc_zero_size(outdata, len);
1670         CTDB_NO_MEMORY(ctdb, ips);
1671
1672         i = 0;
1673         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1674                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
1675                         continue;
1676                 }
1677                 ips->ips[i].pnn  = vnn->pnn;
1678                 ips->ips[i].addr = vnn->public_address;
1679                 i++;
1680         }
1681         ips->num = i;
1682         len = offsetof(struct ctdb_all_public_ips, ips) +
1683                 i*sizeof(struct ctdb_public_ip);
1684
1685         outdata->dsize = len;
1686         outdata->dptr  = (uint8_t *)ips;
1687
1688         return 0;
1689 }
1690
1691
1692 /*
1693   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
1694  */
1695 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
1696                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1697 {
1698         int i, num, len;
1699         struct ctdb_all_public_ipsv4 *ips;
1700         struct ctdb_vnn *vnn;
1701
1702         /* count how many public ip structures we have */
1703         num = 0;
1704         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1705                 if (vnn->public_address.sa.sa_family != AF_INET) {
1706                         continue;
1707                 }
1708                 num++;
1709         }
1710
1711         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
1712                 num*sizeof(struct ctdb_public_ipv4);
1713         ips = talloc_zero_size(outdata, len);
1714         CTDB_NO_MEMORY(ctdb, ips);
1715
1716         outdata->dsize = len;
1717         outdata->dptr  = (uint8_t *)ips;
1718
1719         ips->num = num;
1720         i = 0;
1721         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1722                 if (vnn->public_address.sa.sa_family != AF_INET) {
1723                         continue;
1724                 }
1725                 ips->ips[i].pnn = vnn->pnn;
1726                 ips->ips[i].sin = vnn->public_address.ip;
1727                 i++;
1728         }
1729
1730         return 0;
1731 }
1732
1733
1734 /* 
1735    structure containing the listening socket and the list of tcp connections
1736    that the ctdb daemon is to kill
1737 */
1738 struct ctdb_kill_tcp {
1739         struct ctdb_vnn *vnn;
1740         struct ctdb_context *ctdb;
1741         int capture_fd;
1742         struct fd_event *fde;
1743         trbt_tree_t *connections;
1744         void *private_data;
1745 };
1746
1747 /*
1748   a tcp connection that is to be killed
1749  */
1750 struct ctdb_killtcp_con {
1751         ctdb_sock_addr src_addr;
1752         ctdb_sock_addr dst_addr;
1753         int count;
1754         struct ctdb_kill_tcp *killtcp;
1755 };
1756
1757 /* this function is used to create a key to represent this socketpair
1758    in the killtcp tree.
1759    this key is used to insert and lookup matching socketpairs that are
1760    to be tickled and RST
1761 */
1762 #define KILLTCP_KEYLEN  10
1763 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
1764 {
1765         static uint32_t key[KILLTCP_KEYLEN];
1766
1767         bzero(key, sizeof(key));
1768
1769         if (src->sa.sa_family != dst->sa.sa_family) {
1770                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
1771                 return key;
1772         }
1773         
1774         switch (src->sa.sa_family) {
1775         case AF_INET:
1776                 key[0]  = dst->ip.sin_addr.s_addr;
1777                 key[1]  = src->ip.sin_addr.s_addr;
1778                 key[2]  = dst->ip.sin_port;
1779                 key[3]  = src->ip.sin_port;
1780                 break;
1781         case AF_INET6:
1782                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
1783                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
1784                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
1785                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
1786                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
1787                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
1788                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
1789                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
1790                 key[8]  = dst->ip6.sin6_port;
1791                 key[9]  = src->ip6.sin6_port;
1792                 break;
1793         default:
1794                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
1795                 return key;
1796         }
1797
1798         return key;
1799 }
1800
1801 /*
1802   called when we get a read event on the raw socket
1803  */
1804 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
1805                                 uint16_t flags, void *private_data)
1806 {
1807         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1808         struct ctdb_killtcp_con *con;
1809         ctdb_sock_addr src, dst;
1810         uint32_t ack_seq, seq;
1811
1812         if (!(flags & EVENT_FD_READ)) {
1813                 return;
1814         }
1815
1816         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1817                                 killtcp->private_data,
1818                                 &src, &dst,
1819                                 &ack_seq, &seq) != 0) {
1820                 /* probably a non-tcp ACK packet */
1821                 return;
1822         }
1823
1824         /* check if we have this guy in our list of connections
1825            to kill
1826         */
1827         con = trbt_lookuparray32(killtcp->connections, 
1828                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1829         if (con == NULL) {
1830                 /* no this was some other packet we can just ignore */
1831                 return;
1832         }
1833
1834         /* This one has been tickled !
1835            now reset him and remove him from the list.
1836          */
1837         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
1838                 ntohs(con->dst_addr.ip.sin_port),
1839                 ctdb_addr_to_str(&con->src_addr),
1840                 ntohs(con->src_addr.ip.sin_port)));
1841
1842         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
1843         talloc_free(con);
1844 }
1845
1846
1847 /* when traversing the list of all tcp connections to send tickle acks to
1848    (so that we can capture the ack coming back and kill the connection
1849     by a RST)
1850    this callback is called for each connection we are currently trying to kill
1851 */
1852 static void tickle_connection_traverse(void *param, void *data)
1853 {
1854         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1855
1856         /* have tried too many times, just give up */
1857         if (con->count >= 5) {
1858                 talloc_free(con);
1859                 return;
1860         }
1861
1862         /* othervise, try tickling it again */
1863         con->count++;
1864         ctdb_sys_send_tcp(
1865                 (ctdb_sock_addr *)&con->dst_addr,
1866                 (ctdb_sock_addr *)&con->src_addr,
1867                 0, 0, 0);
1868 }
1869
1870
1871 /* 
1872    called every second until all sentenced connections have been reset
1873  */
1874 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
1875                                               struct timeval t, void *private_data)
1876 {
1877         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1878
1879
1880         /* loop over all connections sending tickle ACKs */
1881         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, NULL);
1882
1883
1884         /* If there are no more connections to kill we can remove the
1885            entire killtcp structure
1886          */
1887         if ( (killtcp->connections == NULL) || 
1888              (killtcp->connections->root == NULL) ) {
1889                 talloc_free(killtcp);
1890                 return;
1891         }
1892
1893         /* try tickling them again in a seconds time
1894          */
1895         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
1896                         ctdb_tickle_sentenced_connections, killtcp);
1897 }
1898
1899 /*
1900   destroy the killtcp structure
1901  */
1902 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
1903 {
1904         killtcp->vnn->killtcp = NULL;
1905         return 0;
1906 }
1907
1908
/* Insert callback for the killtcp tree.

   Nothing fancy: any connection structure already stored under the key is
   unconditionally replaced by the new one (parm).

   The old one (data) is deliberately not freed here; per the original
   author's note it is talloc_stolen by the same tree node and is deleted
   when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
1920
1921 /*
1922   add a tcp socket to the list of connections we want to RST
1923  */
1924 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
1925                                        ctdb_sock_addr *s,
1926                                        ctdb_sock_addr *d)
1927 {
1928         ctdb_sock_addr src, dst;
1929         struct ctdb_kill_tcp *killtcp;
1930         struct ctdb_killtcp_con *con;
1931         struct ctdb_vnn *vnn;
1932
1933         ctdb_canonicalize_ip(s, &src);
1934         ctdb_canonicalize_ip(d, &dst);
1935
1936         vnn = find_public_ip_vnn(ctdb, &dst);
1937         if (vnn == NULL) {
1938                 vnn = find_public_ip_vnn(ctdb, &src);
1939         }
1940         if (vnn == NULL) {
1941                 /* if it is not a public ip   it could be our 'single ip' */
1942                 if (ctdb->single_ip_vnn) {
1943                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
1944                                 vnn = ctdb->single_ip_vnn;
1945                         }
1946                 }
1947         }
1948         if (vnn == NULL) {
1949                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
1950                 return -1;
1951         }
1952
1953         killtcp = vnn->killtcp;
1954         
1955         /* If this is the first connection to kill we must allocate
1956            a new structure
1957          */
1958         if (killtcp == NULL) {
1959                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
1960                 CTDB_NO_MEMORY(ctdb, killtcp);
1961
1962                 killtcp->vnn         = vnn;
1963                 killtcp->ctdb        = ctdb;
1964                 killtcp->capture_fd  = -1;
1965                 killtcp->connections = trbt_create(killtcp, 0);
1966
1967                 vnn->killtcp         = killtcp;
1968                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
1969         }
1970
1971
1972
1973         /* create a structure that describes this connection we want to
1974            RST and store it in killtcp->connections
1975         */
1976         con = talloc(killtcp, struct ctdb_killtcp_con);
1977         CTDB_NO_MEMORY(ctdb, con);
1978         con->src_addr = src;
1979         con->dst_addr = dst;
1980         con->count    = 0;
1981         con->killtcp  = killtcp;
1982
1983
1984         trbt_insertarray32_callback(killtcp->connections,
1985                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
1986                         add_killtcp_callback, con);
1987
1988         /* 
1989            If we dont have a socket to listen on yet we must create it
1990          */
1991         if (killtcp->capture_fd == -1) {
1992                 const char *iface = ctdb_vnn_iface_string(vnn);
1993                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
1994                 if (killtcp->capture_fd == -1) {
1995                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
1996                                           "socket on iface '%s' for killtcp (%s)\n",
1997                                           iface, strerror(errno)));
1998                         goto failed;
1999                 }
2000         }
2001
2002
2003         if (killtcp->fde == NULL) {
2004                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2005                                             EVENT_FD_READ | EVENT_FD_AUTOCLOSE, 
2006                                             capture_tcp_handler, killtcp);
2007
2008                 /* We also need to set up some events to tickle all these connections
2009                    until they are all reset
2010                 */
2011                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2012                                 ctdb_tickle_sentenced_connections, killtcp);
2013         }
2014
2015         /* tickle him once now */
2016         ctdb_sys_send_tcp(
2017                 &con->dst_addr,
2018                 &con->src_addr,
2019                 0, 0, 0);
2020
2021         return 0;
2022
2023 failed:
2024         talloc_free(vnn->killtcp);
2025         vnn->killtcp = NULL;
2026         return -1;
2027 }
2028
2029 /*
2030   kill a TCP connection.
2031  */
2032 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2033 {
2034         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2035
2036         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2037 }
2038
2039 /*
2040   called by a daemon to inform us of the entire list of TCP tickles for
2041   a particular public address.
2042   this control should only be sent by the node that is currently serving
2043   that public address.
2044  */
2045 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2046 {
2047         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2048         struct ctdb_tcp_array *tcparray;
2049         struct ctdb_vnn *vnn;
2050
2051         /* We must at least have tickles.num or else we cant verify the size
2052            of the received data blob
2053          */
2054         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2055                                         tickles.connections)) {
2056                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2057                 return -1;
2058         }
2059
2060         /* verify that the size of data matches what we expect */
2061         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2062                                 tickles.connections)
2063                          + sizeof(struct ctdb_tcp_connection)
2064                                  * list->tickles.num) {
2065                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2066                 return -1;
2067         }       
2068
2069         vnn = find_public_ip_vnn(ctdb, &list->addr);
2070         if (vnn == NULL) {
2071                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
2072                         ctdb_addr_to_str(&list->addr)));
2073
2074                 return 1;
2075         }
2076
2077         /* remove any old ticklelist we might have */
2078         talloc_free(vnn->tcp_array);
2079         vnn->tcp_array = NULL;
2080
2081         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2082         CTDB_NO_MEMORY(ctdb, tcparray);
2083
2084         tcparray->num = list->tickles.num;
2085
2086         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2087         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2088
2089         memcpy(tcparray->connections, &list->tickles.connections[0], 
2090                sizeof(struct ctdb_tcp_connection)*tcparray->num);
2091
2092         /* We now have a new fresh tickle list array for this vnn */
2093         vnn->tcp_array = talloc_steal(vnn, tcparray);
2094         
2095         return 0;
2096 }
2097
2098 /*
2099   called to return the full list of tickles for the puclic address associated 
2100   with the provided vnn
2101  */
2102 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2103 {
2104         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2105         struct ctdb_control_tcp_tickle_list *list;
2106         struct ctdb_tcp_array *tcparray;
2107         int num;
2108         struct ctdb_vnn *vnn;
2109
2110         vnn = find_public_ip_vnn(ctdb, addr);
2111         if (vnn == NULL) {
2112                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
2113                         ctdb_addr_to_str(addr)));
2114
2115                 return 1;
2116         }
2117
2118         tcparray = vnn->tcp_array;
2119         if (tcparray) {
2120                 num = tcparray->num;
2121         } else {
2122                 num = 0;
2123         }
2124
2125         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2126                                 tickles.connections)
2127                         + sizeof(struct ctdb_tcp_connection) * num;
2128
2129         outdata->dptr  = talloc_size(outdata, outdata->dsize);
2130         CTDB_NO_MEMORY(ctdb, outdata->dptr);
2131         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2132
2133         list->addr = *addr;
2134         list->tickles.num = num;
2135         if (num) {
2136                 memcpy(&list->tickles.connections[0], tcparray->connections, 
2137                         sizeof(struct ctdb_tcp_connection) * num);
2138         }
2139
2140         return 0;
2141 }
2142
2143
2144 /*
2145   set the list of all tcp tickles for a public address
2146  */
2147 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
2148                               struct timeval timeout, uint32_t destnode, 
2149                               ctdb_sock_addr *addr,
2150                               struct ctdb_tcp_array *tcparray)
2151 {
2152         int ret, num;
2153         TDB_DATA data;
2154         struct ctdb_control_tcp_tickle_list *list;
2155
2156         if (tcparray) {
2157                 num = tcparray->num;
2158         } else {
2159                 num = 0;
2160         }
2161
2162         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2163                                 tickles.connections) +
2164                         sizeof(struct ctdb_tcp_connection) * num;
2165         data.dptr = talloc_size(ctdb, data.dsize);
2166         CTDB_NO_MEMORY(ctdb, data.dptr);
2167
2168         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2169         list->addr = *addr;
2170         list->tickles.num = num;
2171         if (tcparray) {
2172                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2173         }
2174
2175         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2176                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2177                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2178         if (ret != 0) {
2179                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2180                 return -1;
2181         }
2182
2183         talloc_free(data.dptr);
2184
2185         return ret;
2186 }
2187
2188
2189 /*
2190   perform tickle updates if required
2191  */
2192 static void ctdb_update_tcp_tickles(struct event_context *ev, 
2193                                 struct timed_event *te, 
2194                                 struct timeval t, void *private_data)
2195 {
2196         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2197         int ret;
2198         struct ctdb_vnn *vnn;
2199
2200         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2201                 /* we only send out updates for public addresses that 
2202                    we have taken over
2203                  */
2204                 if (ctdb->pnn != vnn->pnn) {
2205                         continue;
2206                 }
2207                 /* We only send out the updates if we need to */
2208                 if (!vnn->tcp_update_needed) {
2209                         continue;
2210                 }
2211                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
2212                                 TAKEOVER_TIMEOUT(),
2213                                 CTDB_BROADCAST_CONNECTED,
2214                                 &vnn->public_address,
2215                                 vnn->tcp_array);
2216                 if (ret != 0) {
2217                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2218                                 ctdb_addr_to_str(&vnn->public_address)));
2219                 }
2220         }
2221
2222         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2223                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2224                              ctdb_update_tcp_tickles, ctdb);
2225 }               
2226         
2227
2228 /*
2229   start periodic update of tcp tickles
2230  */
2231 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2232 {
2233         ctdb->tickle_update_context = talloc_new(ctdb);
2234
2235         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2236                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2237                              ctdb_update_tcp_tickles, ctdb);
2238 }
2239
2240
2241
2242
2243 struct control_gratious_arp {
2244         struct ctdb_context *ctdb;
2245         ctdb_sock_addr addr;
2246         const char *iface;
2247         int count;
2248 };
2249
2250 /*
2251   send a control_gratuitous arp
2252  */
2253 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2254                                   struct timeval t, void *private_data)
2255 {
2256         int ret;
2257         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2258                                                         struct control_gratious_arp);
2259
2260         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2261         if (ret != 0) {
2262                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2263                                  arp->iface, strerror(errno)));
2264         }
2265
2266
2267         arp->count++;
2268         if (arp->count == CTDB_ARP_REPEAT) {
2269                 talloc_free(arp);
2270                 return;
2271         }
2272
2273         event_add_timed(arp->ctdb->ev, arp, 
2274                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2275                         send_gratious_arp, arp);
2276 }
2277
2278
2279 /*
2280   send a gratious arp 
2281  */
2282 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2283 {
2284         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2285         struct control_gratious_arp *arp;
2286
2287         /* verify the size of indata */
2288         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2289                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2290                                  (unsigned)indata.dsize, 
2291                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2292                 return -1;
2293         }
2294         if (indata.dsize != 
2295                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2296                 + gratious_arp->len ) ){
2297
2298                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2299                         "but should be %u bytes\n", 
2300                          (unsigned)indata.dsize, 
2301                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2302                 return -1;
2303         }
2304
2305
2306         arp = talloc(ctdb, struct control_gratious_arp);
2307         CTDB_NO_MEMORY(ctdb, arp);
2308
2309         arp->ctdb  = ctdb;
2310         arp->addr   = gratious_arp->addr;
2311         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2312         CTDB_NO_MEMORY(ctdb, arp->iface);
2313         arp->count = 0;
2314         
2315         event_add_timed(arp->ctdb->ev, arp, 
2316                         timeval_zero(), send_gratious_arp, arp);
2317
2318         return 0;
2319 }
2320
2321 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2322 {
2323         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2324         int ret;
2325
2326         /* verify the size of indata */
2327         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2328                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2329                 return -1;
2330         }
2331         if (indata.dsize != 
2332                 ( offsetof(struct ctdb_control_ip_iface, iface)
2333                 + pub->len ) ){
2334
2335                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2336                         "but should be %u bytes\n", 
2337                          (unsigned)indata.dsize, 
2338                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2339                 return -1;
2340         }
2341
2342         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2343
2344         if (ret != 0) {
2345                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2346                 return -1;
2347         }
2348
2349         return 0;
2350 }
2351
/*
  Completion callback for the "releaseip" event started by
  ctdb_control_del_public_address().  private_data is the temporary talloc
  context created for that event; it is released here.  The event status
  is not looked at.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *event_ctx = private_data;

	talloc_free(event_ctx);
}
2360
2361 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2362 {
2363         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2364         struct ctdb_vnn *vnn;
2365         int ret;
2366
2367         /* verify the size of indata */
2368         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2369                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2370                 return -1;
2371         }
2372         if (indata.dsize != 
2373                 ( offsetof(struct ctdb_control_ip_iface, iface)
2374                 + pub->len ) ){
2375
2376                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2377                         "but should be %u bytes\n", 
2378                          (unsigned)indata.dsize, 
2379                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2380                 return -1;
2381         }
2382
2383         /* walk over all public addresses until we find a match */
2384         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2385                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2386                         TALLOC_CTX *mem_ctx;
2387
2388                         DLIST_REMOVE(ctdb->vnn, vnn);
2389                         if (vnn->iface == NULL) {
2390                                 talloc_free(vnn);
2391                                 return 0;
2392                         }
2393
2394                         mem_ctx = talloc_new(ctdb);
2395                         ret = ctdb_event_script_callback(ctdb, 
2396                                          mem_ctx, delete_ip_callback, mem_ctx,
2397                                          false,
2398                                          CTDB_EVENT_RELEASE_IP,
2399                                          "%s %s %u",
2400                                          ctdb_vnn_iface_string(vnn),
2401                                          ctdb_addr_to_str(&vnn->public_address),
2402                                          vnn->public_netmask_bits);
2403                         ctdb_vnn_unassign_iface(ctdb, vnn);
2404                         talloc_free(vnn);
2405                         if (ret != 0) {
2406                                 return -1;
2407                         }
2408                         return 0;
2409                 }
2410         }
2411
2412         return -1;
2413 }
2414