server: implement ctdb_control_get_ifaces()
[tridge/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
  Build a timeval from the takeover_timeout tunable using
  timeval_current_ofs().  The expansion reads a variable named 'ctdb'
  from the scope where the macro is used.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* first argument handed to timeval_current_ofs() when the next round
   of gratuitous arps is scheduled (presumably seconds - TODO confirm) */
#define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp() stops rescheduling itself and frees its
   state once it has run this many times for one takeover */
#define CTDB_ARP_REPEAT   3
35
/*
  one local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
	/* links of the ctdb->ifaces list */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name; entries are looked up by comparing this string */
	const char *name;
	/* set to true when the entry is created; interfaces with
	   link_up == false are skipped when an interface is chosen */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         i->link_up = true;
69
70         DLIST_ADD(ctdb->ifaces, i);
71
72         return 0;
73 }
74
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
76                                           const char *iface)
77 {
78         struct ctdb_iface *i;
79
80         /* Verify that we dont have an entry for this ip yet */
81         for (i=ctdb->ifaces;i;i=i->next) {
82                 if (strcmp(i->name, iface) == 0) {
83                         return i;
84                 }
85         }
86
87         return NULL;
88 }
89
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
91                                               struct ctdb_vnn *vnn)
92 {
93         int i;
94         struct ctdb_iface *cur = NULL;
95         struct ctdb_iface *best = NULL;
96
97         for (i=0; vnn->ifaces[i]; i++) {
98
99                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
100                 if (cur == NULL) {
101                         continue;
102                 }
103
104                 if (!cur->link_up) {
105                         continue;
106                 }
107
108                 if (best == NULL) {
109                         best = cur;
110                         continue;
111                 }
112
113                 if (cur->references < best->references) {
114                         best = cur;
115                         continue;
116                 }
117         }
118
119         return best;
120 }
121
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123                                      struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *best = NULL;
126
127         if (vnn->iface) {
128                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129                                    "still assigned to iface '%s'\n",
130                                    ctdb_addr_to_str(&vnn->public_address),
131                                    ctdb_vnn_iface_string(vnn)));
132                 return 0;
133         }
134
135         best = ctdb_vnn_best_iface(ctdb, vnn);
136         if (best == NULL) {
137                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138                                   "cannot assign to iface any iface\n",
139                                   ctdb_addr_to_str(&vnn->public_address)));
140                 return -1;
141         }
142
143         vnn->iface = best;
144         best->references++;
145         vnn->pnn = ctdb->pnn;
146
147         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148                            "now assigned to iface '%s' refs[%d]\n",
149                            ctdb_addr_to_str(&vnn->public_address),
150                            ctdb_vnn_iface_string(vnn),
151                            best->references));
152         return 0;
153 }
154
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156                                     struct ctdb_vnn *vnn)
157 {
158         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159                            "now unassigned (old iface '%s' refs[%d])\n",
160                            ctdb_addr_to_str(&vnn->public_address),
161                            ctdb_vnn_iface_string(vnn),
162                            vnn->iface?vnn->iface->references:0));
163         if (vnn->iface) {
164                 vnn->iface->references--;
165         }
166         vnn->iface = NULL;
167         if (vnn->pnn == ctdb->pnn) {
168                 vnn->pnn = -1;
169         }
170 }
171
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173                                struct ctdb_vnn *vnn)
174 {
175         int i;
176
177         if (vnn->iface && vnn->iface->link_up) {
178                 return true;
179         }
180
181         for (i=0; vnn->ifaces[i]; i++) {
182                 struct ctdb_iface *cur;
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (cur->link_up) {
190                         return true;
191                 }
192         }
193
194         return false;
195 }
196
197 struct ctdb_takeover_arp {
198         struct ctdb_context *ctdb;
199         uint32_t count;
200         ctdb_sock_addr addr;
201         struct ctdb_tcp_array *tcparray;
202         struct ctdb_vnn *vnn;
203 };
204
205
206 /*
207   lists of tcp endpoints
208  */
209 struct ctdb_tcp_list {
210         struct ctdb_tcp_list *prev, *next;
211         struct ctdb_tcp_connection connection;
212 };
213
214 /*
215   list of clients to kill on IP release
216  */
217 struct ctdb_client_ip {
218         struct ctdb_client_ip *prev, *next;
219         struct ctdb_context *ctdb;
220         ctdb_sock_addr addr;
221         uint32_t client_id;
222 };
223
224
225 /*
226   send a gratuitous arp
227  */
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
229                                   struct timeval t, void *private_data)
230 {
231         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
232                                                         struct ctdb_takeover_arp);
233         int i, ret;
234         struct ctdb_tcp_array *tcparray;
235         const char *iface = ctdb_vnn_iface_string(arp->vnn);
236
237         ret = ctdb_sys_send_arp(&arp->addr, iface);
238         if (ret != 0) {
239                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240                                   iface, strerror(errno)));
241         }
242
243         tcparray = arp->tcparray;
244         if (tcparray) {
245                 for (i=0;i<tcparray->num;i++) {
246                         struct ctdb_tcp_connection *tcon;
247
248                         tcon = &tcparray->connections[i];
249                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
251                                 ctdb_addr_to_str(&tcon->src_addr),
252                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253                         ret = ctdb_sys_send_tcp(
254                                 &tcon->src_addr, 
255                                 &tcon->dst_addr,
256                                 0, 0, 0);
257                         if (ret != 0) {
258                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259                                         ctdb_addr_to_str(&tcon->src_addr)));
260                         }
261                 }
262         }
263
264         arp->count++;
265
266         if (arp->count == CTDB_ARP_REPEAT) {
267                 talloc_free(arp);
268                 return;
269         }
270
271         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
272                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
273                         ctdb_control_send_arp, arp);
274 }
275
276 struct takeover_callback_state {
277         struct ctdb_req_control *c;
278         ctdb_sock_addr *addr;
279         struct ctdb_vnn *vnn;
280 };
281
282 /*
283   called when takeip event finishes
284  */
285 static void takeover_ip_callback(struct ctdb_context *ctdb, int status, 
286                                  void *private_data)
287 {
288         struct takeover_callback_state *state = 
289                 talloc_get_type(private_data, struct takeover_callback_state);
290         struct ctdb_takeover_arp *arp;
291         struct ctdb_tcp_array *tcparray;
292
293         if (status != 0) {
294                 if (status == -ETIME) {
295                         ctdb_ban_self(ctdb);
296                 }
297                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
298                         ctdb_addr_to_str(state->addr),
299                         ctdb_vnn_iface_string(state->vnn)));
300                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
301                 talloc_free(state);
302                 return;
303         }
304
305         if (!state->vnn->takeover_ctx) {
306                 state->vnn->takeover_ctx = talloc_new(state->vnn);
307                 if (!state->vnn->takeover_ctx) {
308                         goto failed;
309                 }
310         }
311
312         arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
313         if (!arp) goto failed;
314         
315         arp->ctdb = ctdb;
316         arp->addr = *state->addr;
317         arp->vnn  = state->vnn;
318
319         tcparray = state->vnn->tcp_array;
320         if (tcparray) {
321                 /* add all of the known tcp connections for this IP to the
322                    list of tcp connections to send tickle acks for */
323                 arp->tcparray = talloc_steal(arp, tcparray);
324
325                 state->vnn->tcp_array = NULL;
326                 state->vnn->tcp_update_needed = true;
327         }
328
329         event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx, 
330                         timeval_zero(), ctdb_control_send_arp, arp);
331
332         /* the control succeeded */
333         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
334         talloc_free(state);
335         return;
336
337 failed:
338         ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
339         talloc_free(state);
340         return;
341 }
342
343 /*
344   Find the vnn of the node that has a public ip address
345   returns -1 if the address is not known as a public address
346  */
347 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
348 {
349         struct ctdb_vnn *vnn;
350
351         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
352                 if (ctdb_same_ip(&vnn->public_address, addr)) {
353                         return vnn;
354                 }
355         }
356
357         return NULL;
358 }
359
360 /*
361   take over an ip address
362  */
363 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
364                                  struct ctdb_req_control *c,
365                                  TDB_DATA indata, 
366                                  bool *async_reply)
367 {
368         int ret;
369         struct takeover_callback_state *state;
370         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
371         struct ctdb_vnn *vnn;
372
373         /* update out vnn list */
374         vnn = find_public_ip_vnn(ctdb, &pip->addr);
375         if (vnn == NULL) {
376                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n", 
377                         ctdb_addr_to_str(&pip->addr)));
378                 return 0;
379         }
380         vnn->pnn = pip->pnn;
381
382         /* if our kernel already has this IP, do nothing */
383         if (ctdb_sys_have_ip(&pip->addr)) {
384                 return 0;
385         }
386
387         ret = ctdb_vnn_assign_iface(ctdb, vnn);
388         if (ret != 0) {
389                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
390                                  "assin a usable interface\n",
391                                  ctdb_addr_to_str(&pip->addr),
392                                  vnn->public_netmask_bits));
393                 return -1;
394         }
395
396         state = talloc(vnn, struct takeover_callback_state);
397         CTDB_NO_MEMORY(ctdb, state);
398
399         state->c = talloc_steal(ctdb, c);
400         state->addr = talloc(ctdb, ctdb_sock_addr);
401         CTDB_NO_MEMORY(ctdb, state->addr);
402
403         *state->addr = pip->addr;
404         state->vnn   = vnn;
405
406         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n", 
407                 ctdb_addr_to_str(&pip->addr),
408                 vnn->public_netmask_bits, 
409                 ctdb_vnn_iface_string(vnn)));
410
411         ret = ctdb_event_script_callback(ctdb, 
412                                          state, takeover_ip_callback, state,
413                                          false,
414                                          CTDB_EVENT_TAKE_IP,
415                                          "%s %s %u",
416                                          ctdb_vnn_iface_string(vnn),
417                                          ctdb_addr_to_str(&pip->addr),
418                                          vnn->public_netmask_bits);
419
420         if (ret != 0) {
421                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
422                         ctdb_addr_to_str(&pip->addr),
423                         ctdb_vnn_iface_string(vnn)));
424                 talloc_free(state);
425                 return -1;
426         }
427
428         /* tell ctdb_control.c that we will be replying asynchronously */
429         *async_reply = true;
430
431         return 0;
432 }
433
434 /*
435   takeover an ip address old v4 style
436  */
437 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
438                                 struct ctdb_req_control *c,
439                                 TDB_DATA indata, 
440                                 bool *async_reply)
441 {
442         TDB_DATA data;
443         
444         data.dsize = sizeof(struct ctdb_public_ip);
445         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
446         CTDB_NO_MEMORY(ctdb, data.dptr);
447         
448         memcpy(data.dptr, indata.dptr, indata.dsize);
449         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
450 }
451
452 /*
453   kill any clients that are registered with a IP that is being released
454  */
455 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
456 {
457         struct ctdb_client_ip *ip;
458
459         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
460                 ctdb_addr_to_str(addr)));
461
462         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
463                 ctdb_sock_addr tmp_addr;
464
465                 tmp_addr = ip->addr;
466                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
467                         ip->client_id,
468                         ctdb_addr_to_str(&ip->addr)));
469
470                 if (ctdb_same_ip(&tmp_addr, addr)) {
471                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
472                                                                      ip->client_id, 
473                                                                      struct ctdb_client);
474                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
475                                 ip->client_id,
476                                 ctdb_addr_to_str(&ip->addr),
477                                 client->pid));
478
479                         if (client->pid != 0) {
480                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
481                                         (unsigned)client->pid,
482                                         ctdb_addr_to_str(addr),
483                                         ip->client_id));
484                                 kill(client->pid, SIGKILL);
485                         }
486                 }
487         }
488 }
489
490 /*
491   called when releaseip event finishes
492  */
493 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
494                                 void *private_data)
495 {
496         struct takeover_callback_state *state = 
497                 talloc_get_type(private_data, struct takeover_callback_state);
498         TDB_DATA data;
499
500         if (status == -ETIME) {
501                 ctdb_ban_self(ctdb);
502         }
503
504         /* send a message to all clients of this node telling them
505            that the cluster has been reconfigured and they should
506            release any sockets on this IP */
507         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
508         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
509         data.dsize = strlen((char *)data.dptr)+1;
510
511         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
512
513         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
514
515         /* kill clients that have registered with this IP */
516         release_kill_clients(ctdb, state->addr);
517
518         ctdb_vnn_unassign_iface(ctdb, state->vnn);
519
520         /* the control succeeded */
521         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
522         talloc_free(state);
523 }
524
525 /*
526   release an ip address
527  */
528 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
529                                 struct ctdb_req_control *c,
530                                 TDB_DATA indata, 
531                                 bool *async_reply)
532 {
533         int ret;
534         struct takeover_callback_state *state;
535         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
536         struct ctdb_vnn *vnn;
537
538         /* update our vnn list */
539         vnn = find_public_ip_vnn(ctdb, &pip->addr);
540         if (vnn == NULL) {
541                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
542                         ctdb_addr_to_str(&pip->addr)));
543                 return 0;
544         }
545         vnn->pnn = pip->pnn;
546
547         /* stop any previous arps */
548         talloc_free(vnn->takeover_ctx);
549         vnn->takeover_ctx = NULL;
550
551         if (!ctdb_sys_have_ip(&pip->addr)) {
552                 DEBUG(DEBUG_NOTICE,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
553                         ctdb_addr_to_str(&pip->addr),
554                         vnn->public_netmask_bits, 
555                         ctdb_vnn_iface_string(vnn)));
556                 ctdb_vnn_unassign_iface(ctdb, vnn);
557                 return 0;
558         }
559
560         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%u\n", 
561                 ctdb_addr_to_str(&pip->addr),
562                 vnn->public_netmask_bits, 
563                 ctdb_vnn_iface_string(vnn),
564                 pip->pnn));
565
566         state = talloc(ctdb, struct takeover_callback_state);
567         CTDB_NO_MEMORY(ctdb, state);
568
569         state->c = talloc_steal(state, c);
570         state->addr = talloc(state, ctdb_sock_addr);       
571         CTDB_NO_MEMORY(ctdb, state->addr);
572         *state->addr = pip->addr;
573         state->vnn   = vnn;
574
575         ret = ctdb_event_script_callback(ctdb, 
576                                          state, release_ip_callback, state,
577                                          false,
578                                          CTDB_EVENT_RELEASE_IP,
579                                          "%s %s %u",
580                                          ctdb_vnn_iface_string(vnn),
581                                          ctdb_addr_to_str(&pip->addr),
582                                          vnn->public_netmask_bits);
583         if (ret != 0) {
584                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
585                         ctdb_addr_to_str(&pip->addr),
586                         ctdb_vnn_iface_string(vnn)));
587                 talloc_free(state);
588                 return -1;
589         }
590
591         /* tell the control that we will be reply asynchronously */
592         *async_reply = true;
593         return 0;
594 }
595
596 /*
597   release an ip address old v4 style
598  */
599 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
600                                 struct ctdb_req_control *c,
601                                 TDB_DATA indata, 
602                                 bool *async_reply)
603 {
604         TDB_DATA data;
605         
606         data.dsize = sizeof(struct ctdb_public_ip);
607         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
608         CTDB_NO_MEMORY(ctdb, data.dptr);
609         
610         memcpy(data.dptr, indata.dptr, indata.dsize);
611         return ctdb_control_release_ip(ctdb, c, data, async_reply);
612 }
613
614
615 static int ctdb_add_public_address(struct ctdb_context *ctdb,
616                                    ctdb_sock_addr *addr,
617                                    unsigned mask, const char *ifaces)
618 {
619         struct ctdb_vnn      *vnn;
620         uint32_t num = 0;
621         char *tmp;
622         const char *iface;
623         int i;
624         int ret;
625
626         /* Verify that we dont have an entry for this ip yet */
627         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
628                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
629                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
630                                 ctdb_addr_to_str(addr)));
631                         return -1;
632                 }               
633         }
634
635         /* create a new vnn structure for this ip address */
636         vnn = talloc_zero(ctdb, struct ctdb_vnn);
637         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
638         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
639         tmp = talloc_strdup(vnn, ifaces);
640         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
641         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
642                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
643                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
644                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
645                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
646                 num++;
647         }
648         talloc_free(tmp);
649         vnn->ifaces[num] = NULL;
650         vnn->public_address      = *addr;
651         vnn->public_netmask_bits = mask;
652         vnn->pnn                 = -1;
653
654         for (i=0; vnn->ifaces[i]; i++) {
655                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
656                 if (ret != 0) {
657                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
658                                            "for public_address[%s]\n",
659                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
660                         talloc_free(vnn);
661                         return -1;
662                 }
663         }
664
665         DLIST_ADD(ctdb->vnn, vnn);
666
667         return 0;
668 }
669
670 /*
671   setup the event script directory
672 */
673 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
674 {
675         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
676         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
677         return 0;
678 }
679
680 /*
681   setup the public address lists from a file
682 */
683 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
684 {
685         char **lines;
686         int nlines;
687         int i;
688
689         lines = file_lines_load(alist, &nlines, ctdb);
690         if (lines == NULL) {
691                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
692                 return -1;
693         }
694         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
695                 nlines--;
696         }
697
698         for (i=0;i<nlines;i++) {
699                 unsigned mask;
700                 ctdb_sock_addr addr;
701                 const char *addrstr;
702                 const char *ifaces;
703                 char *tok, *line;
704
705                 line = lines[i];
706                 while ((*line == ' ') || (*line == '\t')) {
707                         line++;
708                 }
709                 if (*line == '#') {
710                         continue;
711                 }
712                 if (strcmp(line, "") == 0) {
713                         continue;
714                 }
715                 tok = strtok(line, " \t");
716                 addrstr = tok;
717                 tok = strtok(NULL, " \t");
718                 if (tok == NULL) {
719                         if (NULL == ctdb->default_public_interface) {
720                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
721                                          i+1));
722                                 talloc_free(lines);
723                                 return -1;
724                         }
725                         ifaces = ctdb->default_public_interface;
726                 } else {
727                         ifaces = tok;
728                 }
729
730                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
731                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
732                         talloc_free(lines);
733                         return -1;
734                 }
735                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
736                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
737                         talloc_free(lines);
738                         return -1;
739                 }
740         }
741
742         talloc_free(lines);
743         return 0;
744 }
745
746 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
747                               const char *iface,
748                               const char *ip)
749 {
750         struct ctdb_vnn *svnn;
751         bool ok;
752         int ret;
753
754         svnn = talloc_zero(ctdb, struct ctdb_vnn);
755         CTDB_NO_MEMORY(ctdb, svnn);
756
757         svnn->ifaces = talloc_array(svnn, const char *, 2);
758         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
759         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
760         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
761         svnn->ifaces[1] = NULL;
762
763         ok = parse_ip(ip, iface, 0, &svnn->public_address);
764         if (!ok) {
765                 talloc_free(svnn);
766                 return -1;
767         }
768
769         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
770         if (ret != 0) {
771                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
772                                    "for single_ip[%s]\n",
773                                    svnn->ifaces[0],
774                                    ctdb_addr_to_str(&svnn->public_address)));
775                 talloc_free(svnn);
776                 return -1;
777         }
778
779         ret = ctdb_vnn_assign_iface(ctdb, svnn);
780         if (ret != 0) {
781                 talloc_free(svnn);
782                 return -1;
783         }
784
785         ctdb->single_ip_vnn = svnn;
786         return 0;
787 }
788
789 struct ctdb_public_ip_list {
790         struct ctdb_public_ip_list *next;
791         uint32_t pnn;
792         ctdb_sock_addr addr;
793 };
794
795
796 /* Given a physical node, return the number of
797    public addresses that is currently assigned to this node.
798 */
799 static int node_ip_coverage(struct ctdb_context *ctdb, 
800         int32_t pnn,
801         struct ctdb_public_ip_list *ips)
802 {
803         int num=0;
804
805         for (;ips;ips=ips->next) {
806                 if (ips->pnn == pnn) {
807                         num++;
808                 }
809         }
810         return num;
811 }
812
813
814 /* Check if this is a public ip known to the node, i.e. can that
815    node takeover this ip ?
816 */
817 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
818                 struct ctdb_public_ip_list *ip)
819 {
820         struct ctdb_all_public_ips *public_ips;
821         int i;
822
823         public_ips = ctdb->nodes[pnn]->available_public_ips;
824
825         if (public_ips == NULL) {
826                 return -1;
827         }
828
829         for (i=0;i<public_ips->num;i++) {
830                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
831                         /* yes, this node can serve this public ip */
832                         return 0;
833                 }
834         }
835
836         return -1;
837 }
838
839
840 /* search the node lists list for a node to takeover this ip.
841    pick the node that currently are serving the least number of ips
842    so that the ips get spread out evenly.
843 */
844 static int find_takeover_node(struct ctdb_context *ctdb, 
845                 struct ctdb_node_map *nodemap, uint32_t mask, 
846                 struct ctdb_public_ip_list *ip,
847                 struct ctdb_public_ip_list *all_ips)
848 {
849         int pnn, min=0, num;
850         int i;
851
852         pnn    = -1;
853         for (i=0;i<nodemap->num;i++) {
854                 if (nodemap->nodes[i].flags & mask) {
855                         /* This node is not healty and can not be used to serve
856                            a public address 
857                         */
858                         continue;
859                 }
860
861                 /* verify that this node can serve this ip */
862                 if (can_node_serve_ip(ctdb, i, ip)) {
863                         /* no it couldnt   so skip to the next node */
864                         continue;
865                 }
866
867                 num = node_ip_coverage(ctdb, i, all_ips);
868                 /* was this the first node we checked ? */
869                 if (pnn == -1) {
870                         pnn = i;
871                         min  = num;
872                 } else {
873                         if (num < min) {
874                                 pnn = i;
875                                 min  = num;
876                         }
877                 }
878         }       
879         if (pnn == -1) {
880                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
881                         ctdb_addr_to_str(&ip->addr)));
882
883                 return -1;
884         }
885
886         ip->pnn = pnn;
887         return 0;
888 }
889
890 #define IP_KEYLEN       4
891 static uint32_t *ip_key(ctdb_sock_addr *ip)
892 {
893         static uint32_t key[IP_KEYLEN];
894
895         bzero(key, sizeof(key));
896
897         switch (ip->sa.sa_family) {
898         case AF_INET:
899                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
900                 break;
901         case AF_INET6:
902                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
903                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
904                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
905                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
906                 break;
907         default:
908                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
909                 return key;
910         }
911
912         return key;
913 }
914
/* Callback handed to trbt_insertarray32_callback() when merging ip
   lists: returns the new entry (parm) unchanged and ignores 'data'.
   NOTE(review): 'data' is presumably the entry already stored under
   the same key - confirm against the rb_tree implementation.
*/
static void *add_ip_callback(void *parm, void *data)
{
        void *new_entry = parm;

        (void)data;
        return new_entry;
}
919
920 void getips_count_callback(void *param, void *data)
921 {
922         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
923         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
924
925         new_ip->next = *ip_list;
926         *ip_list     = new_ip;
927 }
928
929 struct ctdb_public_ip_list *
930 create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
931 {
932         int i, j;
933         struct ctdb_public_ip_list *ip_list;
934         struct ctdb_all_public_ips *public_ips;
935         trbt_tree_t *ip_tree;
936
937         ip_tree = trbt_create(tmp_ctx, 0);
938
939         for (i=0;i<ctdb->num_nodes;i++) {
940                 public_ips = ctdb->nodes[i]->known_public_ips;
941
942                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
943                         continue;
944                 }
945
946                 /* there were no public ips for this node */
947                 if (public_ips == NULL) {
948                         continue;
949                 }               
950
951                 for (j=0;j<public_ips->num;j++) {
952                         struct ctdb_public_ip_list *tmp_ip; 
953
954                         tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
955                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
956                         tmp_ip->pnn  = public_ips->ips[j].pnn;
957                         tmp_ip->addr = public_ips->ips[j].addr;
958                         tmp_ip->next = NULL;
959
960                         trbt_insertarray32_callback(ip_tree,
961                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
962                                 add_ip_callback,
963                                 tmp_ip);
964                 }
965         }
966
967         ip_list = NULL;
968         trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
969
970         return ip_list;
971 }
972
973 /*
974   make any IP alias changes for public addresses that are necessary 

  The work is done in three phases:
    1. decide an owner (->pnn) for every address in the merged list
       built by create_merged_ip_list().  Addresses whose current owner
       is masked out, or can not serve them, are reset to -1 and then
       handed to find_takeover_node().
    2. for every node that is not DISCONNECTED, send a RELEASE_IP (or
       RELEASE_IPv4 for AF_INET) control for each address whose chosen
       owner is a different node, then wait for all of them.
    3. send a TAKEOVER_IP (or TAKEOVER_IPv4) control for each assigned
       address to its chosen owner, then wait for all of them.

  ctdb    - daemon context (tunables and per node public ip lists)
  nodemap - node flags used to decide which nodes may host addresses

  Returns 0 when both rounds of controls completed and -1 when a
  control could not be sent or one of the async waits failed.  Every
  explicit return below frees tmp_ctx first.
975  */
976 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
977 {
978         int i, num_healthy, retries;
979         struct ctdb_public_ip ip;
980         struct ctdb_public_ipv4 ipv4;
981         uint32_t mask;
982         struct ctdb_public_ip_list *all_ips, *tmp_ip;
983         int maxnode, maxnum=0, minnode, minnum=0, num;
984         TDB_DATA data;
985         struct timeval timeout;
986         struct client_async_data *async_data;
987         struct ctdb_client_control_state *state;
            /* NOTE(review): the result of talloc_new() is not checked for NULL */
988         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
989
990
991         ZERO_STRUCT(ip);
992
993         /* Count how many completely healthy nodes we have */
994         num_healthy = 0;
995         for (i=0;i<nodemap->num;i++) {
996                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
997                         num_healthy++;
998                 }
999         }
1000
             /* 'mask' is the set of node flags that excludes a node from
                hosting public addresses for the rest of this run */
1001         if (num_healthy > 0) {
1002                 /* We have healthy nodes, so only consider them for 
1003                    serving public addresses
1004                 */
1005                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1006         } else {
1007                 /* We didnt have any completely healthy nodes so
1008                    use "disabled" nodes as a fallback
1009                 */
1010                 mask = NODE_FLAGS_INACTIVE;
1011         }
1012
1013         /* since nodes only know about those public addresses that
1014            can be served by that particular node, no single node has
1015            a full list of all public addresses that exist in the cluster.
1016            Walk over all node structures and create a merged list of
1017            all public addresses that exist in the cluster.
1018         */
1019         all_ips = create_merged_ip_list(ctdb, tmp_ctx);
1020
1021         /* If we want deterministic ip allocations, i.e. that the ip addresses
1022            will always be allocated the same way for a specific set of
1023            available/unavailable nodes.
             Every address is assigned round-robin by its position in the
             merged list; the two loops below then reset unusable owners.
1024         */
1025         if (1 == ctdb->tunable.deterministic_public_ips) {              
1026                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1027                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1028                         tmp_ip->pnn = i%nodemap->num;
1029                 }
1030         }
1031
1032
1033         /* mark all public addresses with a masked node as being served by
1034            node -1
1035         */
1036         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1037                 if (tmp_ip->pnn == -1) {
1038                         continue;
1039                 }
                     /* NOTE(review): pnn is used as an index into
                        nodemap->nodes[] without a range check */
1040                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1041                         tmp_ip->pnn = -1;
1042                 }
1043         }
1044
1045         /* verify that the assigned nodes can serve that public ip
1046            and set it to -1 if not
1047         */
1048         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1049                 if (tmp_ip->pnn == -1) {
1050                         continue;
1051                 }
1052                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1053                         /* this node can not serve this ip. */
1054                         tmp_ip->pnn = -1;
1055                 }
1056         }
1057
1058
1059         /* now we must redistribute all public addresses with takeover node
1060            -1 among the nodes available
1061         */
1062         retries = 0;
             /* re-entered via goto from the balancing loop further down,
                after that loop has un-assigned one address; 'retries'
                counts those re-entries */
1063 try_again:
1064         /* loop over all ip's and find a physical node to cover for 
1065            each unassigned ip.
1066         */
1067         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1068                 if (tmp_ip->pnn == -1) {
1069                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1070                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1071                                         ctdb_addr_to_str(&tmp_ip->addr)));
1072                         }
1073                 }
1074         }
1075
1076         /* If we dont want ips to fail back after a node becomes healthy
1077            again, we wont even try to reallocate the ip addresses so that
1078            they are evenly spread out.
1079            This can NOT be used at the same time as DeterministicIPs !
1080         */
1081         if (1 == ctdb->tunable.no_ip_failback) {
1082                 if (1 == ctdb->tunable.deterministic_public_ips) {
1083                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1084                 }
1085                 goto finished;
1086         }
1087
1088
1089         /* now, try to make sure the ip adresses are evenly distributed
1090            across the node.
1091            for each ip address, loop over all nodes that can serve this
1092            ip and make sure that the difference between the node
1093            serving the most and the node serving the least ip's are not greater
1094            than 1.
1095         */
1096         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1097                 if (tmp_ip->pnn == -1) {
1098                         continue;
1099                 }
1100
1101                 /* Get the highest and lowest number of ips's served by any 
1102                    valid node which can serve this ip.
1103                 */
1104                 maxnode = -1;
1105                 minnode = -1;
1106                 for (i=0;i<nodemap->num;i++) {
1107                         if (nodemap->nodes[i].flags & mask) {
1108                                 continue;
1109                         }
1110
1111                         /* only check nodes that can actually serve this ip */
1112                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1113                                 /* no it couldnt   so skip to the next node */
1114                                 continue;
1115                         }
1116
1117                         num = node_ip_coverage(ctdb, i, all_ips);
1118                         if (maxnode == -1) {
1119                                 maxnode = i;
1120                                 maxnum  = num;
1121                         } else {
1122                                 if (num > maxnum) {
1123                                         maxnode = i;
1124                                         maxnum  = num;
1125                                 }
1126                         }
1127                         if (minnode == -1) {
1128                                 minnode = i;
1129                                 minnum  = num;
1130                         } else {
1131                                 if (num < minnum) {
1132                                         minnode = i;
1133                                         minnum  = num;
1134                                 }
1135                         }
1136                 }
1137                 if (maxnode == -1) {
1138                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1139                                 ctdb_addr_to_str(&tmp_ip->addr)));
1140
1141                         continue;
1142                 }
1143
1144                 /* If we want deterministic IPs then dont try to reallocate 
1145                    them to spread out the load.
1146                 */
1147                 if (1 == ctdb->tunable.deterministic_public_ips) {
1148                         continue;
1149                 }
1150
1151                 /* if the spread between the smallest and largest coverage by
1152                    a node is >=2 we steal one of the ips from the node with
1153                    most coverage to even things out a bit.
1154                    try to do this at most 5 times  since we dont want to spend
1155                    too much time balancing the ip coverage.
1156                 */
1157                 if ( (maxnum > minnum+1)
1158                   && (retries < 5) ){
1159                         struct ctdb_public_ip_list *tmp;
1160
1161                         /* mark one of maxnode's vnn's as unassigned and try
1162                            again
                            (the first address in the list owned by maxnode
                            is the one that gets un-assigned)
1163                         */
1164                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1165                                 if (tmp->pnn == maxnode) {
1166                                         tmp->pnn = -1;
1167                                         retries++;
1168                                         goto try_again;
1169                                 }
1170                         }
1171                 }
1172         }
1173
1174
1175         /* finished distributing the public addresses, now just send the 
1176            info out to the nodes
1177         */
1178 finished:
1179
1180         /* at this point ->pnn is the node which will own each IP
1181            or -1 if there is no node that can cover this ip
1182         */
1183
1184         /* now tell all nodes to delete any alias that they should not
1185            have.  This will be a NOOP on nodes that don't currently
1186            hold the given alias */
1187         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1188         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1189
1190         for (i=0;i<nodemap->num;i++) {
1191                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1192                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1193                         continue;
1194                 }
1195
1196                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1197                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1198                                 /* This node should be serving this
1199                                    vnn so dont tell it to release the ip
1200                                 */
1201                                 continue;
1202                         }
                             /* AF_INET addresses go out in the ipv4-only
                                structure and control, everything else in
                                the generic address structure */
1203                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1204                                 ipv4.pnn = tmp_ip->pnn;
1205                                 ipv4.sin = tmp_ip->addr.ip;
1206
1207                                 timeout = TAKEOVER_TIMEOUT();
1208                                 data.dsize = sizeof(ipv4);
1209                                 data.dptr  = (uint8_t *)&ipv4;
1210                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1211                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1212                                                 data, async_data,
1213                                                 &timeout, NULL);
1214                         } else {
1215                                 ip.pnn  = tmp_ip->pnn;
1216                                 ip.addr = tmp_ip->addr;
1217
1218                                 timeout = TAKEOVER_TIMEOUT();
1219                                 data.dsize = sizeof(ip);
1220                                 data.dptr  = (uint8_t *)&ip;
1221                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1222                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1223                                                 data, async_data,
1224                                                 &timeout, NULL);
1225                         }
1226
1227                         if (state == NULL) {
1228                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1229                                 talloc_free(tmp_ctx);
1230                                 return -1;
1231                         }
1232                 
1233                         ctdb_client_async_add(async_data, state);
1234                 }
1235         }
             /* all releases are waited for before any takeover is sent */
1236         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1237                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1238                 talloc_free(tmp_ctx);
1239                 return -1;
1240         }
             /* the release round is done; the takeover round below uses a
                freshly allocated async_data */
1241         talloc_free(async_data);
1242
1243
1244         /* tell all nodes to get their own IPs */
1245         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1246         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1247         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1248                 if (tmp_ip->pnn == -1) {
1249                         /* this IP won't be taken over */
1250                         continue;
1251                 }
1252
1253                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1254                         ipv4.pnn = tmp_ip->pnn;
1255                         ipv4.sin = tmp_ip->addr.ip;
1256
1257                         timeout = TAKEOVER_TIMEOUT();
1258                         data.dsize = sizeof(ipv4);
1259                         data.dptr  = (uint8_t *)&ipv4;
1260                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1261                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1262                                         data, async_data,
1263                                         &timeout, NULL);
1264                 } else {
1265                         ip.pnn  = tmp_ip->pnn;
1266                         ip.addr = tmp_ip->addr;
1267
1268                         timeout = TAKEOVER_TIMEOUT();
1269                         data.dsize = sizeof(ip);
1270                         data.dptr  = (uint8_t *)&ip;
1271                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1272                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1273                                         data, async_data,
1274                                         &timeout, NULL);
1275                 }
1276                 if (state == NULL) {
1277                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1278                         talloc_free(tmp_ctx);
1279                         return -1;
1280                 }
1281                 
1282                 ctdb_client_async_add(async_data, state);
1283         }
1284         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1285                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1286                 talloc_free(tmp_ctx);
1287                 return -1;
1288         }
1289
1290         talloc_free(tmp_ctx);
1291         return 0;
1292 }
1293
1294
1295 /*
1296   destroy a ctdb_client_ip structure
1297  */
1298 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1299 {
1300         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1301                 ctdb_addr_to_str(&ip->addr),
1302                 ntohs(ip->addr.ip.sin_port),
1303                 ip->client_id));
1304
1305         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1306         return 0;
1307 }
1308
1309 /*
1310   called by a client to inform us of a TCP connection that it is managing
1311   that should tickled with an ACK when IP takeover is done
1312   we handle both the old ipv4 style of packets as well as the new ipv4/6
1313   pdus.
1314  */
1315 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1316                                 TDB_DATA indata)
1317 {
1318         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1319         struct ctdb_control_tcp *old_addr = NULL;
1320         struct ctdb_control_tcp_addr new_addr;
1321         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1322         struct ctdb_tcp_list *tcp;
1323         struct ctdb_control_tcp_vnn t;
1324         int ret;
1325         TDB_DATA data;
1326         struct ctdb_client_ip *ip;
1327         struct ctdb_vnn *vnn;
1328         ctdb_sock_addr addr;
1329
1330         switch (indata.dsize) {
1331         case sizeof(struct ctdb_control_tcp):
1332                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1333                 ZERO_STRUCT(new_addr);
1334                 tcp_sock = &new_addr;
1335                 tcp_sock->src.ip  = old_addr->src;
1336                 tcp_sock->dest.ip = old_addr->dest;
1337                 break;
1338         case sizeof(struct ctdb_control_tcp_addr):
1339                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1340                 break;
1341         default:
1342                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1343                                  "to ctdb_control_tcp_client. size was %d but "
1344                                  "only allowed sizes are %lu and %lu\n",
1345                                  (int)indata.dsize,
1346                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1347                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1348                 return -1;
1349         }
1350
1351         addr = tcp_sock->src;
1352         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1353         addr = tcp_sock->dest;
1354         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1355
1356         ZERO_STRUCT(addr);
1357         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1358         vnn = find_public_ip_vnn(ctdb, &addr);
1359         if (vnn == NULL) {
1360                 switch (addr.sa.sa_family) {
1361                 case AF_INET:
1362                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1363                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1364                                         ctdb_addr_to_str(&addr)));
1365                         }
1366                         break;
1367                 case AF_INET6:
1368                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1369                                 ctdb_addr_to_str(&addr)));
1370                         break;
1371                 default:
1372                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1373                 }
1374
1375                 return 0;
1376         }
1377
1378         if (vnn->pnn != ctdb->pnn) {
1379                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1380                         ctdb_addr_to_str(&addr),
1381                         client_id, client->pid));
1382                 /* failing this call will tell smbd to die */
1383                 return -1;
1384         }
1385
1386         ip = talloc(client, struct ctdb_client_ip);
1387         CTDB_NO_MEMORY(ctdb, ip);
1388
1389         ip->ctdb      = ctdb;
1390         ip->addr      = addr;
1391         ip->client_id = client_id;
1392         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1393         DLIST_ADD(ctdb->client_ip_list, ip);
1394
1395         tcp = talloc(client, struct ctdb_tcp_list);
1396         CTDB_NO_MEMORY(ctdb, tcp);
1397
1398         tcp->connection.src_addr = tcp_sock->src;
1399         tcp->connection.dst_addr = tcp_sock->dest;
1400
1401         DLIST_ADD(client->tcp_list, tcp);
1402
1403         t.src  = tcp_sock->src;
1404         t.dest = tcp_sock->dest;
1405
1406         data.dptr = (uint8_t *)&t;
1407         data.dsize = sizeof(t);
1408
1409         switch (addr.sa.sa_family) {
1410         case AF_INET:
1411                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1412                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1413                         ctdb_addr_to_str(&tcp_sock->src),
1414                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1415                 break;
1416         case AF_INET6:
1417                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1418                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1419                         ctdb_addr_to_str(&tcp_sock->src),
1420                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1421                 break;
1422         default:
1423                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1424         }
1425
1426
1427         /* tell all nodes about this tcp connection */
1428         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1429                                        CTDB_CONTROL_TCP_ADD,
1430                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1431         if (ret != 0) {
1432                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1433                 return -1;
1434         }
1435
1436         return 0;
1437 }
1438
1439 /*
1440   find a tcp address on a list
1441  */
1442 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1443                                            struct ctdb_tcp_connection *tcp)
1444 {
1445         int i;
1446
1447         if (array == NULL) {
1448                 return NULL;
1449         }
1450
1451         for (i=0;i<array->num;i++) {
1452                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1453                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1454                         return &array->connections[i];
1455                 }
1456         }
1457         return NULL;
1458 }
1459
1460 /*
1461   called by a daemon to inform us of a TCP connection that one of its
1462   clients managing that should tickled with an ACK when IP takeover is
1463   done
1464  */
1465 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
1466 {
1467         struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
1468         struct ctdb_tcp_array *tcparray;
1469         struct ctdb_tcp_connection tcp;
1470         struct ctdb_vnn *vnn;
1471
1472         vnn = find_public_ip_vnn(ctdb, &p->dest);
1473         if (vnn == NULL) {
1474                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1475                         ctdb_addr_to_str(&p->dest)));
1476
1477                 return -1;
1478         }
1479
1480
1481         tcparray = vnn->tcp_array;
1482
1483         /* If this is the first tickle */
1484         if (tcparray == NULL) {
1485                 tcparray = talloc_size(ctdb->nodes, 
1486                         offsetof(struct ctdb_tcp_array, connections) +
1487                         sizeof(struct ctdb_tcp_connection) * 1);
1488                 CTDB_NO_MEMORY(ctdb, tcparray);
1489                 vnn->tcp_array = tcparray;
1490
1491                 tcparray->num = 0;
1492                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1493                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1494
1495                 tcparray->connections[tcparray->num].src_addr = p->src;
1496                 tcparray->connections[tcparray->num].dst_addr = p->dest;
1497                 tcparray->num++;
1498                 return 0;
1499         }
1500
1501
1502         /* Do we already have this tickle ?*/
1503         tcp.src_addr = p->src;
1504         tcp.dst_addr = p->dest;
1505         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1506                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1507                         ctdb_addr_to_str(&tcp.dst_addr),
1508                         ntohs(tcp.dst_addr.ip.sin_port),
1509                         vnn->pnn));
1510                 return 0;
1511         }
1512
1513         /* A new tickle, we must add it to the array */
1514         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1515                                         struct ctdb_tcp_connection,
1516                                         tcparray->num+1);
1517         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1518
1519         vnn->tcp_array = tcparray;
1520         tcparray->connections[tcparray->num].src_addr = p->src;
1521         tcparray->connections[tcparray->num].dst_addr = p->dest;
1522         tcparray->num++;
1523                                 
1524         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1525                 ctdb_addr_to_str(&tcp.dst_addr),
1526                 ntohs(tcp.dst_addr.ip.sin_port),
1527                 vnn->pnn));
1528
1529         return 0;
1530 }
1531
1532
1533 /*
1534   called by a daemon to inform us of a TCP connection that one of its
1535   clients managing that should tickled with an ACK when IP takeover is
1536   done
1537  */
1538 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1539 {
1540         struct ctdb_tcp_connection *tcpp;
1541         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1542
1543         if (vnn == NULL) {
1544                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1545                         ctdb_addr_to_str(&conn->dst_addr)));
1546                 return;
1547         }
1548
1549         /* if the array is empty we cant remove it
1550            and we dont need to do anything
1551          */
1552         if (vnn->tcp_array == NULL) {
1553                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1554                         ctdb_addr_to_str(&conn->dst_addr),
1555                         ntohs(conn->dst_addr.ip.sin_port)));
1556                 return;
1557         }
1558
1559
1560         /* See if we know this connection
1561            if we dont know this connection  then we dont need to do anything
1562          */
1563         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1564         if (tcpp == NULL) {
1565                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1566                         ctdb_addr_to_str(&conn->dst_addr),
1567                         ntohs(conn->dst_addr.ip.sin_port)));
1568                 return;
1569         }
1570
1571
1572         /* We need to remove this entry from the array.
1573            Instead of allocating a new array and copying data to it
1574            we cheat and just copy the last entry in the existing array
1575            to the entry that is to be removed and just shring the 
1576            ->num field
1577          */
1578         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1579         vnn->tcp_array->num--;
1580
1581         /* If we deleted the last entry we also need to remove the entire array
1582          */
1583         if (vnn->tcp_array->num == 0) {
1584                 talloc_free(vnn->tcp_array);
1585                 vnn->tcp_array = NULL;
1586         }               
1587
1588         vnn->tcp_update_needed = true;
1589
1590         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1591                 ctdb_addr_to_str(&conn->src_addr),
1592                 ntohs(conn->src_addr.ip.sin_port)));
1593 }
1594
1595
/*
  called when a daemon restarts - meant to send all tickles for all
  public addresses we are serving to the new node.
  currently a stub: both arguments are ignored and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
1605
1606
1607 /*
1608   called when a client structure goes away - hook to remove
1609   elements from the tcp_list in all daemons
1610  */
1611 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1612 {
1613         while (client->tcp_list) {
1614                 struct ctdb_tcp_list *tcp = client->tcp_list;
1615                 DLIST_REMOVE(client->tcp_list, tcp);
1616                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1617         }
1618 }
1619
1620
1621 /*
1622   release all IPs on shutdown
1623  */
1624 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1625 {
1626         struct ctdb_vnn *vnn;
1627
1628         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1629                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1630                         ctdb_vnn_unassign_iface(ctdb, vnn);
1631                         continue;
1632                 }
1633                 if (!vnn->iface) {
1634                         continue;
1635                 }
1636                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1637                                   ctdb_vnn_iface_string(vnn),
1638                                   ctdb_addr_to_str(&vnn->public_address),
1639                                   vnn->public_netmask_bits);
1640                 release_kill_clients(ctdb, &vnn->public_address);
1641                 ctdb_vnn_unassign_iface(ctdb, vnn);
1642         }
1643 }
1644
1645
1646 /*
1647   get list of public IPs
1648  */
1649 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1650                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1651 {
1652         int i, num, len;
1653         struct ctdb_all_public_ips *ips;
1654         struct ctdb_vnn *vnn;
1655         bool only_available = false;
1656
1657         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
1658                 only_available = true;
1659         }
1660
1661         /* count how many public ip structures we have */
1662         num = 0;
1663         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1664                 num++;
1665         }
1666
1667         len = offsetof(struct ctdb_all_public_ips, ips) + 
1668                 num*sizeof(struct ctdb_public_ip);
1669         ips = talloc_zero_size(outdata, len);
1670         CTDB_NO_MEMORY(ctdb, ips);
1671
1672         i = 0;
1673         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1674                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
1675                         continue;
1676                 }
1677                 ips->ips[i].pnn  = vnn->pnn;
1678                 ips->ips[i].addr = vnn->public_address;
1679                 i++;
1680         }
1681         ips->num = i;
1682         len = offsetof(struct ctdb_all_public_ips, ips) +
1683                 i*sizeof(struct ctdb_public_ip);
1684
1685         outdata->dsize = len;
1686         outdata->dptr  = (uint8_t *)ips;
1687
1688         return 0;
1689 }
1690
1691
1692 /*
1693   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
1694  */
1695 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
1696                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1697 {
1698         int i, num, len;
1699         struct ctdb_all_public_ipsv4 *ips;
1700         struct ctdb_vnn *vnn;
1701
1702         /* count how many public ip structures we have */
1703         num = 0;
1704         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1705                 if (vnn->public_address.sa.sa_family != AF_INET) {
1706                         continue;
1707                 }
1708                 num++;
1709         }
1710
1711         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
1712                 num*sizeof(struct ctdb_public_ipv4);
1713         ips = talloc_zero_size(outdata, len);
1714         CTDB_NO_MEMORY(ctdb, ips);
1715
1716         outdata->dsize = len;
1717         outdata->dptr  = (uint8_t *)ips;
1718
1719         ips->num = num;
1720         i = 0;
1721         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1722                 if (vnn->public_address.sa.sa_family != AF_INET) {
1723                         continue;
1724                 }
1725                 ips->ips[i].pnn = vnn->pnn;
1726                 ips->ips[i].sin = vnn->public_address.ip;
1727                 i++;
1728         }
1729
1730         return 0;
1731 }
1732
1733 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
1734                                         struct ctdb_req_control *c,
1735                                         TDB_DATA indata,
1736                                         TDB_DATA *outdata)
1737 {
1738         int i, num, len;
1739         ctdb_sock_addr *addr;
1740         struct ctdb_control_public_ip_info *info;
1741         struct ctdb_vnn *vnn;
1742
1743         addr = (ctdb_sock_addr *)indata.dptr;
1744
1745         vnn = find_public_ip_vnn(ctdb, addr);
1746         if (vnn == NULL) {
1747                 /* if it is not a public ip   it could be our 'single ip' */
1748                 if (ctdb->single_ip_vnn) {
1749                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
1750                                 vnn = ctdb->single_ip_vnn;
1751                         }
1752                 }
1753         }
1754         if (vnn == NULL) {
1755                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
1756                                  "'%s'not a public address\n",
1757                                  ctdb_addr_to_str(addr)));
1758                 return -1;
1759         }
1760
1761         /* count how many public ip structures we have */
1762         num = 0;
1763         for (;vnn->ifaces[num];) {
1764                 num++;
1765         }
1766
1767         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
1768                 num*sizeof(struct ctdb_control_iface_info);
1769         info = talloc_zero_size(outdata, len);
1770         CTDB_NO_MEMORY(ctdb, info);
1771
1772         info->ip.addr = vnn->public_address;
1773         info->ip.pnn = vnn->pnn;
1774         info->active_idx = 0xFFFFFFFF;
1775
1776         for (i=0; vnn->ifaces[i]; i++) {
1777                 struct ctdb_iface *cur;
1778
1779                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
1780                 if (cur == NULL) {
1781                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
1782                                            vnn->ifaces[i]));
1783                         return -1;
1784                 }
1785                 if (vnn->iface == cur) {
1786                         info->active_idx = i;
1787                 }
1788                 strcpy(info->ifaces[i].name, cur->name);
1789                 info->ifaces[i].link_state = cur->link_up;
1790                 info->ifaces[i].references = cur->references;
1791         }
1792         info->num = i;
1793         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
1794                 i*sizeof(struct ctdb_control_iface_info);
1795
1796         outdata->dsize = len;
1797         outdata->dptr  = (uint8_t *)info;
1798
1799         return 0;
1800 }
1801
1802 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
1803                                 struct ctdb_req_control *c,
1804                                 TDB_DATA *outdata)
1805 {
1806         int i, num, len;
1807         struct ctdb_control_get_ifaces *ifaces;
1808         struct ctdb_iface *cur;
1809
1810         /* count how many public ip structures we have */
1811         num = 0;
1812         for (cur=ctdb->ifaces;cur;cur=cur->next) {
1813                 num++;
1814         }
1815
1816         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
1817                 num*sizeof(struct ctdb_control_iface_info);
1818         ifaces = talloc_zero_size(outdata, len);
1819         CTDB_NO_MEMORY(ctdb, ifaces);
1820
1821         i = 0;
1822         for (cur=ctdb->ifaces;cur;cur=cur->next) {
1823                 strcpy(ifaces->ifaces[i].name, cur->name);
1824                 ifaces->ifaces[i].link_state = cur->link_up;
1825                 ifaces->ifaces[i].references = cur->references;
1826                 i++;
1827         }
1828         ifaces->num = i;
1829         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
1830                 i*sizeof(struct ctdb_control_iface_info);
1831
1832         outdata->dsize = len;
1833         outdata->dptr  = (uint8_t *)ifaces;
1834
1835         return 0;
1836 }
1837
1838 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
1839                                     struct ctdb_req_control *c,
1840                                     TDB_DATA indata)
1841 {
1842         return -1;
1843 }
1844
1845
1846 /* 
1847    structure containing the listening socket and the list of tcp connections
1848    that the ctdb daemon is to kill
1849 */
1850 struct ctdb_kill_tcp {
1851         struct ctdb_vnn *vnn;
1852         struct ctdb_context *ctdb;
1853         int capture_fd;
1854         struct fd_event *fde;
1855         trbt_tree_t *connections;
1856         void *private_data;
1857 };
1858
1859 /*
1860   a tcp connection that is to be killed
1861  */
1862 struct ctdb_killtcp_con {
1863         ctdb_sock_addr src_addr;
1864         ctdb_sock_addr dst_addr;
1865         int count;
1866         struct ctdb_kill_tcp *killtcp;
1867 };
1868
1869 /* this function is used to create a key to represent this socketpair
1870    in the killtcp tree.
1871    this key is used to insert and lookup matching socketpairs that are
1872    to be tickled and RST
1873 */
1874 #define KILLTCP_KEYLEN  10
1875 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
1876 {
1877         static uint32_t key[KILLTCP_KEYLEN];
1878
1879         bzero(key, sizeof(key));
1880
1881         if (src->sa.sa_family != dst->sa.sa_family) {
1882                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
1883                 return key;
1884         }
1885         
1886         switch (src->sa.sa_family) {
1887         case AF_INET:
1888                 key[0]  = dst->ip.sin_addr.s_addr;
1889                 key[1]  = src->ip.sin_addr.s_addr;
1890                 key[2]  = dst->ip.sin_port;
1891                 key[3]  = src->ip.sin_port;
1892                 break;
1893         case AF_INET6:
1894                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
1895                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
1896                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
1897                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
1898                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
1899                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
1900                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
1901                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
1902                 key[8]  = dst->ip6.sin6_port;
1903                 key[9]  = src->ip6.sin6_port;
1904                 break;
1905         default:
1906                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
1907                 return key;
1908         }
1909
1910         return key;
1911 }
1912
1913 /*
1914   called when we get a read event on the raw socket
1915  */
1916 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
1917                                 uint16_t flags, void *private_data)
1918 {
1919         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1920         struct ctdb_killtcp_con *con;
1921         ctdb_sock_addr src, dst;
1922         uint32_t ack_seq, seq;
1923
1924         if (!(flags & EVENT_FD_READ)) {
1925                 return;
1926         }
1927
1928         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1929                                 killtcp->private_data,
1930                                 &src, &dst,
1931                                 &ack_seq, &seq) != 0) {
1932                 /* probably a non-tcp ACK packet */
1933                 return;
1934         }
1935
1936         /* check if we have this guy in our list of connections
1937            to kill
1938         */
1939         con = trbt_lookuparray32(killtcp->connections, 
1940                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1941         if (con == NULL) {
1942                 /* no this was some other packet we can just ignore */
1943                 return;
1944         }
1945
1946         /* This one has been tickled !
1947            now reset him and remove him from the list.
1948          */
1949         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
1950                 ntohs(con->dst_addr.ip.sin_port),
1951                 ctdb_addr_to_str(&con->src_addr),
1952                 ntohs(con->src_addr.ip.sin_port)));
1953
1954         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
1955         talloc_free(con);
1956 }
1957
1958
1959 /* when traversing the list of all tcp connections to send tickle acks to
1960    (so that we can capture the ack coming back and kill the connection
1961     by a RST)
1962    this callback is called for each connection we are currently trying to kill
1963 */
1964 static void tickle_connection_traverse(void *param, void *data)
1965 {
1966         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1967
1968         /* have tried too many times, just give up */
1969         if (con->count >= 5) {
1970                 talloc_free(con);
1971                 return;
1972         }
1973
1974         /* othervise, try tickling it again */
1975         con->count++;
1976         ctdb_sys_send_tcp(
1977                 (ctdb_sock_addr *)&con->dst_addr,
1978                 (ctdb_sock_addr *)&con->src_addr,
1979                 0, 0, 0);
1980 }
1981
1982
1983 /* 
1984    called every second until all sentenced connections have been reset
1985  */
1986 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
1987                                               struct timeval t, void *private_data)
1988 {
1989         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1990
1991
1992         /* loop over all connections sending tickle ACKs */
1993         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, NULL);
1994
1995
1996         /* If there are no more connections to kill we can remove the
1997            entire killtcp structure
1998          */
1999         if ( (killtcp->connections == NULL) || 
2000              (killtcp->connections->root == NULL) ) {
2001                 talloc_free(killtcp);
2002                 return;
2003         }
2004
2005         /* try tickling them again in a seconds time
2006          */
2007         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2008                         ctdb_tickle_sentenced_connections, killtcp);
2009 }
2010
2011 /*
2012   destroy the killtcp structure
2013  */
2014 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2015 {
2016         killtcp->vnn->killtcp = NULL;
2017         return 0;
2018 }
2019
2020
/* insert callback for the killtcp tree: whatever is already stored
   under the key is unconditionally replaced by the new connection.

   parm is the new entry, data is the entry currently stored (if any).
   The old entry is deliberately not freed here; per the original note
   it is talloc_stolen by the same tree node and goes away together
   with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;     /* the previous entry is ignored on purpose */

        return parm;
}
2032
2033 /*
2034   add a tcp socket to the list of connections we want to RST
2035  */
2036 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2037                                        ctdb_sock_addr *s,
2038                                        ctdb_sock_addr *d)
2039 {
2040         ctdb_sock_addr src, dst;
2041         struct ctdb_kill_tcp *killtcp;
2042         struct ctdb_killtcp_con *con;
2043         struct ctdb_vnn *vnn;
2044
2045         ctdb_canonicalize_ip(s, &src);
2046         ctdb_canonicalize_ip(d, &dst);
2047
2048         vnn = find_public_ip_vnn(ctdb, &dst);
2049         if (vnn == NULL) {
2050                 vnn = find_public_ip_vnn(ctdb, &src);
2051         }
2052         if (vnn == NULL) {
2053                 /* if it is not a public ip   it could be our 'single ip' */
2054                 if (ctdb->single_ip_vnn) {
2055                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2056                                 vnn = ctdb->single_ip_vnn;
2057                         }
2058                 }
2059         }
2060         if (vnn == NULL) {
2061                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2062                 return -1;
2063         }
2064
2065         killtcp = vnn->killtcp;
2066         
2067         /* If this is the first connection to kill we must allocate
2068            a new structure
2069          */
2070         if (killtcp == NULL) {
2071                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2072                 CTDB_NO_MEMORY(ctdb, killtcp);
2073
2074                 killtcp->vnn         = vnn;
2075                 killtcp->ctdb        = ctdb;
2076                 killtcp->capture_fd  = -1;
2077                 killtcp->connections = trbt_create(killtcp, 0);
2078
2079                 vnn->killtcp         = killtcp;
2080                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2081         }
2082
2083
2084
2085         /* create a structure that describes this connection we want to
2086            RST and store it in killtcp->connections
2087         */
2088         con = talloc(killtcp, struct ctdb_killtcp_con);
2089         CTDB_NO_MEMORY(ctdb, con);
2090         con->src_addr = src;
2091         con->dst_addr = dst;
2092         con->count    = 0;
2093         con->killtcp  = killtcp;
2094
2095
2096         trbt_insertarray32_callback(killtcp->connections,
2097                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2098                         add_killtcp_callback, con);
2099
2100         /* 
2101            If we dont have a socket to listen on yet we must create it
2102          */
2103         if (killtcp->capture_fd == -1) {
2104                 const char *iface = ctdb_vnn_iface_string(vnn);
2105                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2106                 if (killtcp->capture_fd == -1) {
2107                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2108                                           "socket on iface '%s' for killtcp (%s)\n",
2109                                           iface, strerror(errno)));
2110                         goto failed;
2111                 }
2112         }
2113
2114
2115         if (killtcp->fde == NULL) {
2116                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2117                                             EVENT_FD_READ | EVENT_FD_AUTOCLOSE, 
2118                                             capture_tcp_handler, killtcp);
2119
2120                 /* We also need to set up some events to tickle all these connections
2121                    until they are all reset
2122                 */
2123                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2124                                 ctdb_tickle_sentenced_connections, killtcp);
2125         }
2126
2127         /* tickle him once now */
2128         ctdb_sys_send_tcp(
2129                 &con->dst_addr,
2130                 &con->src_addr,
2131                 0, 0, 0);
2132
2133         return 0;
2134
2135 failed:
2136         talloc_free(vnn->killtcp);
2137         vnn->killtcp = NULL;
2138         return -1;
2139 }
2140
2141 /*
2142   kill a TCP connection.
2143  */
2144 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2145 {
2146         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2147
2148         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2149 }
2150
2151 /*
2152   called by a daemon to inform us of the entire list of TCP tickles for
2153   a particular public address.
2154   this control should only be sent by the node that is currently serving
2155   that public address.
2156  */
2157 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2158 {
2159         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2160         struct ctdb_tcp_array *tcparray;
2161         struct ctdb_vnn *vnn;
2162
2163         /* We must at least have tickles.num or else we cant verify the size
2164            of the received data blob
2165          */
2166         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2167                                         tickles.connections)) {
2168                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2169                 return -1;
2170         }
2171
2172         /* verify that the size of data matches what we expect */
2173         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2174                                 tickles.connections)
2175                          + sizeof(struct ctdb_tcp_connection)
2176                                  * list->tickles.num) {
2177                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2178                 return -1;
2179         }       
2180
2181         vnn = find_public_ip_vnn(ctdb, &list->addr);
2182         if (vnn == NULL) {
2183                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
2184                         ctdb_addr_to_str(&list->addr)));
2185
2186                 return 1;
2187         }
2188
2189         /* remove any old ticklelist we might have */
2190         talloc_free(vnn->tcp_array);
2191         vnn->tcp_array = NULL;
2192
2193         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2194         CTDB_NO_MEMORY(ctdb, tcparray);
2195
2196         tcparray->num = list->tickles.num;
2197
2198         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2199         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2200
2201         memcpy(tcparray->connections, &list->tickles.connections[0], 
2202                sizeof(struct ctdb_tcp_connection)*tcparray->num);
2203
2204         /* We now have a new fresh tickle list array for this vnn */
2205         vnn->tcp_array = talloc_steal(vnn, tcparray);
2206         
2207         return 0;
2208 }
2209
2210 /*
2211   called to return the full list of tickles for the puclic address associated 
2212   with the provided vnn
2213  */
2214 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2215 {
2216         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2217         struct ctdb_control_tcp_tickle_list *list;
2218         struct ctdb_tcp_array *tcparray;
2219         int num;
2220         struct ctdb_vnn *vnn;
2221
2222         vnn = find_public_ip_vnn(ctdb, addr);
2223         if (vnn == NULL) {
2224                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
2225                         ctdb_addr_to_str(addr)));
2226
2227                 return 1;
2228         }
2229
2230         tcparray = vnn->tcp_array;
2231         if (tcparray) {
2232                 num = tcparray->num;
2233         } else {
2234                 num = 0;
2235         }
2236
2237         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2238                                 tickles.connections)
2239                         + sizeof(struct ctdb_tcp_connection) * num;
2240
2241         outdata->dptr  = talloc_size(outdata, outdata->dsize);
2242         CTDB_NO_MEMORY(ctdb, outdata->dptr);
2243         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2244
2245         list->addr = *addr;
2246         list->tickles.num = num;
2247         if (num) {
2248                 memcpy(&list->tickles.connections[0], tcparray->connections, 
2249                         sizeof(struct ctdb_tcp_connection) * num);
2250         }
2251
2252         return 0;
2253 }
2254
2255
2256 /*
2257   set the list of all tcp tickles for a public address
2258  */
2259 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
2260                               struct timeval timeout, uint32_t destnode, 
2261                               ctdb_sock_addr *addr,
2262                               struct ctdb_tcp_array *tcparray)
2263 {
2264         int ret, num;
2265         TDB_DATA data;
2266         struct ctdb_control_tcp_tickle_list *list;
2267
2268         if (tcparray) {
2269                 num = tcparray->num;
2270         } else {
2271                 num = 0;
2272         }
2273
2274         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2275                                 tickles.connections) +
2276                         sizeof(struct ctdb_tcp_connection) * num;
2277         data.dptr = talloc_size(ctdb, data.dsize);
2278         CTDB_NO_MEMORY(ctdb, data.dptr);
2279
2280         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2281         list->addr = *addr;
2282         list->tickles.num = num;
2283         if (tcparray) {
2284                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2285         }
2286
2287         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2288                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2289                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2290         if (ret != 0) {
2291                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2292                 return -1;
2293         }
2294
2295         talloc_free(data.dptr);
2296
2297         return ret;
2298 }
2299
2300
2301 /*
2302   perform tickle updates if required
2303  */
2304 static void ctdb_update_tcp_tickles(struct event_context *ev, 
2305                                 struct timed_event *te, 
2306                                 struct timeval t, void *private_data)
2307 {
2308         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2309         int ret;
2310         struct ctdb_vnn *vnn;
2311
2312         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2313                 /* we only send out updates for public addresses that 
2314                    we have taken over
2315                  */
2316                 if (ctdb->pnn != vnn->pnn) {
2317                         continue;
2318                 }
2319                 /* We only send out the updates if we need to */
2320                 if (!vnn->tcp_update_needed) {
2321                         continue;
2322                 }
2323                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
2324                                 TAKEOVER_TIMEOUT(),
2325                                 CTDB_BROADCAST_CONNECTED,
2326                                 &vnn->public_address,
2327                                 vnn->tcp_array);
2328                 if (ret != 0) {
2329                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2330                                 ctdb_addr_to_str(&vnn->public_address)));
2331                 }
2332         }
2333
2334         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2335                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2336                              ctdb_update_tcp_tickles, ctdb);
2337 }               
2338         
2339
2340 /*
2341   start periodic update of tcp tickles
2342  */
2343 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2344 {
2345         ctdb->tickle_update_context = talloc_new(ctdb);
2346
2347         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2348                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2349                              ctdb_update_tcp_tickles, ctdb);
2350 }
2351
2352
2353
2354
2355 struct control_gratious_arp {
2356         struct ctdb_context *ctdb;
2357         ctdb_sock_addr addr;
2358         const char *iface;
2359         int count;
2360 };
2361
2362 /*
2363   send a control_gratuitous arp
2364  */
2365 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2366                                   struct timeval t, void *private_data)
2367 {
2368         int ret;
2369         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2370                                                         struct control_gratious_arp);
2371
2372         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2373         if (ret != 0) {
2374                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2375                                  arp->iface, strerror(errno)));
2376         }
2377
2378
2379         arp->count++;
2380         if (arp->count == CTDB_ARP_REPEAT) {
2381                 talloc_free(arp);
2382                 return;
2383         }
2384
2385         event_add_timed(arp->ctdb->ev, arp, 
2386                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2387                         send_gratious_arp, arp);
2388 }
2389
2390
2391 /*
2392   send a gratious arp 
2393  */
2394 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2395 {
2396         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2397         struct control_gratious_arp *arp;
2398
2399         /* verify the size of indata */
2400         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2401                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2402                                  (unsigned)indata.dsize, 
2403                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2404                 return -1;
2405         }
2406         if (indata.dsize != 
2407                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2408                 + gratious_arp->len ) ){
2409
2410                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2411                         "but should be %u bytes\n", 
2412                          (unsigned)indata.dsize, 
2413                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2414                 return -1;
2415         }
2416
2417
2418         arp = talloc(ctdb, struct control_gratious_arp);
2419         CTDB_NO_MEMORY(ctdb, arp);
2420
2421         arp->ctdb  = ctdb;
2422         arp->addr   = gratious_arp->addr;
2423         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2424         CTDB_NO_MEMORY(ctdb, arp->iface);
2425         arp->count = 0;
2426         
2427         event_add_timed(arp->ctdb->ev, arp, 
2428                         timeval_zero(), send_gratious_arp, arp);
2429
2430         return 0;
2431 }
2432
2433 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2434 {
2435         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2436         int ret;
2437
2438         /* verify the size of indata */
2439         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2440                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2441                 return -1;
2442         }
2443         if (indata.dsize != 
2444                 ( offsetof(struct ctdb_control_ip_iface, iface)
2445                 + pub->len ) ){
2446
2447                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2448                         "but should be %u bytes\n", 
2449                          (unsigned)indata.dsize, 
2450                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2451                 return -1;
2452         }
2453
2454         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2455
2456         if (ret != 0) {
2457                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2458                 return -1;
2459         }
2460
2461         return 0;
2462 }
2463
2464 /*
2465   called when releaseip event finishes for del_public_address
2466  */
2467 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
2468                                 void *private_data)
2469 {
2470         talloc_free(private_data);
2471 }
2472
2473 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2474 {
2475         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2476         struct ctdb_vnn *vnn;
2477         int ret;
2478
2479         /* verify the size of indata */
2480         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2481                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2482                 return -1;
2483         }
2484         if (indata.dsize != 
2485                 ( offsetof(struct ctdb_control_ip_iface, iface)
2486                 + pub->len ) ){
2487
2488                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2489                         "but should be %u bytes\n", 
2490                          (unsigned)indata.dsize, 
2491                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2492                 return -1;
2493         }
2494
2495         /* walk over all public addresses until we find a match */
2496         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2497                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2498                         TALLOC_CTX *mem_ctx;
2499
2500                         DLIST_REMOVE(ctdb->vnn, vnn);
2501                         if (vnn->iface == NULL) {
2502                                 talloc_free(vnn);
2503                                 return 0;
2504                         }
2505
2506                         mem_ctx = talloc_new(ctdb);
2507                         ret = ctdb_event_script_callback(ctdb, 
2508                                          mem_ctx, delete_ip_callback, mem_ctx,
2509                                          false,
2510                                          CTDB_EVENT_RELEASE_IP,
2511                                          "%s %s %u",
2512                                          ctdb_vnn_iface_string(vnn),
2513                                          ctdb_addr_to_str(&vnn->public_address),
2514                                          vnn->public_netmask_bits);
2515                         ctdb_vnn_unassign_iface(ctdb, vnn);
2516                         talloc_free(vnn);
2517                         if (ret != 0) {
2518                                 return -1;
2519                         }
2520                         return 0;
2521                 }
2522         }
2523
2524         return -1;
2525 }
2526